
qemu-kvm MMIO Source Code Analysis

Source versions

  • qemu v7.1.0-rc0
  • linux 5.15.58

MMIO flow, quoted from my previous blog post

  1. QEMU declares a MemoryRegion for MMIO, but no backing memory is actually allocated. When kvm_set_phys_mem registers memory with KVM, it calls memory_region_is_ram to check whether the physical address range is RAM; if it is not, it returns early and never calls kvm_set_user_memory_region to register a real memory slot (see the sketch after this list).
  2. SeaBIOS assigns the base address of the device's MMIO region.
  3. The first time the guest OS touches that address there is no EPT entry yet, so an EPT violation (similar to a page fault) causes a VM exit.
  4. KVM builds an EPT entry with deliberately special bits: the low three bits are set to 110 (writable and executable but not readable), an illegal combination that will trigger an EPT misconfig on later accesses, and the SPTE is additionally marked as MMIO. Because this GPA range was never registered via kvm_set_user_memory_region, KVM resolves its pfn to KVM_PFN_NOSLOT and calls set_mmio_spte to install the SPTE; that function checks for the NOSLOT marker to confirm the range is MMIO.
  5. The next time the guest OS accesses the address, the special bits cause an EPT misconfig and a VM exit into KVM, which then hands the event to QEMU through their shared memory.
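
Step 1 can be seen in QEMU's kvm_set_phys_mem (accel/kvm/kvm-all.c). The following abridged sketch reflects my reading of v7.1.0; error handling and the read-only/ROM-device special cases are trimmed:

/* accel/kvm/kvm-all.c (abridged sketch, not the full function) */
static void kvm_set_phys_mem(KVMMemoryListener *kml,
                             MemoryRegionSection *section, bool add)
{
    MemoryRegion *mr = section->mr;
    bool writable = !mr->readonly && !mr->rom_device;

    if (!memory_region_is_ram(mr)) {
        if (writable || !kvm_readonly_mem_allowed) {
            /* Plain MMIO region: no memory slot is registered with KVM,
             * so guest accesses will fault in the EPT and eventually be
             * forwarded to QEMU. */
            return;
        }
        /* read-only ROM devices are handled further down */
    }

    /* ... only RAM regions reach kvm_set_user_memory_region() ... */
}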

How KVM handles EPT misconfig

The guest reads or writes an MMIO address, triggering an EPT misconfig and trapping into KVM. vmx_handle_exit -> handle_ept_misconfig reads the accessed GPA from the VMCS and calls kvm_mmu_page_fault with PFERR_RSVD_MASK set in the error code. That path runs handle_mmio_page_fault, which returns RET_PF_EMULATE, so x86_emulate_instruction is called next; it recognizes the access as MMIO and returns 0, meaning the exit must be completed in user space. handle_ept_misconfig therefore also returns 0, the vCPU loop returns to QEMU with exit reason KVM_EXIT_MMIO, and QEMU runs the MMIO region's callback.

static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
	int ret = __vmx_handle_exit(vcpu, exit_fastpath);

	/*
	 * Exit to user space when bus lock detected to inform that there is
	 * a bus lock in guest.
	 */
	if (to_vmx(vcpu)->exit_reason.bus_lock_detected) {
		if (ret > 0)
			vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;

		vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
		return 0;
	}
	return ret;
}

Excerpt from __vmx_handle_exit

else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG)
    return handle_ept_misconfig(vcpu);
static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
{
	gpa_t gpa;
	if (!vmx_can_emulate_instruction(vcpu, NULL, 0))
		return 1;
	/*
	 * A nested guest cannot optimize MMIO vmexits, because we have an
	 * nGPA here instead of the required GPA.
	 */
	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
	if (!is_guest_mode(vcpu) &&
	    !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
		trace_kvm_fast_mmio(gpa);
		return kvm_skip_emulated_instruction(vcpu);
	}
	return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
}
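
Note the fast-MMIO branch above: before falling back to full emulation, kvm_io_bus_write probes KVM_FAST_MMIO_BUS, where ioeventfd-backed addresses (for example virtio queue-notify registers that QEMU registered as ioeventfds) are completed entirely inside the kernel by signalling an eventfd. A simplified sketch of the in-kernel handler, based on virt/kvm/eventfd.c in 5.15 (locking and some checks omitted):

/* virt/kvm/eventfd.c (abridged sketch): an MMIO write that matches a
 * registered ioeventfd just signals the eventfd; no exit to QEMU's MMIO
 * emulation path is needed. */
static int ioeventfd_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
			   gpa_t addr, int len, const void *val)
{
	struct _ioeventfd *p = to_ioeventfd(this);

	if (!ioeventfd_in_range(p, addr, len, val))
		return -EOPNOTSUPP;	/* not ours, let the next device try */

	eventfd_signal(p->eventfd, 1);
	return 0;			/* handled in the kernel */
}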

After obtaining the GPA, KVM calls kvm_mmu_page_fault

int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
		       void *insn, int insn_len)
{
	int r, emulation_type = EMULTYPE_PF;
	bool direct = vcpu->arch.mmu->direct_map;

	if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
		return RET_PF_RETRY;

	r = RET_PF_INVALID;
	if (unlikely(error_code & PFERR_RSVD_MASK)) {
		r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
		if (r == RET_PF_EMULATE)
			goto emulate;
	}

	if (r == RET_PF_INVALID) {
		r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
					  lower_32_bits(error_code), false);
		if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
			return -EIO;
	}

	if (r < 0)
		return r;
	if (r != RET_PF_EMULATE)
		return 1;

	/*
	 * Before emulating the instruction, check if the error code
	 * was due to a RO violation while translating the guest page.
	 * This can occur when using nested virtualization with nested
	 * paging in both guests. If true, we simply unprotect the page
	 * and resume the guest.
	 */
	if (vcpu->arch.mmu->direct_map &&
	    (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
		kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
		return 1;
	}

	/*
	 * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
	 * optimistically try to just unprotect the page and let the processor
	 * re-execute the instruction that caused the page fault.  Do not allow
	 * retrying MMIO emulation, as it's not only pointless but could also
	 * cause us to enter an infinite loop because the processor will keep
	 * faulting on the non-existent MMIO address.  Retrying an instruction
	 * from a nested guest is also pointless and dangerous as we are only
	 * explicitly shadowing L1's page tables, i.e. unprotecting something
	 * for L1 isn't going to magically fix whatever issue cause L2 to fail.
	 */
	if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
		emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
emulate:
	return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
				       insn_len);
}
static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
{
	u64 spte;
	bool reserved;

	if (mmio_info_in_cache(vcpu, addr, direct))
		return RET_PF_EMULATE;

	reserved = get_mmio_spte(vcpu, addr, &spte);
	if (WARN_ON(reserved))
		return -EINVAL;

	if (is_mmio_spte(spte)) {
		gfn_t gfn = get_mmio_spte_gfn(spte);
		unsigned int access = get_mmio_spte_access(spte);

		if (!check_mmio_spte(vcpu, spte))
			return RET_PF_INVALID;

		if (direct)
			addr = 0;

		trace_handle_mmio_page_fault(addr, gfn, access);
		vcpu_cache_mmio_info(vcpu, addr, gfn, access);
		return RET_PF_EMULATE;
	}

	/*
	 * If the page table is zapped by other cpus, let CPU fault again on
	 * the address.
	 */
	return RET_PF_RETRY;
}
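
handle_mmio_page_fault relies on the encoding that set_mmio_spte installed in step 4 of the flow above: the SPTE is not present from the hardware's point of view but carries a magic mask plus the gfn and access bits. A simplified sketch of the check, based on arch/x86/kvm/mmu/spte.h in 5.15 (the exact mask values depend on the CPU and on MMIO caching being enabled):

/* arch/x86/kvm/mmu/spte.h (simplified sketch) */
static inline bool is_mmio_spte(u64 spte)
{
	/* shadow_mmio_mask/value are chosen so that such an SPTE never maps
	 * a real page; with EPT they also force an EPT misconfig exit. */
	return (spte & shadow_mmio_mask) == shadow_mmio_value &&
	       likely(shadow_mmio_value);
}

get_mmio_spte_gfn and get_mmio_spte_access then extract the guest frame number and access bits that were packed into the same entry.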

Excerpt from x86_emulate_instruction

    ...
	} else if (vcpu->mmio_needed) {
		++vcpu->stat.mmio_exits;

		if (!vcpu->mmio_is_write)
			writeback = false;
		r = 0;
		vcpu->arch.complete_userspace_io = complete_emulated_mmio;
	}
    ...
    return r;
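
The run->mmio fields that QEMU later consumes are filled when the x86 emulator finds that the access targets MMIO. A heavily abridged sketch of that step, from my reading of emulator_read_write in arch/x86/kvm/x86.c (5.15); fragment handling is omitted:

/* arch/x86/kvm/x86.c, emulator_read_write() (abridged sketch) */
vcpu->mmio_needed = 1;
vcpu->mmio_cur_fragment = 0;

/* describe the first MMIO fragment in the shared kvm_run area */
vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len);
vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
vcpu->run->exit_reason = KVM_EXIT_MMIO;
vcpu->run->mmio.phys_addr = gpa;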

Source analysis of MMIO reads and writes in QEMU

In QEMU, kvm_cpu_exec(CPUState *cpu) repeatedly calls kvm_vcpu_ioctl(cpu, KVM_RUN, 0). When the ioctl returns, QEMU reads the exit reason KVM_EXIT_MMIO from run->exit_reason, takes the MMIO address, data and length from the kvm_run area shared with KVM, performs the MMIO access, and eventually invokes the read/write callbacks of the MemoryRegion.
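
The shared memory mentioned above is the mmap'ed kvm_run structure; the MMIO exit information lives in its union (abridged from include/uapi/linux/kvm.h):

/* include/uapi/linux/kvm.h (abridged) */
struct kvm_run {
	/* ... */
	__u32 exit_reason;
	/* ... */
	union {
		/* KVM_EXIT_MMIO */
		struct {
			__u64 phys_addr;
			__u8  data[8];
			__u32 len;
			__u8  is_write;
		} mmio;
		/* ... other exit reasons ... */
	};
};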

The MemoryRegion structure

struct MemoryRegion {
    Object parent_obj;

    /* private: */

    /* The following fields should fit in a cache line */
    bool romd_mode;
    bool ram;
    bool subpage;
    bool readonly; /* For RAM regions */
    bool nonvolatile;
    bool rom_device;
    bool flush_coalesced_mmio;
    uint8_t dirty_log_mask;
    bool is_iommu;
    RAMBlock *ram_block;
    Object *owner;

    const MemoryRegionOps *ops;
    void *opaque;
    MemoryRegion *container;
    int mapped_via_alias; /* Mapped via an alias, container might be NULL */
    Int128 size;
    hwaddr addr;
    void (*destructor)(MemoryRegion *mr);
    uint64_t align;
    bool terminates;
    bool ram_device;
    bool enabled;
    bool warning_printed; /* For reservations */
    uint8_t vga_logging_count;
    MemoryRegion *alias;
    hwaddr alias_offset;
    int32_t priority;
    QTAILQ_HEAD(, MemoryRegion) subregions;
    QTAILQ_ENTRY(MemoryRegion) subregions_link;
    QTAILQ_HEAD(, CoalescedMemoryRange) coalesced;
    const char *name;
    unsigned ioeventfd_nb;
    MemoryRegionIoeventfd *ioeventfds;
    RamDiscardManager *rdm; /* Only for RAM */
};
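
The ops field points to a MemoryRegionOps; the dispatch code shown below uses its read/write callbacks and the impl access-size constraints. An abridged view of the structure (from include/exec/memory.h; the *_with_attrs callbacks and a few valid/impl fields are omitted):

/* include/exec/memory.h (abridged) */
struct MemoryRegionOps {
    uint64_t (*read)(void *opaque, hwaddr addr, unsigned size);
    void (*write)(void *opaque, hwaddr addr, uint64_t data, unsigned size);

    enum device_endian endianness;

    /* guest-visible constraints */
    struct {
        unsigned min_access_size;
        unsigned max_access_size;
        bool unaligned;
    } valid;
    /* internal implementation constraints */
    struct {
        unsigned min_access_size;
        unsigned max_access_size;
        bool unaligned;
    } impl;
};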

The QEMU call chain is as follows

int kvm_cpu_exec(CPUState *cpu) {
    struct kvm_run *run = cpu->kvm_run;
    int ret, run_ret;

    cpu_exec_start(cpu);
    do {
        MemTxAttrs attrs;
        kvm_arch_pre_run(cpu, run);
        run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);
        attrs = kvm_arch_post_run(cpu, run);

        trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
        switch (run->exit_reason) {
        case KVM_EXIT_IO:
            DPRINTF("handle_io\n");
            /* Called outside BQL */
            kvm_handle_io(run->io.port, attrs,
                          (uint8_t *)run + run->io.data_offset,
                          run->io.direction,
                          run->io.size,
                          run->io.count);
            ret = 0;
            break;
        case KVM_EXIT_MMIO:
            DPRINTF("handle_mmio\n");
            /* Called outside BQL */
            address_space_rw(&address_space_memory,
                             run->mmio.phys_addr, attrs,
                             run->mmio.data,
                             run->mmio.len,
                             run->mmio.is_write);
            ret = 0;
            break;
        /* ... other exit reasons handled here ... */
        }
    } while (ret == 0);

    cpu_exec_end(cpu);
    /* ... */
    return ret;
}
MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
                             void *buf, hwaddr len, bool is_write)
{
    if (is_write) {
        return address_space_write(as, addr, attrs, buf, len);
    } else {
        return address_space_read_full(as, addr, attrs, buf, len);
    }
}
MemTxResult address_space_write(AddressSpace *as, hwaddr addr,
                                MemTxAttrs attrs,
                                const void *buf, hwaddr len)
{
    MemTxResult result = MEMTX_OK;
    FlatView *fv;

    if (len > 0) {
        RCU_READ_LOCK_GUARD();
        fv = address_space_to_flatview(as);
        result = flatview_write(fv, addr, attrs, buf, len);
    }

    return result;
}
/* Called from RCU critical section.  */
static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
                                  const void *buf, hwaddr len)
{
    hwaddr l;
    hwaddr addr1;
    MemoryRegion *mr;

    l = len;
    mr = flatview_translate(fv, addr, &addr1, &l, true, attrs);
    if (!flatview_access_allowed(mr, attrs, addr, len)) {
        return MEMTX_ACCESS_ERROR;
    }
    return flatview_write_continue(fv, addr, attrs, buf, len,
                                   addr1, l, mr);
}
/* Called within RCU critical section.  */
static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr,
                                           MemTxAttrs attrs,
                                           const void *ptr,
                                           hwaddr len, hwaddr addr1,
                                           hwaddr l, MemoryRegion *mr)
{
    uint8_t *ram_ptr;
    uint64_t val;
    MemTxResult result = MEMTX_OK;
    bool release_lock = false;
    const uint8_t *buf = ptr;

    for (;;) {
        if (!flatview_access_allowed(mr, attrs, addr1, l)) {
            result |= MEMTX_ACCESS_ERROR;
            /* Keep going. */
        } else if (!memory_access_is_direct(mr, true)) {
            release_lock |= prepare_mmio_access(mr);
            l = memory_access_size(mr, l, addr1);
            /* XXX: could force current_cpu to NULL to avoid
               potential bugs */
            val = ldn_he_p(buf, l);
            result |= memory_region_dispatch_write(mr, addr1, val,
                                                   size_memop(l), attrs);
        } else {
            /* RAM case */
            ram_ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
            memcpy(ram_ptr, buf, l);
            invalidate_and_set_dirty(mr, addr1, l);
        }

        if (release_lock) {
            qemu_mutex_unlock_iothread();
            release_lock = false;
        }

        len -= l;
        buf += l;
        addr += l;

        if (!len) {
            break;
        }

        l = len;
        mr = flatview_translate(fv, addr, &addr1, &l, true, attrs);
    }

    return result;
}
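
flatview_write_continue chooses between the direct RAM path (a plain memcpy) and the MMIO dispatch path using memory_access_is_direct. Roughly, my paraphrase of the inline helper in include/exec/memory.h: an access is "direct" only for ordinary RAM, so an MMIO region created with memory_region_init_io always takes the dispatch path. On that path, prepare_mmio_access (next listing) also takes the big QEMU lock if the caller does not already hold it.

/* include/exec/memory.h (sketch) */
static inline bool memory_access_is_direct(MemoryRegion *mr, bool is_write)
{
    if (is_write) {
        return memory_region_is_ram(mr) && !mr->readonly &&
               !mr->rom_device && !memory_region_is_ram_device(mr);
    } else {
        return (memory_region_is_ram(mr) && !memory_region_is_ram_device(mr)) ||
               memory_region_is_romd(mr);
    }
}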
bool prepare_mmio_access(MemoryRegion *mr)
{
    bool release_lock = false;

    if (!qemu_mutex_iothread_locked()) {
        qemu_mutex_lock_iothread();
        release_lock = true;
    }
    if (mr->flush_coalesced_mmio) {
        qemu_flush_coalesced_mmio_buffer();
    }

    return release_lock;
}
MemTxResult memory_region_dispatch_write(MemoryRegion *mr,
                                         hwaddr addr,
                                         uint64_t data,
                                         MemOp op,
                                         MemTxAttrs attrs)
{
    unsigned size = memop_size(op);

    if (mr->alias) {
        return memory_region_dispatch_write(mr->alias,
                                            mr->alias_offset + addr,
                                            data, op, attrs);
    }
    if (!memory_region_access_valid(mr, addr, size, true, attrs)) {
        unassigned_mem_write(mr, addr, data, size);
        return MEMTX_DECODE_ERROR;
    }

    adjust_endianness(mr, &data, op);

    if ((!kvm_eventfds_enabled()) &&
        memory_region_dispatch_write_eventfds(mr, addr, data, size, attrs)) {
        return MEMTX_OK;
    }

    if (mr->ops->write) {
        return access_with_adjusted_size(addr, &data, size,
                                         mr->ops->impl.min_access_size,
                                         mr->ops->impl.max_access_size,
                                         memory_region_write_accessor, mr,
                                         attrs);
    } else {
        return
            access_with_adjusted_size(addr, &data, size,
                                      mr->ops->impl.min_access_size,
                                      mr->ops->impl.max_access_size,
                                      memory_region_write_with_attrs_accessor,
                                      mr, attrs);
    }
}
static MemTxResult access_with_adjusted_size(hwaddr addr,
                                      uint64_t *value,
                                      unsigned size,
                                      unsigned access_size_min,
                                      unsigned access_size_max,
                                      MemTxResult (*access_fn)
                                                  (MemoryRegion *mr,
                                                   hwaddr addr,
                                                   uint64_t *value,
                                                   unsigned size,
                                                   signed shift,
                                                   uint64_t mask,
                                                   MemTxAttrs attrs),
                                      MemoryRegion *mr,
                                      MemTxAttrs attrs)
{
    uint64_t access_mask;
    unsigned access_size;
    unsigned i;
    MemTxResult r = MEMTX_OK;

    if (!access_size_min) {
        access_size_min = 1;
    }
    if (!access_size_max) {
        access_size_max = 4;
    }

    /* FIXME: support unaligned access? */
    access_size = MAX(MIN(size, access_size_max), access_size_min);
    access_mask = MAKE_64BIT_MASK(0, access_size * 8);
    if (memory_region_big_endian(mr)) {
        for (i = 0; i < size; i += access_size) {
            r |= access_fn(mr, addr + i, value, access_size,
                        (size - access_size - i) * 8, access_mask, attrs);
        }
    } else {
        for (i = 0; i < size; i += access_size) {
            r |= access_fn(mr, addr + i, value, access_size, i * 8,
                        access_mask, attrs);
        }
    }
    return r;
}
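
For example, if a device declares impl.min_access_size = 1 and impl.max_access_size = 4 and the guest performs an 8-byte write to a little-endian region, access_size becomes 4, access_mask becomes 0xffffffff, and the loop issues two callback invocations (an illustrative trace, not code from the source):

access_fn(mr, addr + 0, &data, 4, /* shift */ 0,  0xffffffff, attrs);
access_fn(mr, addr + 4, &data, 4, /* shift */ 32, 0xffffffff, attrs);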

The access_fn callback -> memory_region_write_accessor

static MemTxResult memory_region_write_accessor(MemoryRegion *mr,
                                                hwaddr addr,
                                                uint64_t *value,
                                                unsigned size,
                                                signed shift,
                                                uint64_t mask,
                                                MemTxAttrs attrs)
{
    uint64_t tmp = memory_region_shift_write_access(value, shift, mask);

    if (mr->subpage) {
        trace_memory_region_subpage_write(get_cpu_index(), mr, addr, tmp, size);
    } else if (trace_event_get_state_backends(TRACE_MEMORY_REGION_OPS_WRITE)) {
        hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr);
        trace_memory_region_ops_write(get_cpu_index(), mr, abs_addr, tmp, size,
                                      memory_region_name(mr));
    }
    mr->ops->write(mr->opaque, addr, tmp, size);
    return MEMTX_OK;
}

Invoking the callback

mr->ops->write(mr->opaque, addr, tmp, size);

The callback was set up when the MMIO region was created; take virtio-mmio as an example.

Defining the callbacks

static const MemoryRegionOps virtio_mem_ops = {
    .read = virtio_mmio_read,
    .write = virtio_mmio_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

Registering the callbacks

memory_region_init_io(&proxy->iomem, OBJECT(d),
                      &virtio_mem_ops, proxy,
                      TYPE_VIRTIO_MMIO, 0x200);
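
memory_region_init_io is what ties the ops table and the opaque pointer (here the VirtIOMMIOProxy) to the MemoryRegion. A simplified sketch of what it does, based on softmmu/memory.c (QOM initialization details omitted):

/* softmmu/memory.c (simplified sketch) */
void memory_region_init_io(MemoryRegion *mr, Object *owner,
                           const MemoryRegionOps *ops, void *opaque,
                           const char *name, uint64_t size)
{
    memory_region_init(mr, owner, name, size);
    mr->ops = ops ? ops : &unassigned_mem_ops;
    mr->opaque = opaque;   /* handed back as the callbacks' first argument */
    mr->terminates = true; /* this region handles accesses itself */
}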

Take the write callback as an example

static void virtio_mmio_write(void *opaque, hwaddr offset, uint64_t value,
                              unsigned size)
{
    VirtIOMMIOProxy *proxy = (VirtIOMMIOProxy *)opaque;
    VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);

    trace_virtio_mmio_write_offset(offset, value);

    if (!vdev) {
        /* If no backend is present, we just make all registers
         * write-ignored. This allows us to provide transports with
         * no backend plugged in.
         */
        return;
    }

    if (offset >= VIRTIO_MMIO_CONFIG) {
        offset -= VIRTIO_MMIO_CONFIG;
        if (proxy->legacy) {
            switch (size) {
            case 1:
                virtio_config_writeb(vdev, offset, value);
                break;
            case 2:
                virtio_config_writew(vdev, offset, value);
                break;
            case 4:
                virtio_config_writel(vdev, offset, value);
                break;
            default:
                abort();
            }
            return;
        } else {
            switch (size) {
            case 1:
                virtio_config_modern_writeb(vdev, offset, value);
                break;
            case 2:
                virtio_config_modern_writew(vdev, offset, value);
                break;
            case 4:
                virtio_config_modern_writel(vdev, offset, value);
                break;
            default:
                abort();
            }
            return;
        }
    }
    if (size != 4) {
        qemu_log_mask(LOG_GUEST_ERROR,
                      "%s: wrong size access to register!\n",
                      __func__);
        return;
    }
    switch (offset) {
    case VIRTIO_MMIO_DEVICE_FEATURES_SEL:
        if (value) {
            proxy->host_features_sel = 1;
        } else {
            proxy->host_features_sel = 0;
        }
        break;
    case VIRTIO_MMIO_DRIVER_FEATURES:
        if (proxy->legacy) {
            if (proxy->guest_features_sel) {
                qemu_log_mask(LOG_GUEST_ERROR,
                              "%s: attempt to write guest features with "
                              "guest_features_sel > 0 in legacy mode\n",
                              __func__);
            } else {
                virtio_set_features(vdev, value);
            }
        } else {
            proxy->guest_features[proxy->guest_features_sel] = value;
        }
        break;
    case VIRTIO_MMIO_DRIVER_FEATURES_SEL:
        if (value) {
            proxy->guest_features_sel = 1;
        } else {
            proxy->guest_features_sel = 0;
        }
        break;
    case VIRTIO_MMIO_GUEST_PAGE_SIZE:
        if (!proxy->legacy) {
            qemu_log_mask(LOG_GUEST_ERROR,
                          "%s: write to legacy register (0x%"
                          HWADDR_PRIx ") in non-legacy mode\n",
                          __func__, offset);
            return;
        }
        proxy->guest_page_shift = ctz32(value);
        if (proxy->guest_page_shift > 31) {
            proxy->guest_page_shift = 0;
        }
        trace_virtio_mmio_guest_page(value, proxy->guest_page_shift);
        break;
    case VIRTIO_MMIO_QUEUE_SEL:
        if (value < VIRTIO_QUEUE_MAX) {
            vdev->queue_sel = value;
        }
        break;
    case VIRTIO_MMIO_QUEUE_NUM:
        trace_virtio_mmio_queue_write(value, VIRTQUEUE_MAX_SIZE);
        virtio_queue_set_num(vdev, vdev->queue_sel, value);

        if (proxy->legacy) {
            virtio_queue_update_rings(vdev, vdev->queue_sel);
        } else {
            proxy->vqs[vdev->queue_sel].num = value;
        }
        break;
    case VIRTIO_MMIO_QUEUE_ALIGN:
        if (!proxy->legacy) {
            qemu_log_mask(LOG_GUEST_ERROR,
                          "%s: write to legacy register (0x%"
                          HWADDR_PRIx ") in non-legacy mode\n",
                          __func__, offset);
            return;
        }
        virtio_queue_set_align(vdev, vdev->queue_sel, value);
        break;
    case VIRTIO_MMIO_QUEUE_PFN:
        if (!proxy->legacy) {
            qemu_log_mask(LOG_GUEST_ERROR,
                          "%s: write to legacy register (0x%"
                          HWADDR_PRIx ") in non-legacy mode\n",
                          __func__, offset);
            return;
        }
        if (value == 0) {
            virtio_mmio_soft_reset(proxy);
        } else {
            virtio_queue_set_addr(vdev, vdev->queue_sel,
                                  value << proxy->guest_page_shift);
        }
        break;
    case VIRTIO_MMIO_QUEUE_READY:
        if (proxy->legacy) {
            qemu_log_mask(LOG_GUEST_ERROR,
                          "%s: write to non-legacy register (0x%"
                          HWADDR_PRIx ") in legacy mode\n",
                          __func__, offset);
            return;
        }
        if (value) {
            virtio_queue_set_num(vdev, vdev->queue_sel,
                                 proxy->vqs[vdev->queue_sel].num);
            virtio_queue_set_rings(vdev, vdev->queue_sel,
                ((uint64_t)proxy->vqs[vdev->queue_sel].desc[1]) << 32 |
                proxy->vqs[vdev->queue_sel].desc[0],
                ((uint64_t)proxy->vqs[vdev->queue_sel].avail[1]) << 32 |
                proxy->vqs[vdev->queue_sel].avail[0],
                ((uint64_t)proxy->vqs[vdev->queue_sel].used[1]) << 32 |
                proxy->vqs[vdev->queue_sel].used[0]);
            proxy->vqs[vdev->queue_sel].enabled = 1;
        } else {
            proxy->vqs[vdev->queue_sel].enabled = 0;
        }
        break;
    case VIRTIO_MMIO_QUEUE_NOTIFY:
        if (value < VIRTIO_QUEUE_MAX) {
            virtio_queue_notify(vdev, value);
        }
        break;
    case VIRTIO_MMIO_INTERRUPT_ACK:
        qatomic_and(&vdev->isr, ~value);
        virtio_update_irq(vdev);
        break;
    case VIRTIO_MMIO_STATUS:
        if (!(value & VIRTIO_CONFIG_S_DRIVER_OK)) {
            virtio_mmio_stop_ioeventfd(proxy);
        }

        if (!proxy->legacy && (value & VIRTIO_CONFIG_S_FEATURES_OK)) {
            virtio_set_features(vdev,
                                ((uint64_t)proxy->guest_features[1]) << 32 |
                                proxy->guest_features[0]);
        }

        virtio_set_status(vdev, value & 0xff);

        if (value & VIRTIO_CONFIG_S_DRIVER_OK) {
            virtio_mmio_start_ioeventfd(proxy);
        }

        if (vdev->status == 0) {
            virtio_mmio_soft_reset(proxy);
        }
        break;
    case VIRTIO_MMIO_QUEUE_DESC_LOW:
        if (proxy->legacy) {
            qemu_log_mask(LOG_GUEST_ERROR,
                          "%s: write to non-legacy register (0x%"
                          HWADDR_PRIx ") in legacy mode\n",
                          __func__, offset);
            return;
        }
        proxy->vqs[vdev->queue_sel].desc[0] = value;
        break;
    case VIRTIO_MMIO_QUEUE_DESC_HIGH:
        if (proxy->legacy) {
            qemu_log_mask(LOG_GUEST_ERROR,
                          "%s: write to non-legacy register (0x%"
                          HWADDR_PRIx ") in legacy mode\n",
                          __func__, offset);
            return;
        }
        proxy->vqs[vdev->queue_sel].desc[1] = value;
        break;
    case VIRTIO_MMIO_QUEUE_AVAIL_LOW:
        if (proxy->legacy) {
            qemu_log_mask(LOG_GUEST_ERROR,
                          "%s: write to non-legacy register (0x%"
                          HWADDR_PRIx ") in legacy mode\n",
                          __func__, offset);
            return;
        }
        proxy->vqs[vdev->queue_sel].avail[0] = value;
        break;
    case VIRTIO_MMIO_QUEUE_AVAIL_HIGH:
        if (proxy->legacy) {
            qemu_log_mask(LOG_GUEST_ERROR,
                          "%s: write to non-legacy register (0x%"
                          HWADDR_PRIx ") in legacy mode\n",
                          __func__, offset);
            return;
        }
        proxy->vqs[vdev->queue_sel].avail[1] = value;
        break;
    case VIRTIO_MMIO_QUEUE_USED_LOW:
        if (proxy->legacy) {
            qemu_log_mask(LOG_GUEST_ERROR,
                          "%s: write to non-legacy register (0x%"
                          HWADDR_PRIx ") in legacy mode\n",
                          __func__, offset);
            return;
        }
        proxy->vqs[vdev->queue_sel].used[0] = value;
        break;
    case VIRTIO_MMIO_QUEUE_USED_HIGH:
        if (proxy->legacy) {
            qemu_log_mask(LOG_GUEST_ERROR,
                          "%s: write to non-legacy register (0x%"
                          HWADDR_PRIx ") in legacy mode\n",
                          __func__, offset);
            return;
        }
        proxy->vqs[vdev->queue_sel].used[1] = value;
        break;
    case VIRTIO_MMIO_MAGIC_VALUE:
    case VIRTIO_MMIO_VERSION:
    case VIRTIO_MMIO_DEVICE_ID:
    case VIRTIO_MMIO_VENDOR_ID:
    case VIRTIO_MMIO_DEVICE_FEATURES:
    case VIRTIO_MMIO_QUEUE_NUM_MAX:
    case VIRTIO_MMIO_INTERRUPT_STATUS:
    case VIRTIO_MMIO_CONFIG_GENERATION:
        qemu_log_mask(LOG_GUEST_ERROR,
                      "%s: write to read-only register (0x%" HWADDR_PRIx ")\n",
                      __func__, offset);
        break;

    default:
        qemu_log_mask(LOG_GUEST_ERROR,
                      "%s: bad register offset (0x%" HWADDR_PRIx ")\n",
                      __func__, offset);
    }
}
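
To connect the two ends: on the guest side, the Linux virtio-mmio driver kicks a queue by writing the queue index to VIRTIO_MMIO_QUEUE_NOTIFY, and that single writel is what drives the whole EPT misconfig -> KVM_EXIT_MMIO -> virtio_mmio_write path above (or the in-kernel ioeventfd fast path when it is enabled). Abridged from drivers/virtio/virtio_mmio.c in Linux 5.15:

/* drivers/virtio/virtio_mmio.c (guest side, abridged) */
static bool vm_notify(struct virtqueue *vq)
{
	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev);

	/* Writing the queue's index to the notification register tells the
	 * device (i.e. QEMU) that new buffers are available. */
	writel(vq->index, vm_dev->base + VIRTIO_MMIO_QUEUE_NOTIFY);
	return true;
}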