/* * qemu/kvm integration * * Copyright (C) 2006-2008 Qumranet Technologies * * Licensed under the terms of the GNU GPL version 2 or higher. */ #include "config.h" #include "config-host.h" #include #include #include "hw/hw.h" #include "sysemu.h" #include "qemu-common.h" #include "console.h" #include "block.h" #include "compatfd.h" #include "gdbstub.h" #include "monitor.h" #include "qemu-kvm.h" #include "libkvm.h" #include #include #include #include #include #include "compatfd.h" #include #define false 0 #define true 1 #ifndef PR_MCE_KILL #define PR_MCE_KILL 33 #endif #ifndef BUS_MCEERR_AR #define BUS_MCEERR_AR 4 #endif #ifndef BUS_MCEERR_AO #define BUS_MCEERR_AO 5 #endif #define EXPECTED_KVM_API_VERSION 12 #if EXPECTED_KVM_API_VERSION != KVM_API_VERSION #error libkvm: userspace and kernel version mismatch #endif int kvm_allowed = 1; int kvm_irqchip = 1; int kvm_pit = 1; int kvm_pit_reinject = 1; int kvm_nested = 0; KVMState *kvm_state; kvm_context_t kvm_context; pthread_mutex_t qemu_mutex = PTHREAD_MUTEX_INITIALIZER; pthread_cond_t qemu_vcpu_cond = PTHREAD_COND_INITIALIZER; pthread_cond_t qemu_system_cond = PTHREAD_COND_INITIALIZER; pthread_cond_t qemu_pause_cond = PTHREAD_COND_INITIALIZER; pthread_cond_t qemu_work_cond = PTHREAD_COND_INITIALIZER; __thread CPUState *current_env; static int qemu_system_ready; #define SIG_IPI (SIGRTMIN+4) pthread_t io_thread; static int io_thread_fd = -1; static int io_thread_sigfd = -1; static CPUState *kvm_debug_cpu_requested; static uint64_t phys_ram_size; #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT /* The list of ioperm_data */ static QLIST_HEAD(, ioperm_data) ioperm_head; #endif //#define DEBUG_MEMREG #ifdef DEBUG_MEMREG #define DPRINTF(fmt, args...) \ do { fprintf(stderr, "%s:%d " fmt , __func__, __LINE__, ##args); } while (0) #else #define DPRINTF(fmt, args...) 
do {} while (0)
#endif

/* Round x up to a multiple of y.  The mask with ~(y - 1) only gives a
 * correct result when y is a power of two. */
#define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1))

int kvm_abi = EXPECTED_KVM_API_VERSION;
int kvm_page_size;

#ifdef KVM_CAP_SET_GUEST_DEBUG
/*
 * Hand a debug exit to kvm_arch_debug().  When that returns non-zero the
 * vcpu is remembered in kvm_debug_cpu_requested and flagged as stopped.
 * Returns whatever kvm_arch_debug() returned.
 */
static int kvm_debug(CPUState *env, struct kvm_debug_exit_arch *arch_info)
{
    int handle = kvm_arch_debug(arch_info);

    if (handle) {
        kvm_debug_cpu_requested = env;
        env->stopped = 1;
    }
    return handle;
}
#endif

/* Report an exit reason nobody handles; always returns -EINVAL. */
static int handle_unhandled(uint64_t reason)
{
    fprintf(stderr, "kvm: unhandled exit %" PRIx64 "\n", reason);
    return -EINVAL;
}

/* Mark gsi as used in kvm->used_gsi_bitmap (32 GSIs per word).
 * GSIs at or above kvm->max_gsi are only reported through DPRINTF. */
static inline void set_gsi(kvm_context_t kvm, unsigned int gsi)
{
    uint32_t *bitmap = kvm->used_gsi_bitmap;

    if (gsi < kvm->max_gsi)
        bitmap[gsi / 32] |= 1U << (gsi % 32);
    else
        DPRINTF("Invalid GSI %u\n", gsi);
}

/* Mark gsi as free in kvm->used_gsi_bitmap; counterpart of set_gsi(). */
static inline void clear_gsi(kvm_context_t kvm, unsigned int gsi)
{
    uint32_t *bitmap = kvm->used_gsi_bitmap;

    if (gsi < kvm->max_gsi)
        bitmap[gsi / 32] &= ~(1U << (gsi % 32));
    else
        DPRINTF("Invalid GSI %u\n", gsi);
}

/* Userspace bookkeeping for one memory slot.  len == 0 means the slot is
 * free (see init_slots(), get_free_slot() and free_slot()). */
struct slot_info {
    unsigned long phys_addr;
    unsigned long len;
    unsigned long userspace_addr;
    unsigned flags;
    int logging_count;          /* nesting count of dirty-log enable requests */
};

struct slot_info slots[KVM_MAX_NUM_MEM_REGIONS];

/* Mark every slot as free by zeroing its length. */
static void init_slots(void)
{
    int i;

    for (i = 0; i < KVM_MAX_NUM_MEM_REGIONS; ++i)
        slots[i].len = 0;
}

/* Return the index of the first free slot, or -1 when all are in use. */
static int get_free_slot(kvm_context_t kvm)
{
    int i;
    int tss_ext;

#if defined(KVM_CAP_SET_TSS_ADDR) && !defined(__s390__)
    tss_ext = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_SET_TSS_ADDR);
#else
    tss_ext = 0;
#endif

    /*
     * on older kernels where the set tss ioctl is not supported we must save
     * slot 0 to hold the extended memory, as the vmx will use the last 3
     * pages of this slot.
*/
    if (tss_ext > 0)
        i = 0;
    else
        i = 1;

    for (; i < KVM_MAX_NUM_MEM_REGIONS; ++i)
        if (!slots[i].len)
            return i;
    return -1;
}

/* Record the parameters of a slot that has just been registered.
 * logging_count is left untouched. */
static void register_slot(int slot, unsigned long phys_addr,
                          unsigned long len, unsigned long userspace_addr,
                          unsigned flags)
{
    slots[slot].phys_addr = phys_addr;
    slots[slot].len = len;
    slots[slot].userspace_addr = userspace_addr;
    slots[slot].flags = flags;
}

/* Mark a slot as free and reset its dirty-logging nesting count. */
static void free_slot(int slot)
{
    slots[slot].len = 0;
    slots[slot].logging_count = 0;
}

/* Return the index of the in-use slot whose range contains phys_addr,
 * or -1 if there is none. */
static int get_slot(unsigned long phys_addr)
{
    int i;

    for (i = 0; i < KVM_MAX_NUM_MEM_REGIONS; ++i) {
        if (slots[i].len && slots[i].phys_addr <= phys_addr &&
            (slots[i].phys_addr + slots[i].len - 1) >= phys_addr)
            return i;
    }
    return -1;
}

/* Returns -1 if the range [phys_addr, phys_addr + size) is not totally
 * contained in any slot, and the number of the containing slot otherwise */
static int get_container_slot(uint64_t phys_addr, unsigned long size)
{
    int i;

    for (i = 0; i < KVM_MAX_NUM_MEM_REGIONS; ++i)
        if (slots[i].len && slots[i].phys_addr <= phys_addr &&
            (slots[i].phys_addr + slots[i].len) >= phys_addr + size)
            return i;
    return -1;
}

/* Return 1 when the given range lies entirely inside one registered slot,
 * 0 otherwise. */
int kvm_is_containing_region(kvm_context_t kvm, unsigned long phys_addr,
                             unsigned long size)
{
    int slot = get_container_slot(phys_addr, size);

    if (slot == -1)
        return 0;
    return 1;
}

/*
 * dirty pages logging control
 *
 * Replace the bits selected by mask in the flags of the slot containing
 * phys_addr with the given flags and, when that changes anything, push
 * the slot to the kernel again.  Returns 1 when no slot contains
 * phys_addr, 0 when nothing had to change, otherwise the ioctl result.
 */
static int kvm_dirty_pages_log_change(kvm_context_t kvm,
                                      unsigned long phys_addr, unsigned flags,
                                      unsigned mask)
{
    int r = -1;
    int slot = get_slot(phys_addr);

    if (slot == -1) {
        fprintf(stderr, "BUG: %s: invalid parameters\n", __FUNCTION__);
        return 1;
    }

    flags = (slots[slot].flags & ~mask) | flags;
    if (flags == slots[slot].flags)
        return 0;
    slots[slot].flags = flags;

    {
        struct kvm_userspace_memory_region mem = {
            .slot = slot,
            .memory_size = slots[slot].len,
            .guest_phys_addr = slots[slot].phys_addr,
            .userspace_addr = slots[slot].userspace_addr,
            .flags = slots[slot].flags,
        };

        DPRINTF("slot %d start %llx len %llx flags %x\n",
                mem.slot, mem.guest_phys_addr, mem.memory_size, mem.flags);
        r =
kvm_vm_ioctl(kvm_state, KVM_SET_USER_MEMORY_REGION, &mem);
        if (r < 0)
            fprintf(stderr, "%s: %m\n", __FUNCTION__);
    }
    return r;
}

/* Call change() on every in-use slot, stopping at the first non-zero
 * result, which is returned.  Returns 0 when every call succeeded. */
static int kvm_dirty_pages_log_change_all(kvm_context_t kvm,
                                          int (*change)(kvm_context_t kvm,
                                                        uint64_t start,
                                                        uint64_t len))
{
    int i, r;

    for (i = r = 0; i < KVM_MAX_NUM_MEM_REGIONS && r == 0; i++) {
        if (slots[i].len)
            r = change(kvm, slots[i].phys_addr, slots[i].len);
    }
    return r;
}

/*
 * Take one dirty-logging reference on the slot containing phys_addr.
 * Only the first reference (logging_count going from 0 to 1) actually
 * turns KVM_MEM_LOG_DIRTY_PAGES on.  Returns -EINVAL when no slot
 * contains phys_addr.
 */
int kvm_dirty_pages_log_enable_slot(kvm_context_t kvm, uint64_t phys_addr,
                                    uint64_t len)
{
    int slot = get_slot(phys_addr);

    DPRINTF("start %" PRIx64 " len %" PRIx64 "\n", phys_addr, len);
    if (slot == -1) {
        fprintf(stderr, "BUG: %s: invalid parameters\n", __func__);
        return -EINVAL;
    }

    if (slots[slot].logging_count++)
        return 0;

    return kvm_dirty_pages_log_change(kvm, slots[slot].phys_addr,
                                      KVM_MEM_LOG_DIRTY_PAGES,
                                      KVM_MEM_LOG_DIRTY_PAGES);
}

/*
 * Drop one dirty-logging reference on the slot containing phys_addr.
 * KVM_MEM_LOG_DIRTY_PAGES is cleared only when the count reaches zero.
 * Returns -EINVAL when no slot contains phys_addr.
 */
int kvm_dirty_pages_log_disable_slot(kvm_context_t kvm, uint64_t phys_addr,
                                     uint64_t len)
{
    int slot = get_slot(phys_addr);

    if (slot == -1) {
        fprintf(stderr, "BUG: %s: invalid parameters\n", __func__);
        return -EINVAL;
    }

    if (--slots[slot].logging_count)
        return 0;

    return kvm_dirty_pages_log_change(kvm, slots[slot].phys_addr, 0,
                                      KVM_MEM_LOG_DIRTY_PAGES);
}

/**
 * Enable dirty page logging for all memory regions
 */
int kvm_dirty_pages_log_enable_all(kvm_context_t kvm)
{
    /* dirty_pages_log_all makes repeated calls a no-op. */
    if (kvm->dirty_pages_log_all)
        return 0;
    kvm->dirty_pages_log_all = 1;
    return kvm_dirty_pages_log_change_all(kvm, kvm_dirty_pages_log_enable_slot);
}

/**
 * Enable dirty page logging only for memory regions that were created with
 * dirty logging enabled (disable for all other memory regions).
*/ int kvm_dirty_pages_log_reset(kvm_context_t kvm) { if (!kvm->dirty_pages_log_all) return 0; kvm->dirty_pages_log_all = 0; return kvm_dirty_pages_log_change_all(kvm, kvm_dirty_pages_log_disable_slot); } static int kvm_create_context(void); int kvm_init(int smp_cpus) { int fd; int r, gsi_count; fd = open("/dev/kvm", O_RDWR); if (fd == -1) { perror("open /dev/kvm"); return -1; } r = ioctl(fd, KVM_GET_API_VERSION, 0); if (r == -1) { fprintf(stderr, "kvm kernel version too old: " "KVM_GET_API_VERSION ioctl not supported\n"); goto out_close; } if (r < EXPECTED_KVM_API_VERSION) { fprintf(stderr, "kvm kernel version too old: " "We expect API version %d or newer, but got " "version %d\n", EXPECTED_KVM_API_VERSION, r); goto out_close; } if (r > EXPECTED_KVM_API_VERSION) { fprintf(stderr, "kvm userspace version too old\n"); goto out_close; } kvm_abi = r; kvm_page_size = getpagesize(); kvm_state = qemu_mallocz(sizeof(*kvm_state)); kvm_context = &kvm_state->kvm_context; kvm_state->fd = fd; kvm_state->vmfd = -1; kvm_context->opaque = cpu_single_env; kvm_context->dirty_pages_log_all = 0; kvm_context->no_irqchip_creation = 0; kvm_context->no_pit_creation = 0; #ifdef KVM_CAP_SET_GUEST_DEBUG QTAILQ_INIT(&kvm_state->kvm_sw_breakpoints); #endif gsi_count = kvm_get_gsi_count(kvm_context); if (gsi_count > 0) { int gsi_bits, i; /* Round up so we can search ints using ffs */ gsi_bits = ALIGN(gsi_count, 32); kvm_context->used_gsi_bitmap = qemu_mallocz(gsi_bits / 8); kvm_context->max_gsi = gsi_bits; /* Mark any over-allocated bits as already in use */ for (i = gsi_count; i < gsi_bits; i++) set_gsi(kvm_context, i); } kvm_cpu_register_phys_memory_client(); pthread_mutex_lock(&qemu_mutex); return kvm_create_context(); out_close: close(fd); return -1; } static void kvm_finalize(KVMState *s) { /* FIXME if (kvm->vcpu_fd[0] != -1) close(kvm->vcpu_fd[0]); if (kvm->vm_fd != -1) close(kvm->vm_fd); */ close(s->fd); free(s); } void kvm_disable_irqchip_creation(kvm_context_t kvm) { 
kvm->no_irqchip_creation = 1; } void kvm_disable_pit_creation(kvm_context_t kvm) { kvm->no_pit_creation = 1; } static void kvm_create_vcpu(CPUState *env, int id) { long mmap_size; int r; KVMState *s = kvm_state; r = kvm_vm_ioctl(kvm_state, KVM_CREATE_VCPU, id); if (r < 0) { fprintf(stderr, "kvm_create_vcpu: %m\n"); return; } env->kvm_fd = r; env->kvm_state = kvm_state; mmap_size = kvm_ioctl(kvm_state, KVM_GET_VCPU_MMAP_SIZE, 0); if (mmap_size < 0) { fprintf(stderr, "get vcpu mmap size: %m\n"); goto err_fd; } env->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, env->kvm_fd, 0); if (env->kvm_run == MAP_FAILED) { fprintf(stderr, "mmap vcpu area: %m\n"); goto err_fd; } #ifdef KVM_CAP_COALESCED_MMIO if (s->coalesced_mmio && !s->coalesced_mmio_ring) s->coalesced_mmio_ring = (void *) env->kvm_run + s->coalesced_mmio * PAGE_SIZE; #endif return; err_fd: close(env->kvm_fd); } static int kvm_set_boot_vcpu_id(kvm_context_t kvm, uint32_t id) { #ifdef KVM_CAP_SET_BOOT_CPU_ID int r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_SET_BOOT_CPU_ID); if (r > 0) return kvm_vm_ioctl(kvm_state, KVM_SET_BOOT_CPU_ID, id); return -ENOSYS; #else return -ENOSYS; #endif } int kvm_create_vm(kvm_context_t kvm) { int fd; #ifdef KVM_CAP_IRQ_ROUTING kvm->irq_routes = qemu_mallocz(sizeof(*kvm->irq_routes)); kvm->nr_allocated_irq_routes = 0; #endif fd = kvm_ioctl(kvm_state, KVM_CREATE_VM, 0); if (fd < 0) { fprintf(stderr, "kvm_create_vm: %m\n"); return -1; } kvm_state->vmfd = fd; return 0; } static int kvm_create_default_phys_mem(kvm_context_t kvm, unsigned long phys_mem_bytes, void **vm_mem) { #ifdef KVM_CAP_USER_MEMORY int r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_USER_MEMORY); if (r > 0) return 0; fprintf(stderr, "Hypervisor too old: KVM_CAP_USER_MEMORY extension not supported\n"); #else #error Hypervisor too old: KVM_CAP_USER_MEMORY extension not supported #endif return -1; } void kvm_create_irqchip(kvm_context_t kvm) { int r; kvm->irqchip_in_kernel = 0; 
#ifdef KVM_CAP_IRQCHIP if (!kvm->no_irqchip_creation) { r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_IRQCHIP); if (r > 0) { /* kernel irqchip supported */ r = kvm_vm_ioctl(kvm_state, KVM_CREATE_IRQCHIP); if (r >= 0) { kvm->irqchip_inject_ioctl = KVM_IRQ_LINE; #if defined(KVM_CAP_IRQ_INJECT_STATUS) && defined(KVM_IRQ_LINE_STATUS) r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_IRQ_INJECT_STATUS); if (r > 0) kvm->irqchip_inject_ioctl = KVM_IRQ_LINE_STATUS; #endif kvm->irqchip_in_kernel = 1; } else fprintf(stderr, "Create kernel PIC irqchip failed\n"); } } #endif kvm_state->irqchip_in_kernel = kvm->irqchip_in_kernel; } int kvm_create(kvm_context_t kvm, unsigned long phys_mem_bytes, void **vm_mem) { int r; r = kvm_create_vm(kvm); if (r < 0) return r; r = kvm_arch_create(kvm, phys_mem_bytes, vm_mem); if (r < 0) return r; init_slots(); r = kvm_create_default_phys_mem(kvm, phys_mem_bytes, vm_mem); if (r < 0) return r; kvm_create_irqchip(kvm); return 0; } int kvm_register_phys_mem(kvm_context_t kvm, unsigned long phys_start, void *userspace_addr, unsigned long len, int log) { struct kvm_userspace_memory_region memory = { .memory_size = len, .guest_phys_addr = phys_start, .userspace_addr = (unsigned long) (uintptr_t) userspace_addr, .flags = log ? KVM_MEM_LOG_DIRTY_PAGES : 0, }; int r; memory.slot = get_free_slot(kvm); DPRINTF ("memory: gpa: %llx, size: %llx, uaddr: %llx, slot: %x, flags: %x\n", memory.guest_phys_addr, memory.memory_size, memory.userspace_addr, memory.slot, memory.flags); r = kvm_vm_ioctl(kvm_state, KVM_SET_USER_MEMORY_REGION, &memory); if (r < 0) { fprintf(stderr, "create_userspace_phys_mem: %s\n", strerror(-r)); return -1; } register_slot(memory.slot, memory.guest_phys_addr, memory.memory_size, memory.userspace_addr, memory.flags); return 0; } /* destroy/free a whole slot. 
* phys_start, len and slot are the params passed to kvm_create_phys_mem() */ void kvm_destroy_phys_mem(kvm_context_t kvm, unsigned long phys_start, unsigned long len) { int slot; int r; struct kvm_userspace_memory_region memory = { .memory_size = 0, .guest_phys_addr = phys_start, .userspace_addr = 0, .flags = 0, }; slot = get_slot(phys_start); if ((slot >= KVM_MAX_NUM_MEM_REGIONS) || (slot == -1)) { fprintf(stderr, "BUG: %s: invalid parameters (slot=%d)\n", __FUNCTION__, slot); return; } if (phys_start != slots[slot].phys_addr) { fprintf(stderr, "WARNING: %s: phys_start is 0x%lx expecting 0x%lx\n", __FUNCTION__, phys_start, slots[slot].phys_addr); phys_start = slots[slot].phys_addr; } memory.slot = slot; DPRINTF("slot %d start %llx len %llx flags %x\n", memory.slot, memory.guest_phys_addr, memory.memory_size, memory.flags); r = kvm_vm_ioctl(kvm_state, KVM_SET_USER_MEMORY_REGION, &memory); if (r < 0) { fprintf(stderr, "destroy_userspace_phys_mem: %s", strerror(-r)); return; } free_slot(memory.slot); } void kvm_unregister_memory_area(kvm_context_t kvm, uint64_t phys_addr, unsigned long size) { int slot = get_container_slot(phys_addr, size); if (slot != -1) { DPRINTF("Unregistering memory region %" PRIx64 " (%lx)\n", phys_addr, size); kvm_destroy_phys_mem(kvm, phys_addr, size); return; } } static int kvm_get_map(kvm_context_t kvm, int ioctl_num, int slot, void *buf) { int r; struct kvm_dirty_log log = { .slot = slot, }; log.dirty_bitmap = buf; r = kvm_vm_ioctl(kvm_state, ioctl_num, &log); if (r < 0) return r; return 0; } int kvm_get_dirty_pages(kvm_context_t kvm, unsigned long phys_addr, void *buf) { int slot; slot = get_slot(phys_addr); return kvm_get_map(kvm, KVM_GET_DIRTY_LOG, slot, buf); } int kvm_get_dirty_pages_range(kvm_context_t kvm, unsigned long phys_addr, unsigned long len, void *opaque, int (*cb)(unsigned long start, unsigned long len, void *bitmap, void *opaque)) { int i; int r; unsigned long end_addr = phys_addr + len; void *buf; for (i = 0; i < 
KVM_MAX_NUM_MEM_REGIONS; ++i) { if ((slots[i].len && (uint64_t) slots[i].phys_addr >= phys_addr) && ((uint64_t) slots[i].phys_addr + slots[i].len <= end_addr)) { buf = qemu_malloc(BITMAP_SIZE(slots[i].len)); r = kvm_get_map(kvm, KVM_GET_DIRTY_LOG, i, buf); if (r) { qemu_free(buf); return r; } r = cb(slots[i].phys_addr, slots[i].len, buf, opaque); qemu_free(buf); if (r) return r; } } return 0; } #ifdef KVM_CAP_IRQCHIP int kvm_set_irq_level(kvm_context_t kvm, int irq, int level, int *status) { struct kvm_irq_level event; int r; if (!kvm->irqchip_in_kernel) return 0; event.level = level; event.irq = irq; r = kvm_vm_ioctl(kvm_state, kvm->irqchip_inject_ioctl, &event); if (r < 0) perror("kvm_set_irq_level"); if (status) { #ifdef KVM_CAP_IRQ_INJECT_STATUS *status = (kvm->irqchip_inject_ioctl == KVM_IRQ_LINE) ? 1 : event.status; #else *status = 1; #endif } return 1; } int kvm_get_irqchip(kvm_context_t kvm, struct kvm_irqchip *chip) { int r; if (!kvm->irqchip_in_kernel) return 0; r = kvm_vm_ioctl(kvm_state, KVM_GET_IRQCHIP, chip); if (r < 0) { perror("kvm_get_irqchip\n"); } return r; } int kvm_set_irqchip(kvm_context_t kvm, struct kvm_irqchip *chip) { int r; if (!kvm->irqchip_in_kernel) return 0; r = kvm_vm_ioctl(kvm_state, KVM_SET_IRQCHIP, chip); if (r < 0) { perror("kvm_set_irqchip\n"); } return r; } #endif static int handle_debug(CPUState *env) { #ifdef KVM_CAP_SET_GUEST_DEBUG struct kvm_run *run = env->kvm_run; return kvm_debug(env, &run->debug.arch); #else return 0; #endif } int kvm_get_regs(CPUState *env, struct kvm_regs *regs) { return kvm_vcpu_ioctl(env, KVM_GET_REGS, regs); } int kvm_set_regs(CPUState *env, struct kvm_regs *regs) { return kvm_vcpu_ioctl(env, KVM_SET_REGS, regs); } int kvm_get_fpu(CPUState *env, struct kvm_fpu *fpu) { return kvm_vcpu_ioctl(env, KVM_GET_FPU, fpu); } int kvm_set_fpu(CPUState *env, struct kvm_fpu *fpu) { return kvm_vcpu_ioctl(env, KVM_SET_FPU, fpu); } int kvm_get_sregs(CPUState *env, struct kvm_sregs *sregs) { return 
kvm_vcpu_ioctl(env, KVM_GET_SREGS, sregs);
}

/* Write the special registers in sregs to the vcpu. */
int kvm_set_sregs(CPUState *env, struct kvm_sregs *sregs)
{
    return kvm_vcpu_ioctl(env, KVM_SET_SREGS, sregs);
}

#ifdef KVM_CAP_MP_STATE
/* Read the vcpu's multiprocessing state.  Returns -ENOSYS when the kernel
 * does not report KVM_CAP_MP_STATE. */
int kvm_get_mpstate(CPUState *env, struct kvm_mp_state *mp_state)
{
    int r;

    r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_MP_STATE);
    if (r > 0)
        return kvm_vcpu_ioctl(env, KVM_GET_MP_STATE, mp_state);
    return -ENOSYS;
}

/* Write the vcpu's multiprocessing state.  Returns -ENOSYS when the kernel
 * does not report KVM_CAP_MP_STATE. */
int kvm_set_mpstate(CPUState *env, struct kvm_mp_state *mp_state)
{
    int r;

    r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_MP_STATE);
    if (r > 0)
        return kvm_vcpu_ioctl(env, KVM_SET_MP_STATE, mp_state);
    return -ENOSYS;
}
#endif

/* Complete a KVM_EXIT_MMIO exit by performing the access described in
 * kvm_run->mmio through cpu_physical_memory_rw().  Always returns 0. */
static int handle_mmio(CPUState *env)
{
    unsigned long addr = env->kvm_run->mmio.phys_addr;
    struct kvm_run *kvm_run = env->kvm_run;
    void *data = kvm_run->mmio.data;

    /* hack: Red Hat 7.1 generates these weird accesses. */
    if ((addr > 0xa0000 - 4 && addr <= 0xa0000) && kvm_run->mmio.len == 3)
        return 0;

    cpu_physical_memory_rw(addr, data, kvm_run->mmio.len, kvm_run->mmio.is_write);
    return 0;
}

/* Called from kvm_run() when KVM_RUN was interrupted (EINTR/EAGAIN).
 * Always returns 1, which makes kvm_run() return to its caller. */
int handle_io_window(kvm_context_t kvm)
{
    return 1;
}

/* Handle KVM_EXIT_SHUTDOWN: stop this vcpu and request a system reset. */
int handle_shutdown(kvm_context_t kvm, CPUState *env)
{
    /* stop the current vcpu from going back to guest mode */
    env->stopped = 1;

    qemu_system_reset_request();
    return 1;
}

/* Let the architecture code push a pending NMI, if user NMI support is
 * compiled in. */
static inline void push_nmi(kvm_context_t kvm)
{
#ifdef KVM_CAP_USER_NMI
    kvm_arch_push_nmi(kvm->opaque);
#endif                          /* KVM_CAP_USER_NMI */
}

/* Runs right after KVM_RUN returns: takes qemu_mutex again, lets the
 * architecture code process the exit and restores cpu_single_env. */
void post_kvm_run(kvm_context_t kvm, CPUState *env)
{
    pthread_mutex_lock(&qemu_mutex);
    kvm_arch_post_run(env, env->kvm_run);
    cpu_single_env = env;
}

/* Runs right before KVM_RUN: lets the architecture code prepare the
 * entry, writes back registers if they were modified in userspace and
 * drops qemu_mutex.  Always returns 0. */
int pre_kvm_run(kvm_context_t kvm, CPUState *env)
{
    kvm_arch_pre_run(env, env->kvm_run);

    if (env->kvm_vcpu_dirty) {
        kvm_arch_load_regs(env, KVM_PUT_RUNTIME_STATE);
        env->kvm_vcpu_dirty = 0;
    }

    pthread_mutex_unlock(&qemu_mutex);
    return 0;
}

/* Return the ready_for_interrupt_injection flag of the vcpu's kvm_run. */
int kvm_is_ready_for_interrupt_injection(CPUState *env)
{
    return env->kvm_run->ready_for_interrupt_injection;
}

static int kvm_handle_internal_error(kvm_context_t kvm,
                                     CPUState *env,
                                     struct kvm_run *run)
{
fprintf(stderr, "KVM internal error. Suberror: %d\n",
            run->internal.suberror);
#ifdef KVM_CAP_INTERNAL_ERROR_DATA
    /* Dump the extra data words when the kernel provides them. */
    if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
        int i;

        for (i = 0; i < run->internal.ndata; ++i) {
            fprintf(stderr, "extra data[%d]: %"PRIx64"\n",
                    i, (uint64_t)run->internal.data[i]);
        }
    }
#endif
    kvm_show_regs(env);
    if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION)
        fprintf(stderr, "emulation failure\n");
    vm_stop(0);
    return 1;
}

/*
 * Enter the guest on this vcpu and handle exits until a handler returns
 * non-zero.  A zero handler result loops back to "again" and re-enters
 * the guest.  qemu_mutex is released by pre_kvm_run() for the duration of
 * the KVM_RUN ioctl and re-taken by post_kvm_run().
 *
 * Returns the first non-zero handler result, or -errno when KVM_RUN fails
 * with anything other than EINTR/EAGAIN.
 */
int kvm_run(CPUState *env)
{
    int r;
    kvm_context_t kvm = &env->kvm_state->kvm_context;
    struct kvm_run *run = env->kvm_run;
    int fd = env->kvm_fd;

  again:
    push_nmi(kvm);
#if !defined(__s390__)
    if (!kvm->irqchip_in_kernel)
        run->request_interrupt_window = kvm_arch_try_push_interrupts(env);
#endif

    r = pre_kvm_run(kvm, env);
    if (r)
        return r;
    r = ioctl(fd, KVM_RUN, 0);

    if (r == -1 && errno != EINTR && errno != EAGAIN) {
        /* Save errno first: post_kvm_run() may change it. */
        r = -errno;
        post_kvm_run(kvm, env);
        fprintf(stderr, "kvm_run: %s\n", strerror(-r));
        return r;
    }

    post_kvm_run(kvm, env);

#if defined(KVM_CAP_COALESCED_MMIO)
    /* Replay writes the kernel batched in the coalesced MMIO ring. */
    if (kvm_state->coalesced_mmio) {
        struct kvm_coalesced_mmio_ring *ring =
            (void *) run + kvm_state->coalesced_mmio * PAGE_SIZE;
        while (ring->first != ring->last) {
            cpu_physical_memory_rw(ring->coalesced_mmio[ring->first].phys_addr,
                           &ring->coalesced_mmio[ring->first].data[0],
                           ring->coalesced_mmio[ring->first].len, 1);
            smp_wmb();
            ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
        }
    }
#endif

#if !defined(__s390__)
    /* r is still -1 here only for EINTR/EAGAIN. */
    if (r == -1) {
        r = handle_io_window(kvm);
        goto more;
    }
#endif
    if (1) {
        switch (run->exit_reason) {
        case KVM_EXIT_UNKNOWN:
            r = handle_unhandled(run->hw.hardware_exit_reason);
            break;
        case KVM_EXIT_FAIL_ENTRY:
            r = handle_unhandled(run->fail_entry.hardware_entry_failure_reason);
            break;
        case KVM_EXIT_EXCEPTION:
            fprintf(stderr, "exception %d (%x)\n", run->ex.exception,
                    run->ex.error_code);
            kvm_show_regs(env);
            kvm_show_code(env);
            abort();
            break;
        case KVM_EXIT_IO:
            r = kvm_handle_io(run->io.port,
                                (uint8_t *)run + run->io.data_offset,
run->io.direction,
                                run->io.size,
                                run->io.count);
            /* The result of kvm_handle_io() is discarded; 0 re-enters
             * the guest. */
            r = 0;
            break;
        case KVM_EXIT_DEBUG:
            r = handle_debug(env);
            break;
        case KVM_EXIT_MMIO:
            r = handle_mmio(env);
            break;
        case KVM_EXIT_HLT:
            r = kvm_arch_halt(env);
            break;
        case KVM_EXIT_IRQ_WINDOW_OPEN:
            break;
        case KVM_EXIT_SHUTDOWN:
            r = handle_shutdown(kvm, env);
            break;
#if defined(__s390__)
        case KVM_EXIT_S390_SIEIC:
            r = kvm_s390_handle_intercept(kvm, env, run);
            break;
        case KVM_EXIT_S390_RESET:
            r = kvm_s390_handle_reset(kvm, env, run);
            break;
#endif
        case KVM_EXIT_INTERNAL_ERROR:
            r = kvm_handle_internal_error(kvm, env, run);
            break;
        default:
            /* Give the architecture code a chance before giving up. */
            if (kvm_arch_run(env)) {
                fprintf(stderr, "unhandled vm exit: 0x%x\n", run->exit_reason);
                kvm_show_regs(env);
                abort();
            }
            break;
        }
    }
more:
    if (!r)
        goto again;
    return r;
}

/* Queue interrupt vector irq on the vcpu via KVM_INTERRUPT. */
int kvm_inject_irq(CPUState *env, unsigned irq)
{
    struct kvm_interrupt intr;

    intr.irq = irq;
    return kvm_vcpu_ioctl(env, KVM_INTERRUPT, &intr);
}

/* Inject an NMI into the vcpu; -ENOSYS when user NMI support is not
 * compiled in. */
int kvm_inject_nmi(CPUState *env)
{
#ifdef KVM_CAP_USER_NMI
    return kvm_vcpu_ioctl(env, KVM_NMI);
#else
    return -ENOSYS;
#endif
}

/*
 * Probe for coalesced MMIO.  On a positive answer from the kernel that
 * value is stored in kvm_state->coalesced_mmio (it is used elsewhere as
 * the page offset of the ring inside the kvm_run mapping) and 0 is
 * returned; otherwise coalesced_mmio stays 0 and the (non-positive)
 * probe result is returned.
 */
int kvm_init_coalesced_mmio(kvm_context_t kvm)
{
    int r = 0;
    kvm_state->coalesced_mmio = 0;
#ifdef KVM_CAP_COALESCED_MMIO
    r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_COALESCED_MMIO);
    if (r > 0) {
        kvm_state->coalesced_mmio = r;
        return 0;
    }
#endif
    return r;
}

#ifdef KVM_CAP_DEVICE_ASSIGNMENT
/* Assign a host PCI device to the VM. */
int kvm_assign_pci_device(kvm_context_t kvm,
                          struct kvm_assigned_pci_dev *assigned_dev)
{
    return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_PCI_DEVICE, assigned_dev);
}

/* Assign an IRQ with the older KVM_ASSIGN_IRQ ioctl. */
static int kvm_old_assign_irq(kvm_context_t kvm,
                              struct kvm_assigned_irq *assigned_irq)
{
    return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_IRQ, assigned_irq);
}

#ifdef KVM_CAP_ASSIGN_DEV_IRQ
/* Assign an IRQ to an assigned device, preferring KVM_ASSIGN_DEV_IRQ and
 * falling back to kvm_old_assign_irq() when the kernel lacks it. */
int kvm_assign_irq(kvm_context_t kvm, struct kvm_assigned_irq *assigned_irq)
{
    int ret;

    ret = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_ASSIGN_DEV_IRQ);
    if (ret > 0) {
        return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_DEV_IRQ, assigned_irq);
    }

    return kvm_old_assign_irq(kvm, assigned_irq);
}

int
kvm_deassign_irq(kvm_context_t kvm, struct kvm_assigned_irq *assigned_irq)
{
    return kvm_vm_ioctl(kvm_state, KVM_DEASSIGN_DEV_IRQ, assigned_irq);
}
#else
/* Without KVM_CAP_ASSIGN_DEV_IRQ only the old ioctl is available. */
int kvm_assign_irq(kvm_context_t kvm, struct kvm_assigned_irq *assigned_irq)
{
    return kvm_old_assign_irq(kvm, assigned_irq);
}
#endif
#endif

#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
/* Detach a previously assigned PCI device from the VM. */
int kvm_deassign_pci_device(kvm_context_t kvm,
                            struct kvm_assigned_pci_dev *assigned_dev)
{
    return kvm_vm_ioctl(kvm_state, KVM_DEASSIGN_PCI_DEVICE, assigned_dev);
}
#endif

/* Return the kernel's (positive) answer for
 * KVM_CAP_DESTROY_MEMORY_REGION_WORKS, or 0 when it is unsupported, the
 * query fails or the capability is unknown at build time. */
int kvm_destroy_memory_region_works(kvm_context_t kvm)
{
    int ret = 0;

#ifdef KVM_CAP_DESTROY_MEMORY_REGION_WORKS
    ret =
        kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION,
                  KVM_CAP_DESTROY_MEMORY_REGION_WORKS);
    if (ret <= 0)
        ret = 0;
#endif
    return ret;
}

/* Pass pit_reinject to the kernel via KVM_REINJECT_CONTROL.  Returns
 * -ENOSYS when the capability is not available. */
int kvm_reinject_control(kvm_context_t kvm, int pit_reinject)
{
#ifdef KVM_CAP_REINJECT_CONTROL
    int r;
    struct kvm_reinject_control control;

    control.pit_reinject = pit_reinject;

    r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_REINJECT_CONTROL);
    if (r > 0) {
        return kvm_vm_ioctl(kvm_state, KVM_REINJECT_CONTROL, &control);
    }
#endif
    return -ENOSYS;
}

/* Return the KVM_CAP_IRQ_ROUTING extension value (0 when routing support
 * is not compiled in). */
int kvm_has_gsi_routing(kvm_context_t kvm)
{
    int r = 0;

#ifdef KVM_CAP_IRQ_ROUTING
    r = kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
#endif
    return r;
}

/* Return the KVM_CAP_IRQ_ROUTING extension value, which kvm_init() uses
 * as the number of GSIs; -EINVAL when routing is not compiled in. */
int kvm_get_gsi_count(kvm_context_t kvm)
{
#ifdef KVM_CAP_IRQ_ROUTING
    return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
#else
    return -EINVAL;
#endif
}

/* Empty the userspace copy of the routing table.  The allocation and the
 * used-GSI bitmap are left untouched. */
int kvm_clear_gsi_routes(kvm_context_t kvm)
{
#ifdef KVM_CAP_IRQ_ROUTING
    kvm->irq_routes->nr = 0;
    return 0;
#else
    return -EINVAL;
#endif
}

/*
 * Append a copy of entry to the userspace routing table, growing the
 * table (doubling, at least 64 entries) when it is full, and mark the
 * entry's GSI as used.  Returns 0, -ENOMEM when growing fails, or -ENOSYS
 * without routing support.
 */
int kvm_add_routing_entry(kvm_context_t kvm,
                          struct kvm_irq_routing_entry *entry)
{
#ifdef KVM_CAP_IRQ_ROUTING
    struct kvm_irq_routing *z;
    struct kvm_irq_routing_entry *new;
    int n, size;

    if (kvm->irq_routes->nr == kvm->nr_allocated_irq_routes) {
        n = kvm->nr_allocated_irq_routes * 2;
        if (n < 64)
            n = 64;
        size = sizeof(struct kvm_irq_routing);
        size += n * sizeof(*new);
        z = realloc(kvm->irq_routes, size);
        if (!z)
            return -ENOMEM;
kvm->nr_allocated_irq_routes = n;
        kvm->irq_routes = z;
    }
    n = kvm->irq_routes->nr++;
    new = &kvm->irq_routes->entries[n];
    memset(new, 0, sizeof(*new));
    new->gsi = entry->gsi;
    new->type = entry->type;
    new->flags = entry->flags;
    new->u = entry->u;

    set_gsi(kvm, entry->gsi);

    return 0;
#else
    return -ENOSYS;
#endif
}

/* Add a route from gsi to pin "pin" of irqchip "irqchip". */
int kvm_add_irq_route(kvm_context_t kvm, int gsi, int irqchip, int pin)
{
#ifdef KVM_CAP_IRQ_ROUTING
    struct kvm_irq_routing_entry e;

    e.gsi = gsi;
    e.type = KVM_IRQ_ROUTING_IRQCHIP;
    e.flags = 0;
    e.u.irqchip.irqchip = irqchip;
    e.u.irqchip.pin = pin;
    return kvm_add_routing_entry(kvm, &e);
#else
    return -ENOSYS;
#endif
}

/*
 * Remove the first routing entry that matches entry (same type and gsi,
 * and same irqchip/pin or MSI address/data).  The hole is filled with the
 * last table entry.  When no remaining entry uses the GSI it is released
 * in the bitmap.  Returns 0 on success, -ESRCH when nothing matched.
 */
int kvm_del_routing_entry(kvm_context_t kvm,
                          struct kvm_irq_routing_entry *entry)
{
#ifdef KVM_CAP_IRQ_ROUTING
    struct kvm_irq_routing_entry *e, *p;
    int i, gsi, found = 0;

    gsi = entry->gsi;

    for (i = 0; i < kvm->irq_routes->nr; ++i) {
        e = &kvm->irq_routes->entries[i];
        if (e->type == entry->type && e->gsi == gsi) {
            switch (e->type) {
            case KVM_IRQ_ROUTING_IRQCHIP:{
                    if (e->u.irqchip.irqchip ==
                        entry->u.irqchip.irqchip
                        && e->u.irqchip.pin == entry->u.irqchip.pin) {
                        p = &kvm->irq_routes->entries[--kvm->irq_routes->nr];
                        *e = *p;
                        found = 1;
                    }
                    break;
                }
            case KVM_IRQ_ROUTING_MSI:{
                    if (e->u.msi.address_lo ==
                        entry->u.msi.address_lo
                        && e->u.msi.address_hi ==
                        entry->u.msi.address_hi
                        && e->u.msi.data == entry->u.msi.data) {
                        p = &kvm->irq_routes->entries[--kvm->irq_routes->nr];
                        *e = *p;
                        found = 1;
                    }
                    break;
                }
            default:
                break;
            }
            if (found) {
                /* If there are no other users of this GSI
                 * mark it available in the bitmap.  Reusing i is safe
                 * because this branch always returns. */
                for (i = 0; i < kvm->irq_routes->nr; i++) {
                    e = &kvm->irq_routes->entries[i];
                    if (e->gsi == gsi)
                        break;
                }
                if (i == kvm->irq_routes->nr)
                    clear_gsi(kvm, gsi);

                return 0;
            }
        }
    }
    return -ESRCH;
#else
    return -ENOSYS;
#endif
}

/*
 * Replace the payload of the routing entry matching entry with the one
 * from newentry.  Both must share gsi and type, otherwise -EINVAL is
 * returned.  Returns 0 on success, -ESRCH when no entry matched.
 */
int kvm_update_routing_entry(kvm_context_t kvm,
                             struct kvm_irq_routing_entry *entry,
                             struct kvm_irq_routing_entry *newentry)
{
#ifdef KVM_CAP_IRQ_ROUTING
    struct kvm_irq_routing_entry *e;
    int i;

    if (entry->gsi != newentry->gsi ||
entry->type != newentry->type) {
        return -EINVAL;
    }

    for (i = 0; i < kvm->irq_routes->nr; ++i) {
        e = &kvm->irq_routes->entries[i];
        if (e->type != entry->type || e->gsi != entry->gsi) {
            continue;
        }
        switch (e->type) {
        case KVM_IRQ_ROUTING_IRQCHIP:
            if (e->u.irqchip.irqchip == entry->u.irqchip.irqchip &&
                e->u.irqchip.pin == entry->u.irqchip.pin) {
                memcpy(&e->u.irqchip, &newentry->u.irqchip,
                       sizeof e->u.irqchip);
                return 0;
            }
            break;
        case KVM_IRQ_ROUTING_MSI:
            if (e->u.msi.address_lo == entry->u.msi.address_lo &&
                e->u.msi.address_hi == entry->u.msi.address_hi &&
                e->u.msi.data == entry->u.msi.data) {
                memcpy(&e->u.msi, &newentry->u.msi, sizeof e->u.msi);
                return 0;
            }
            break;
        default:
            break;
        }
    }
    return -ESRCH;
#else
    return -ENOSYS;
#endif
}

/* Remove the route from gsi to pin "pin" of irqchip "irqchip". */
int kvm_del_irq_route(kvm_context_t kvm, int gsi, int irqchip, int pin)
{
#ifdef KVM_CAP_IRQ_ROUTING
    struct kvm_irq_routing_entry e;

    e.gsi = gsi;
    e.type = KVM_IRQ_ROUTING_IRQCHIP;
    e.flags = 0;
    e.u.irqchip.irqchip = irqchip;
    e.u.irqchip.pin = pin;
    return kvm_del_routing_entry(kvm, &e);
#else
    return -ENOSYS;
#endif
}

/* Push the userspace routing table to the kernel with
 * KVM_SET_GSI_ROUTING. */
int kvm_commit_irq_routes(kvm_context_t kvm)
{
#ifdef KVM_CAP_IRQ_ROUTING
    kvm->irq_routes->flags = 0;
    return kvm_vm_ioctl(kvm_state, KVM_SET_GSI_ROUTING, kvm->irq_routes);
#else
    return -ENOSYS;
#endif
}

/* Return the lowest GSI not marked in the used-GSI bitmap, or -ENOSPC
 * when every bit is set.  The GSI is not reserved by this call. */
int kvm_get_irq_route_gsi(kvm_context_t kvm)
{
    int i, bit;
    uint32_t *buf = kvm->used_gsi_bitmap;

    /* Return the lowest unused GSI in the bitmap */
    for (i = 0; i < kvm->max_gsi / 32; i++) {
        /* ffs() is 1-based and yields 0 when the word has no free bit. */
        bit = ffs(~buf[i]);
        if (!bit)
            continue;

        return bit - 1 + i * 32;
    }

    return -ENOSPC;
}

#ifdef KVM_CAP_DEVICE_MSIX
/* Tell the kernel how many MSI-X entries an assigned device uses. */
int kvm_assign_set_msix_nr(kvm_context_t kvm,
                           struct kvm_assigned_msix_nr *msix_nr)
{
    return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_SET_MSIX_NR, msix_nr);
}

/* Program one MSI-X entry of an assigned device. */
int kvm_assign_set_msix_entry(kvm_context_t kvm,
                              struct kvm_assigned_msix_entry *entry)
{
    return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_SET_MSIX_ENTRY, entry);
}
#endif

#if defined(KVM_CAP_IRQFD) && defined(CONFIG_EVENTFD)

/* NOTE(review): the header name of the following directive is missing in
 * the visible source; eventfd() is used below - restore the include. */
#include

/* Issue KVM_IRQFD for the given eventfd, gsi and flags. */
static int _kvm_irqfd(kvm_context_t kvm, int fd, int gsi, int
flags) { struct kvm_irqfd data = { .fd = fd, .gsi = gsi, .flags = flags, }; return kvm_vm_ioctl(kvm_state, KVM_IRQFD, &data); } int kvm_irqfd(kvm_context_t kvm, int gsi, int flags) { int r; int fd; if (!kvm_check_extension(kvm_state, KVM_CAP_IRQFD)) return -ENOENT; fd = eventfd(0, 0); if (fd < 0) return -errno; r = _kvm_irqfd(kvm, fd, gsi, 0); if (r < 0) { close(fd); return -errno; } return fd; } #else /* KVM_CAP_IRQFD */ int kvm_irqfd(kvm_context_t kvm, int gsi, int flags) { return -ENOSYS; } #endif /* KVM_CAP_IRQFD */ unsigned long kvm_get_thread_id(void) { return syscall(SYS_gettid); } static void qemu_cond_wait(pthread_cond_t *cond) { CPUState *env = cpu_single_env; pthread_cond_wait(cond, &qemu_mutex); cpu_single_env = env; } static void sig_ipi_handler(int n) { } static void hardware_memory_error(void) { fprintf(stderr, "Hardware memory error!\n"); exit(1); } static void sigbus_reraise(void) { sigset_t set; struct sigaction action; memset(&action, 0, sizeof(action)); action.sa_handler = SIG_DFL; if (!sigaction(SIGBUS, &action, NULL)) { raise(SIGBUS); sigemptyset(&set); sigaddset(&set, SIGBUS); sigprocmask(SIG_UNBLOCK, &set, NULL); } perror("Failed to re-raise SIGBUS!\n"); abort(); } static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo, void *ctx) { #if defined(KVM_CAP_MCE) && defined(TARGET_I386) if (first_cpu->mcg_cap && siginfo->ssi_addr && siginfo->ssi_code == BUS_MCEERR_AO) { uint64_t status; unsigned long paddr; CPUState *cenv; /* Hope we are lucky for AO MCE */ if (do_qemu_ram_addr_from_host((void *)(intptr_t)siginfo->ssi_addr, &paddr)) { fprintf(stderr, "Hardware memory error for memory used by " "QEMU itself instead of guest system!: %llx\n", (unsigned long long)siginfo->ssi_addr); return; } status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S | 0xc0; kvm_inject_x86_mce(first_cpu, 9, status, MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr, (MCM_ADDR_PHYS << 6) | 0xc, 1); for (cenv = 
first_cpu->next_cpu; cenv != NULL; cenv = cenv->next_cpu)
            kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC,
                               MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0, 1);
    } else
#endif
    {
        /* Action-optional errors are ignored, action-required errors are
         * fatal, any other SIGBUS is re-raised with the default action. */
        if (siginfo->ssi_code == BUS_MCEERR_AO)
            return;
        else if (siginfo->ssi_code == BUS_MCEERR_AR)
            hardware_memory_error();
        else
            sigbus_reraise();
    }
}

/*
 * Run func(data) in the context of the vcpu thread owning env.  When the
 * caller already is that thread the function is called directly.
 * Otherwise a work item living on the caller's stack is appended to the
 * vcpu's queue, the vcpu thread is kicked with SIG_IPI and the caller
 * waits on qemu_work_cond until the item is marked done.
 */
static void on_vcpu(CPUState *env, void (*func)(void *data), void *data)
{
    struct qemu_work_item wi;

    if (env == current_env) {
        func(data);
        return;
    }

    wi.func = func;
    wi.data = data;
    if (!env->kvm_cpu_state.queued_work_first)
        env->kvm_cpu_state.queued_work_first = &wi;
    else
        env->kvm_cpu_state.queued_work_last->next = &wi;
    env->kvm_cpu_state.queued_work_last = &wi;
    wi.next = NULL;
    wi.done = false;

    pthread_kill(env->kvm_cpu_state.thread, SIG_IPI);
    /* wi must stay valid until the vcpu thread has finished with it. */
    while (!wi.done)
        qemu_cond_wait(&qemu_work_cond);
}

/* on_vcpu() callback: fetch the registers from the kernel once and mark
 * the userspace copy as the one to write back. */
static void do_kvm_cpu_synchronize_state(void *_env)
{
    CPUState *env = _env;

    if (!env->kvm_vcpu_dirty) {
        kvm_arch_save_regs(env);
        env->kvm_vcpu_dirty = 1;
    }
}

/* Make sure env holds the current register state of its vcpu. */
void kvm_cpu_synchronize_state(CPUState *env)
{
    if (!env->kvm_vcpu_dirty)
        on_vcpu(env, do_kvm_cpu_synchronize_state, env);
}

/* Write the reset-level register state to the kernel and clear the dirty
 * flag. */
void kvm_cpu_synchronize_post_reset(CPUState *env)
{
    kvm_arch_load_regs(env, KVM_PUT_RESET_STATE);
    env->kvm_vcpu_dirty = 0;
}

/* Write the full register state to the kernel and clear the dirty flag. */
void kvm_cpu_synchronize_post_init(CPUState *env)
{
    kvm_arch_load_regs(env, KVM_PUT_FULL_STATE);
    env->kvm_vcpu_dirty = 0;
}

/* on_vcpu() callback: data carries the interrupt mask as an integer. */
static void inject_interrupt(void *data)
{
    cpu_interrupt(current_env, (long) data);
}

/* Raise the interrupt(s) in mask on env from env's own vcpu thread. */
void kvm_inject_interrupt(CPUState *env, int mask)
{
    on_vcpu(env, inject_interrupt, (void *) (long) mask);
}

/*
 * Kick env's vcpu thread with SIG_IPI when the caller is not a created
 * vcpu thread, or when it is a different vcpu and env has not been
 * signalled already.  Does nothing for a NULL env.
 */
void kvm_update_interrupt_request(CPUState *env)
{
    int signal = 0;

    if (env) {
        if (!current_env || !current_env->created)
            signal = 1;
        /*
         * Testing for created here is really redundant
         */
        if (current_env && current_env->created &&
            env != current_env && !env->kvm_cpu_state.signalled)
            signal = 1;

        if (signal) {
            env->kvm_cpu_state.signalled = 1;
            if (env->kvm_cpu_state.thread)
                pthread_kill(env->kvm_cpu_state.thread, SIG_IPI);
        }
    }
}

int
kvm_cpu_exec(CPUState *env) { int r; r = kvm_run(env); if (r < 0) { printf("kvm_run returned %d\n", r); vm_stop(0); } return 0; } int kvm_cpu_is_stopped(CPUState *env) { return !vm_running || env->stopped; } static void flush_queued_work(CPUState *env) { struct qemu_work_item *wi; if (!env->kvm_cpu_state.queued_work_first) return; while ((wi = env->kvm_cpu_state.queued_work_first)) { env->kvm_cpu_state.queued_work_first = wi->next; wi->func(wi->data); wi->done = true; } env->kvm_cpu_state.queued_work_last = NULL; pthread_cond_broadcast(&qemu_work_cond); } static void kvm_on_sigbus(CPUState *env, siginfo_t *siginfo) { #if defined(KVM_CAP_MCE) && defined(TARGET_I386) struct kvm_x86_mce mce = { .bank = 9, }; unsigned long paddr; int r; if (env->mcg_cap && siginfo->si_addr && (siginfo->si_code == BUS_MCEERR_AR || siginfo->si_code == BUS_MCEERR_AO)) { if (siginfo->si_code == BUS_MCEERR_AR) { /* Fake an Intel architectural Data Load SRAR UCR */ mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S | MCI_STATUS_AR | 0x134; mce.misc = (MCM_ADDR_PHYS << 6) | 0xc; mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV; } else { /* Fake an Intel architectural Memory scrubbing UCR */ mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S | 0xc0; mce.misc = (MCM_ADDR_PHYS << 6) | 0xc; mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV; } if (do_qemu_ram_addr_from_host((void *)siginfo->si_addr, &paddr)) { fprintf(stderr, "Hardware memory error for memory used by " "QEMU itself instaed of guest system!\n"); /* Hope we are lucky for AO MCE */ if (siginfo->si_code == BUS_MCEERR_AO) return; else hardware_memory_error(); } mce.addr = paddr; r = kvm_set_mce(env, &mce); if (r < 0) { fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno)); abort(); } } else #endif { if (siginfo->si_code == BUS_MCEERR_AO) return; else if (siginfo->si_code == BUS_MCEERR_AR) 
hardware_memory_error(); else sigbus_reraise(); } } static void kvm_main_loop_wait(CPUState *env, int timeout) { struct timespec ts; int r, e; siginfo_t siginfo; sigset_t waitset; sigset_t chkset; ts.tv_sec = timeout / 1000; ts.tv_nsec = (timeout % 1000) * 1000000; sigemptyset(&waitset); sigaddset(&waitset, SIG_IPI); sigaddset(&waitset, SIGBUS); do { pthread_mutex_unlock(&qemu_mutex); r = sigtimedwait(&waitset, &siginfo, &ts); e = errno; pthread_mutex_lock(&qemu_mutex); if (r == -1 && !(e == EAGAIN || e == EINTR)) { printf("sigtimedwait: %s\n", strerror(e)); exit(1); } switch (r) { case SIGBUS: kvm_on_sigbus(env, &siginfo); break; default: break; } r = sigpending(&chkset); if (r == -1) { printf("sigpending: %s\n", strerror(e)); exit(1); } } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS)); cpu_single_env = env; flush_queued_work(env); if (env->stop) { env->stop = 0; env->stopped = 1; pthread_cond_signal(&qemu_pause_cond); } env->kvm_cpu_state.signalled = 0; } static int all_threads_paused(void) { CPUState *penv = first_cpu; while (penv) { if (penv->stop) return 0; penv = (CPUState *) penv->next_cpu; } return 1; } static void pause_all_threads(void) { CPUState *penv = first_cpu; while (penv) { if (penv != cpu_single_env) { penv->stop = 1; pthread_kill(penv->kvm_cpu_state.thread, SIG_IPI); } else { penv->stop = 0; penv->stopped = 1; cpu_exit(penv); } penv = (CPUState *) penv->next_cpu; } while (!all_threads_paused()) qemu_cond_wait(&qemu_pause_cond); } static void resume_all_threads(void) { CPUState *penv = first_cpu; assert(!cpu_single_env); while (penv) { penv->stop = 0; penv->stopped = 0; pthread_kill(penv->kvm_cpu_state.thread, SIG_IPI); penv = (CPUState *) penv->next_cpu; } } static void kvm_vm_state_change_handler(void *context, int running, int reason) { if (running) resume_all_threads(); else pause_all_threads(); } static void setup_kernel_sigmask(CPUState *env) { sigset_t set; sigemptyset(&set); sigaddset(&set, SIGUSR2); sigaddset(&set, 
SIGIO); sigaddset(&set, SIGALRM); sigprocmask(SIG_BLOCK, &set, NULL); sigprocmask(SIG_BLOCK, NULL, &set); sigdelset(&set, SIG_IPI); sigdelset(&set, SIGBUS); kvm_set_signal_mask(env, &set); } static void qemu_kvm_system_reset(void) { CPUState *penv = first_cpu; pause_all_threads(); qemu_system_reset(); while (penv) { kvm_arch_cpu_reset(penv); penv = (CPUState *) penv->next_cpu; } resume_all_threads(); } static void process_irqchip_events(CPUState *env) { kvm_arch_process_irqchip_events(env); if (kvm_arch_has_work(env)) env->halted = 0; } static int kvm_main_loop_cpu(CPUState *env) { while (1) { int run_cpu = !kvm_cpu_is_stopped(env); if (run_cpu && !kvm_irqchip_in_kernel()) { process_irqchip_events(env); run_cpu = !env->halted; } if (run_cpu) { kvm_cpu_exec(env); kvm_main_loop_wait(env, 0); } else { kvm_main_loop_wait(env, 1000); } } pthread_mutex_unlock(&qemu_mutex); return 0; } static void *ap_main_loop(void *_env) { CPUState *env = _env; sigset_t signals; #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT struct ioperm_data *data = NULL; #endif current_env = env; env->thread_id = kvm_get_thread_id(); sigfillset(&signals); sigprocmask(SIG_BLOCK, &signals, NULL); kvm_create_vcpu(env, env->cpu_index); #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT /* do ioperm for io ports of assigned devices */ QLIST_FOREACH(data, &ioperm_head, entries) on_vcpu(env, kvm_arch_do_ioperm, data); #endif setup_kernel_sigmask(env); pthread_mutex_lock(&qemu_mutex); cpu_single_env = env; kvm_arch_init_vcpu(env); /* signal VCPU creation */ current_env->created = 1; pthread_cond_signal(&qemu_vcpu_cond); /* and wait for machine initialization */ while (!qemu_system_ready) qemu_cond_wait(&qemu_system_cond); /* re-initialize cpu_single_env after re-acquiring qemu_mutex */ cpu_single_env = env; kvm_main_loop_cpu(env); return NULL; } int kvm_init_vcpu(CPUState *env) { pthread_create(&env->kvm_cpu_state.thread, NULL, ap_main_loop, env); while (env->created == 0) qemu_cond_wait(&qemu_vcpu_cond); return 0; } int 
kvm_vcpu_inited(CPUState *env) { return env->created; } #ifdef TARGET_I386 void kvm_hpet_disable_kpit(void) { struct kvm_pit_state2 ps2; kvm_get_pit2(kvm_context, &ps2); ps2.flags |= KVM_PIT_FLAGS_HPET_LEGACY; kvm_set_pit2(kvm_context, &ps2); } void kvm_hpet_enable_kpit(void) { struct kvm_pit_state2 ps2; kvm_get_pit2(kvm_context, &ps2); ps2.flags &= ~KVM_PIT_FLAGS_HPET_LEGACY; kvm_set_pit2(kvm_context, &ps2); } #endif int kvm_init_ap(void) { struct sigaction action; qemu_add_vm_change_state_handler(kvm_vm_state_change_handler, NULL); signal(SIG_IPI, sig_ipi_handler); memset(&action, 0, sizeof(action)); action.sa_flags = SA_SIGINFO; action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler; sigaction(SIGBUS, &action, NULL); prctl(PR_MCE_KILL, 1, 1, 0, 0); return 0; } void qemu_kvm_notify_work(void) { /* Write 8 bytes to be compatible with eventfd. */ static uint64_t val = 1; ssize_t ret; if (io_thread_fd == -1) return; do { ret = write(io_thread_fd, &val, sizeof(val)); } while (ret < 0 && errno == EINTR); /* EAGAIN is fine in case we have a pipe. */ if (ret < 0 && errno != EAGAIN) { fprintf(stderr, "qemu_kvm_notify_work: write() filed: %s\n", strerror(errno)); exit (1); } } /* If we have signalfd, we mask out the signals we want to handle and then * use signalfd to listen for them. We rely on whatever the current signal * handler is to dispatch the signals when we receive them. 
*/ static void sigfd_handler(void *opaque) { int fd = (unsigned long) opaque; struct qemu_signalfd_siginfo info; struct sigaction action; ssize_t len; while (1) { do { len = read(fd, &info, sizeof(info)); } while (len == -1 && errno == EINTR); if (len == -1 && errno == EAGAIN) break; if (len != sizeof(info)) { printf("read from sigfd returned %zd: %m\n", len); return; } sigaction(info.ssi_signo, NULL, &action); if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction) action.sa_sigaction(info.ssi_signo, (siginfo_t *)&info, NULL); else if (action.sa_handler) action.sa_handler(info.ssi_signo); } } /* Used to break IO thread out of select */ static void io_thread_wakeup(void *opaque) { int fd = (unsigned long) opaque; ssize_t len; char buffer[512]; /* Drain the notify pipe. For eventfd, only 8 bytes will be read. */ do { len = read(fd, buffer, sizeof(buffer)); } while ((len == -1 && errno == EINTR) || len == sizeof(buffer)); } int kvm_main_loop(void) { int fds[2]; sigset_t mask; int sigfd; io_thread = pthread_self(); qemu_system_ready = 1; if (qemu_eventfd(fds) == -1) { fprintf(stderr, "failed to create eventfd\n"); return -errno; } fcntl(fds[0], F_SETFL, O_NONBLOCK); fcntl(fds[1], F_SETFL, O_NONBLOCK); qemu_set_fd_handler2(fds[0], NULL, io_thread_wakeup, NULL, (void *)(unsigned long) fds[0]); io_thread_fd = fds[1]; sigemptyset(&mask); sigaddset(&mask, SIGIO); sigaddset(&mask, SIGALRM); sigaddset(&mask, SIGBUS); sigprocmask(SIG_BLOCK, &mask, NULL); sigfd = qemu_signalfd(&mask); if (sigfd == -1) { fprintf(stderr, "failed to create signalfd\n"); return -errno; } fcntl(sigfd, F_SETFL, O_NONBLOCK); qemu_set_fd_handler2(sigfd, NULL, sigfd_handler, NULL, (void *)(unsigned long) sigfd); pthread_cond_broadcast(&qemu_system_cond); io_thread_sigfd = sigfd; cpu_single_env = NULL; while (1) { main_loop_wait(1000); if (qemu_shutdown_requested()) { if (qemu_no_shutdown()) { vm_stop(0); } else break; } else if (qemu_powerdown_requested()) { monitor_protocol_event(QEVENT_POWERDOWN, 
NULL);
            qemu_irq_raise(qemu_system_powerdown);
        } else if (qemu_reset_requested()) {
            monitor_protocol_event(QEVENT_RESET, NULL);
            qemu_kvm_system_reset();
        } else if (kvm_debug_cpu_requested) {
            /* A vcpu hit a debug exit: hand it to the gdb stub and stop */
            monitor_protocol_event(QEVENT_DEBUG, NULL);
            gdb_set_stop_cpu(kvm_debug_cpu_requested);
            vm_stop(EXCP_DEBUG);
            kvm_debug_cpu_requested = NULL;
        }
    }

    /* Shutdown requested: park the vcpus and drop the global mutex */
    pause_all_threads();
    pthread_mutex_unlock(&qemu_mutex);

    return 0;
}

#ifdef TARGET_I386
/* Result of kvm_destroy_memory_region_works(); set in kvm_create_context() */
static int destroy_region_works = 0;
#endif


#if !defined(TARGET_I386)
/* Non-x86 targets have no irq routing to set up here. */
int kvm_arch_init_irq_routing(void)
{
    return 0;
}
#endif

extern int no_hpet;

/*
 * Create the KVM VM context and perform the arch-specific and irq related
 * initialisation.
 *
 * Returns 0 on success, -1 when VM creation, arch context creation or
 * disabling PIT reinjection fails, or the negative result of
 * kvm_arch_init_irq_routing().
 *
 * NOTE(review): kvm_finalize() is called on the first two failure paths
 * only -- confirm whether the later ones should release the state as well.
 */
static int kvm_create_context(void)
{
    int r;

    if (!kvm_irqchip) {
        kvm_disable_irqchip_creation(kvm_context);
    }
    if (!kvm_pit) {
        kvm_disable_pit_creation(kvm_context);
    }
    if (kvm_create(kvm_context, 0, NULL) < 0) {
        kvm_finalize(kvm_state);
        return -1;
    }
    r = kvm_arch_qemu_create_context();
    if (r < 0) {
        kvm_finalize(kvm_state);
        return -1;
    }
    if (kvm_pit && !kvm_pit_reinject) {
        if (kvm_reinject_control(kvm_context, 0)) {
            fprintf(stderr, "failure to disable in-kernel PIT reinjection\n");
            return -1;
        }
    }
#ifdef TARGET_I386
    destroy_region_works = kvm_destroy_memory_region_works(kvm_context);
#endif

    r = kvm_arch_init_irq_routing();
    if (r < 0) {
        return r;
    }

    /* Default to "no vcpu events" unless the capability is known and present */
    kvm_state->vcpu_events = 0;
#ifdef KVM_CAP_VCPU_EVENTS
    kvm_state->vcpu_events = kvm_check_extension(kvm_state, KVM_CAP_VCPU_EVENTS);
#endif

    kvm_init_ap();
    if (kvm_irqchip) {
        if (!qemu_kvm_has_gsi_routing()) {
            irq0override = 0;
#ifdef TARGET_I386
            /* if kernel can't do irq routing, interrupt source
             * override 0->2 can not be set up as required by hpet,
             * so disable hpet.
*/
            no_hpet = 1;
        } else if (!qemu_kvm_has_pit_state2()) {
            /* hpet is also disabled when PIT state2 is unavailable */
            no_hpet = 1;
        }
#else
        }
#endif
    }

    return 0;
}

#ifdef TARGET_I386
/*
 * Whether @addr (0xa0000 or 0xa8000) must be handled through a memory alias.
 * Never the case when destroying memory regions works.
 */
static int must_use_aliases_source(target_phys_addr_t addr)
{
    if (destroy_region_works)
        return false;
    if (addr == 0xa0000 || addr == 0xa8000)
        return true;
    return false;
}

/*
 * Whether @addr lies in [0xe0000000, 0x100000000), the range treated as an
 * alias target.  Never the case when destroying memory regions works.
 */
static int must_use_aliases_target(target_phys_addr_t addr)
{
    if (destroy_region_works)
        return false;
    if (addr >= 0xe0000000 && addr < 0x100000000ull)
        return true;
    return false;
}

/*
 * Table of guest-physical ranges and the ram offsets backing them.  Holds
 * at most 50 entries; the first nr_mappings are in use, in no particular
 * order (see drop_mapping()).
 */
static struct mapping {
    target_phys_addr_t phys;    /* guest-physical start address */
    ram_addr_t ram;             /* ram offset backing the range */
    ram_addr_t len;             /* length of the range */
} mappings[50];
static int nr_mappings;

/* Find the mapping whose ram range contains @ram_addr, or NULL. */
static struct mapping *find_ram_mapping(ram_addr_t ram_addr)
{
    struct mapping *p;

    for (p = mappings; p < mappings + nr_mappings; ++p) {
        if (p->ram <= ram_addr && ram_addr < p->ram + p->len) {
            return p;
        }
    }
    return NULL;
}

/* Find the mapping whose guest-physical range contains @start_addr, or NULL. */
static struct mapping *find_mapping(target_phys_addr_t start_addr)
{
    struct mapping *p;

    for (p = mappings; p < mappings + nr_mappings; ++p) {
        if (p->phys <= start_addr && start_addr < p->phys + p->len) {
            return p;
        }
    }
    return NULL;
}

/*
 * Remove the mapping containing @start_addr, if any, by overwriting it with
 * the last entry of the table.
 */
static void drop_mapping(target_phys_addr_t start_addr)
{
    struct mapping *p = find_mapping(start_addr);

    if (p)
        *p = mappings[--nr_mappings];
}
#endif

/*
 * Tell KVM about a change to the guest-physical range
 * [start_addr, start_addr + size) described by @phys_offset.
 *
 * Non-RAM ranges cause overlapping registered areas to be unregistered (or
 * the alias destroyed for alias sources).  RAM ranges are handled after
 * this first part of the function.
 */
void kvm_set_phys_mem(target_phys_addr_t start_addr, ram_addr_t size,
                      ram_addr_t phys_offset)
{
    int r = 0;
    unsigned long area_flags;
#ifdef TARGET_I386
    struct mapping *p;
#endif

    /* Track the highest guest-physical address seen so far */
    if (start_addr + size > phys_ram_size) {
        phys_ram_size = start_addr + size;
    }

    phys_offset &= ~IO_MEM_ROM;
    area_flags = phys_offset & ~TARGET_PAGE_MASK;

    if (area_flags != IO_MEM_RAM) {
#ifdef TARGET_I386
        if (must_use_aliases_source(start_addr)) {
            kvm_destroy_memory_alias(kvm_context, start_addr);
            return;
        }
        if (must_use_aliases_target(start_addr))
            return;
#endif
        /*
         * Walk the range one target page at a time and unregister every
         * recorded mapping that covers a visited page.
         * NOTE(review): p and find_mapping() are declared only under
         * TARGET_I386, yet this loop sits outside that guard.
         */
        while (size > 0) {
            p = find_mapping(start_addr);
            if (p) {
                kvm_unregister_memory_area(kvm_context, p->phys, p->len);
                drop_mapping(p->phys);
            }
            start_addr += TARGET_PAGE_SIZE;
            /* Clamp instead of letting the unsigned size wrap around */
            if (size > TARGET_PAGE_SIZE) {
                size -= TARGET_PAGE_SIZE;
            } else {
                size = 0;
            }
        }
        return;
    }
r = kvm_is_containing_region(kvm_context, start_addr, size); if (r) return; if (area_flags >= TLB_MMIO) return; #ifdef TARGET_I386 if (must_use_aliases_source(start_addr)) { p = find_ram_mapping(phys_offset); if (p) { kvm_create_memory_alias(kvm_context, start_addr, size, p->phys + (phys_offset - p->ram)); } return; } #endif r = kvm_register_phys_mem(kvm_context, start_addr, qemu_get_ram_ptr(phys_offset), size, 0); if (r < 0) { printf("kvm_cpu_register_physical_memory: failed\n"); exit(1); } #ifdef TARGET_I386 drop_mapping(start_addr); p = &mappings[nr_mappings++]; p->phys = start_addr; p->ram = phys_offset; p->len = size; #endif return; } /* * dirty pages logging */ /* FIXME: use unsigned long pointer instead of unsigned char */ unsigned char *kvm_dirty_bitmap = NULL; int kvm_physical_memory_set_dirty_tracking(int enable) { int r = 0; if (!kvm_enabled()) return 0; if (enable) { if (!kvm_dirty_bitmap) { unsigned bitmap_size = BITMAP_SIZE(phys_ram_size); kvm_dirty_bitmap = qemu_malloc(bitmap_size); r = kvm_dirty_pages_log_enable_all(kvm_context); } } else { if (kvm_dirty_bitmap) { r = kvm_dirty_pages_log_reset(kvm_context); qemu_free(kvm_dirty_bitmap); kvm_dirty_bitmap = NULL; } } return r; } /* get kvm's dirty pages bitmap and update qemu's */ static int kvm_get_dirty_pages_log_range(unsigned long start_addr, unsigned long *bitmap, unsigned long offset, unsigned long mem_size) { unsigned int i, j; unsigned long page_number, addr, addr1, c; ram_addr_t ram_addr; unsigned int len = ((mem_size / TARGET_PAGE_SIZE) + HOST_LONG_BITS - 1) / HOST_LONG_BITS; /* * bitmap-traveling is faster than memory-traveling (for addr...) * especially when most of the memory is not dirty. 
*/
    for (i = 0; i < len; i++) {
        if (bitmap[i] != 0) {
            c = leul_to_cpu(bitmap[i]);
            do {
                /* ffsl() is 1-based: j is the index of the lowest set bit */
                j = ffsl(c) - 1;
                c &= ~(1ul << j);
                page_number = i * HOST_LONG_BITS + j;
                addr1 = page_number * TARGET_PAGE_SIZE;
                addr = offset + addr1;
                ram_addr = cpu_get_physical_page_desc(addr);
                cpu_physical_memory_set_dirty(ram_addr);
            } while (c != 0);
        }
    }
    return 0;
}

/*
 * Callback for kvm_get_dirty_pages_range(): the slot's start address is
 * used both as start_addr and as the offset added to each page address.
 */
static int kvm_get_dirty_bitmap_cb(unsigned long start, unsigned long len,
                                   void *bitmap, void *opaque)
{
    return kvm_get_dirty_pages_log_range(start, bitmap, start, len);
}

/* 
 * get kvm's dirty pages bitmap and update qemu's
 * we only care about physical ram, which resides in slots 0 and 3
 *
 * The query itself covers the whole range starting at 0 with length -1UL.
 */
int kvm_update_dirty_pages_log(void)
{
    int r = 0;


    r = kvm_get_dirty_pages_range(kvm_context, 0, -1UL, NULL,
                                  kvm_get_dirty_bitmap_cb);
    return r;
}

/*
 * Enable (@log non-zero) or disable dirty logging for the slot at
 * [start, start + size).  On x86, disabling is skipped for alias targets.
 */
void kvm_qemu_log_memory(target_phys_addr_t start, target_phys_addr_t size,
                         int log)
{
    if (log)
        kvm_dirty_pages_log_enable_slot(kvm_context, start, size);
    else {
#ifdef TARGET_I386
        if (must_use_aliases_target(start))
            return;
#endif
        kvm_dirty_pages_log_disable_slot(kvm_context, start, size);
    }
}

#ifdef KVM_CAP_IRQCHIP

/* Forward to kvm_set_irq_level() using the global kvm_context. */
int kvm_set_irq(int irq, int level, int *status)
{
    return kvm_set_irq_level(kvm_context, irq, level, status);
}

#endif

/* Forward to kvm_get_dirty_pages() using the global kvm_context. */
int qemu_kvm_get_dirty_pages(unsigned long phys_addr, void *buf)
{
    return kvm_get_dirty_pages(kvm_context, phys_addr, buf);
}

/* Release qemu_mutex; asserts that cpu_single_env is NULL. */
void kvm_mutex_unlock(void)
{
    assert(!cpu_single_env);
    pthread_mutex_unlock(&qemu_mutex);
}

/* Take qemu_mutex and clear cpu_single_env. */
void kvm_mutex_lock(void)
{
    pthread_mutex_lock(&qemu_mutex);
    cpu_single_env = NULL;
}

/* Release the global mutex; a no-op when KVM is not enabled. */
void qemu_mutex_unlock_iothread(void)
{
    if (kvm_enabled())
        kvm_mutex_unlock();
}

/* Take the global mutex; a no-op when KVM is not enabled. */
void qemu_mutex_lock_iothread(void)
{
    if (kvm_enabled())
        kvm_mutex_lock();
}

#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
/* Add @data at the head of the ioperm list. */
void kvm_add_ioperm_data(struct ioperm_data *data)
{
    QLIST_INSERT_HEAD(&ioperm_head, data, entries);
}

/*
 * Remove and free every ioperm list entry matching both @start_port and
 * @num.
 */
void kvm_remove_ioperm_data(unsigned long start_port, unsigned long num)
{
    struct ioperm_data *data;

    data = QLIST_FIRST(&ioperm_head);
    while (data) {
        /* Fetch the successor first: data may be freed below */
        struct ioperm_data *next = 
QLIST_NEXT(data, entries);

        if (data->start_port == start_port && data->num == num) {
            QLIST_REMOVE(data, entries);
            qemu_free(data);
        }

        data = next;
    }
}

/*
 * Apply ioperm settings on the vcpu thread of @env.  Does nothing unless
 * KVM is enabled and the system is ready.
 */
void kvm_ioperm(CPUState *env, void *data)
{
    if (kvm_enabled() && qemu_system_ready)
        on_vcpu(env, kvm_arch_do_ioperm, data);
}

#endif

/*
 * Fold KVM's dirty bitmap for [start_addr, end_addr) into qemu's.
 * Compiled out on IA64 and skipped for alias sources on x86.  Always
 * returns 0; the result of kvm_get_dirty_pages_range() is not propagated.
 */
int kvm_physical_sync_dirty_bitmap(target_phys_addr_t start_addr,
                                   target_phys_addr_t end_addr)
{
#ifndef TARGET_IA64

#ifdef TARGET_I386
    if (must_use_aliases_source(start_addr))
        return 0;
#endif

    kvm_get_dirty_pages_range(kvm_context, start_addr,
                              end_addr - start_addr, NULL,
                              kvm_get_dirty_bitmap_cb);
#endif
    return 0;
}

/*
 * Start dirty logging for [phys_addr, phys_addr + len).  Skipped for alias
 * sources on x86 and compiled out on IA64.  Always returns 0.
 */
int kvm_log_start(target_phys_addr_t phys_addr, ram_addr_t len)
{
#ifdef TARGET_I386
    if (must_use_aliases_source(phys_addr))
        return 0;
#endif

#ifndef TARGET_IA64
    kvm_qemu_log_memory(phys_addr, len, 1);
#endif
    return 0;
}

/*
 * Stop dirty logging for [phys_addr, phys_addr + len).  Skipped for alias
 * sources on x86 and compiled out on IA64.  Always returns 0.
 */
int kvm_log_stop(target_phys_addr_t phys_addr, ram_addr_t len)
{
#ifdef TARGET_I386
    if (must_use_aliases_source(phys_addr))
        return 0;
#endif

#ifndef TARGET_IA64
    kvm_qemu_log_memory(phys_addr, len, 0);
#endif
    return 0;
}

/* Forward to kvm_set_boot_vcpu_id() using the global kvm_context. */
int kvm_set_boot_cpu_id(uint32_t id)
{
    return kvm_set_boot_vcpu_id(kvm_context, id);
}

#ifdef TARGET_I386

#ifdef KVM_CAP_MCE
/* Argument bundle handed to kvm_do_inject_x86_mce() through on_vcpu(). */
struct kvm_x86_mce_data
{
    CPUState *env;              /* vcpu receiving the event */
    struct kvm_x86_mce *mce;    /* event to set */
    int abort_on_error;         /* abort() if setting the event fails */
};

/*
 * Work item body: set the machine-check event on the target vcpu.  A
 * failure is reported with perror() and is fatal only when abort_on_error
 * is set.
 */
static void kvm_do_inject_x86_mce(void *_data)
{
    struct kvm_x86_mce_data *data = _data;
    int r;

    r = kvm_set_mce(data->env, data->mce);
    if (r < 0) {
        perror("kvm_set_mce FAILED");
        if (data->abort_on_error)
            abort();
    }
}
#endif

/*
 * Inject a machine-check event into @cenv, running on its vcpu thread.
 *
 * If the vcpu has no MCE capability (mcg_cap is zero) a message is printed
 * and nothing is injected, regardless of @abort_on_error.  Without
 * KVM_CAP_MCE at build time the function only aborts when @abort_on_error
 * is set.
 */
void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
                        uint64_t mcg_status, uint64_t addr, uint64_t misc,
                        int abort_on_error)
{
#ifdef KVM_CAP_MCE
    struct kvm_x86_mce mce = {
        .bank = bank,
        .status = status,
        .mcg_status = mcg_status,
        .addr = addr,
        .misc = misc,
    };
    /* Both structs live on this stack; on_vcpu() returns only when done */
    struct kvm_x86_mce_data data = {
        .env = cenv,
        .mce = &mce,
        .abort_on_error = abort_on_error,
    };

    if (!cenv->mcg_cap) {
        fprintf(stderr, "MCE support is not enabled!\n");
        return;
    }
    on_vcpu(cenv, kvm_do_inject_x86_mce, &data);
#else
    if (abort_on_error)
        abort();
#endif
}
#endif