diff options
author | Marcelo Tosatti <mtosatti@redhat.com> | 2010-08-26 20:55:36 -0300 |
---|---|---|
committer | Marcelo Tosatti <mtosatti@redhat.com> | 2010-08-26 20:55:36 -0300 |
commit | 4813f440385bfc751d1c91752709b4457ed803e2 (patch) | |
tree | a4f2897a48ee88a845c20f7fd2fa2910cd8f6f12 | |
parent | ee67c3f4534efd89a03d874c5d91b125c7b4ed51 (diff) | |
parent | 6977dfe6af975d72a8140dbc91effe8b8f2a58f8 (diff) |
Merge commit '6977dfe6af975d72a8140dbc91effe8b8f2a58f8' into upstream-merge
* commit '6977dfe6af975d72a8140dbc91effe8b8f2a58f8': (24 commits)
exec: remove code duplication in qemu_ram_alloc() and qemu_ram_alloc_from_ptr()
exec: replace tabs by spaces.
arch_init: replace tabs by spaces.
pckbd: support for commands 0xf0-0xff: Pulse output bit
Replace qemu_malloc + memset with qemu_mallocz
Use ARRAY_SIZE macro
Remove useless NULL check for qemu_strdup return value
Remove useless NULL checks for qemu_malloc return value
savevm: Reset last block info at beginning of each save
set proper migration status on ->write error (v5)
rtc: Remove TARGET_I386 from qemu-config.c, enables driftfix
QEMUFileBuffered: indicate that we're ready when the underlying file is ready
sparc escc IUS improvements (SunOS 4.1.4 fix)
Fix mingw32 build
win32: Avoid compiler warning (WIN32_LEAN_AND_MEAN redefined)
win32: Add missing function setenv
Disable build of ivshmem on non-KVM systems
Add kvm_set_ioeventfd_mmio_long definition for non-KVM systems
TCG: Revert ppc64 tcg_out_movi32 change
TCG: Fix Darwin/ppc calling convention recognition
...
Conflicts:
exec.c
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
-rw-r--r-- | Makefile.target | 3 | ||||
-rw-r--r-- | arch_init.c | 19 | ||||
-rw-r--r-- | buffered_file.c | 12 | ||||
-rw-r--r-- | docs/specs/ivshmem_device_spec.txt | 96 | ||||
-rw-r--r-- | exec.c | 140 | ||||
-rw-r--r-- | hw/escc.c | 56 | ||||
-rw-r--r-- | hw/hw.h | 2 | ||||
-rw-r--r-- | hw/ivshmem.c | 828 | ||||
-rw-r--r-- | hw/pckbd.c | 23 | ||||
-rw-r--r-- | hw/sh_intc.c | 3 | ||||
-rw-r--r-- | hw/virtio-9p-local.c | 3 | ||||
-rw-r--r-- | hw/virtio-9p.c | 3 | ||||
-rw-r--r-- | kvm-all.c | 32 | ||||
-rw-r--r-- | kvm-stub.c | 5 | ||||
-rw-r--r-- | kvm.h | 1 | ||||
-rw-r--r-- | migration.c | 8 | ||||
-rw-r--r-- | os-posix.c | 2 | ||||
-rw-r--r-- | os-win32.c | 15 | ||||
-rw-r--r-- | osdep.h | 2 | ||||
-rw-r--r-- | qemu-char.c | 7 | ||||
-rw-r--r-- | qemu-char.h | 3 | ||||
-rw-r--r-- | qemu-config.c | 2 | ||||
-rw-r--r-- | qemu-doc.texi | 43 | ||||
-rw-r--r-- | savevm.c | 44 | ||||
-rw-r--r-- | target-i386/cpuid.c | 2 | ||||
-rw-r--r-- | tcg/ppc/tcg-target.h | 2 | ||||
-rw-r--r-- | tcg/ppc64/tcg-target.c | 2 | ||||
-rw-r--r-- | ui/sdl.c | 4 | ||||
-rw-r--r-- | vl.c | 12 |
29 files changed, 1221 insertions, 153 deletions
diff --git a/Makefile.target b/Makefile.target index 9e13d99cb..9643f88ec 100644 --- a/Makefile.target +++ b/Makefile.target @@ -205,6 +205,9 @@ obj-$(CONFIG_USB_OHCI) += usb-ohci.o obj-y += rtl8139.o obj-y += e1000.o +# Inter-VM PCI shared memory +obj-$(CONFIG_KVM) += ivshmem.o + # Hardware support obj-i386-y += vga.o obj-i386-y += mc146818rtc.o i8259.o pc.o diff --git a/arch_init.c b/arch_init.c index 47bb4b2d8..e468c0c7e 100644 --- a/arch_init.c +++ b/arch_init.c @@ -82,12 +82,12 @@ const uint32_t arch_type = QEMU_ARCH; /***********************************************************/ /* ram save/restore */ -#define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */ -#define RAM_SAVE_FLAG_COMPRESS 0x02 -#define RAM_SAVE_FLAG_MEM_SIZE 0x04 -#define RAM_SAVE_FLAG_PAGE 0x08 -#define RAM_SAVE_FLAG_EOS 0x10 -#define RAM_SAVE_FLAG_CONTINUE 0x20 +#define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */ +#define RAM_SAVE_FLAG_COMPRESS 0x02 +#define RAM_SAVE_FLAG_MEM_SIZE 0x04 +#define RAM_SAVE_FLAG_PAGE 0x08 +#define RAM_SAVE_FLAG_EOS 0x10 +#define RAM_SAVE_FLAG_CONTINUE 0x20 static int is_dup_page(uint8_t *page, uint8_t ch) { @@ -104,10 +104,11 @@ static int is_dup_page(uint8_t *page, uint8_t ch) return 1; } +static RAMBlock *last_block; +static ram_addr_t last_offset; + static int ram_save_block(QEMUFile *f) { - static RAMBlock *last_block = NULL; - static ram_addr_t last_offset = 0; RAMBlock *block = last_block; ram_addr_t offset = last_offset; ram_addr_t current_addr; @@ -231,6 +232,8 @@ int ram_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque) if (stage == 1) { RAMBlock *block; bytes_transferred = 0; + last_block = NULL; + last_offset = 0; /* Make sure all dirty bits are set */ QLIST_FOREACH(block, &ram_list.blocks, next) { diff --git a/buffered_file.c b/buffered_file.c index 54dc6c29b..1836e7e24 100644 --- a/buffered_file.c +++ b/buffered_file.c @@ -156,6 +156,14 @@ static int buffered_put_buffer(void *opaque, const uint8_t *buf, int64_t pos, in offset = size; } + if (pos == 0 && size == 0) { + DPRINTF("file is ready\n"); + if (s->bytes_xfer <= s->xfer_limit) { + DPRINTF("notifying client\n"); + s->put_ready(s->opaque); + } + } + return offset; } @@ -222,8 +230,10 @@ static void buffered_rate_tick(void *opaque) { QEMUFileBuffered *s = opaque; - if (s->has_error) + if (s->has_error) { + buffered_close(s); return; + } qemu_mod_timer(s->timer, qemu_get_clock(rt_clock) + 100); diff --git a/docs/specs/ivshmem_device_spec.txt b/docs/specs/ivshmem_device_spec.txt new file mode 100644 index 000000000..23dd2ba89 --- /dev/null +++ b/docs/specs/ivshmem_device_spec.txt @@ -0,0 +1,96 @@ + +Device Specification for Inter-VM shared memory device +------------------------------------------------------ + +The Inter-VM shared memory device is designed to share a region of memory to +userspace in multiple virtual guests. The memory region does not belong to any +guest, but is a POSIX memory object on the host. Optionally, the device may +support sending interrupts to other guests sharing the same memory region. + + +The Inter-VM PCI device +----------------------- + +*BARs* + +The device supports three BARs. BAR0 is a 1 Kbyte MMIO region to support +registers. BAR1 is used for MSI-X when it is enabled in the device. BAR2 is +used to map the shared memory object from the host. The size of BAR2 is +specified when the guest is started and must be a power of 2 in size. + +*Registers* + +The device currently supports 4 registers of 32-bits each. Registers +are used for synchronization between guests sharing the same memory object when +interrupts are supported (this requires using the shared memory server). + +The server assigns each VM an ID number and sends this ID number to the Qemu +process when the guest starts. + +enum ivshmem_registers { + IntrMask = 0, + IntrStatus = 4, + IVPosition = 8, + Doorbell = 12 +}; + +The first two registers are the interrupt mask and status registers. Mask and +status are only used with pin-based interrupts. They are unused with MSI +interrupts. + +Status Register: The status register is set to 1 when an interrupt occurs. + +Mask Register: The mask register is bitwise ANDed with the interrupt status +and the result will raise an interrupt if it is non-zero. However, since 1 is +the only value the status will be set to, it is only the first bit of the mask +that has any effect. Therefore interrupts can be masked by setting the first +bit to 0 and unmasked by setting the first bit to 1. + +IVPosition Register: The IVPosition register is read-only and reports the +guest's ID number. The guest IDs are non-negative integers. When using the +server, since the server is a separate process, the VM ID will only be set when +the device is ready (shared memory is received from the server and accessible via +the device). If the device is not ready, the IVPosition will return -1. +Applications should ensure that they have a valid VM ID before accessing the +shared memory. + +Doorbell Register: To interrupt another guest, a guest must write to the +Doorbell register. The doorbell register is 32-bits, logically divided into +two 16-bit fields. The high 16-bits are the guest ID to interrupt and the low +16-bits are the interrupt vector to trigger. The semantics of the value +written to the doorbell depends on whether the device is using MSI or a regular +pin-based interrupt. In short, MSI uses vectors while regular interrupts set the +status register. + +Regular Interrupts + +If regular interrupts are used (due to either a guest not supporting MSI or the +user specifying not to use them on startup) then the value written to the lower +16-bits of the Doorbell register results is arbitrary and will trigger an +interrupt in the destination guest. + +Message Signalled Interrupts + +A ivshmem device may support multiple MSI vectors. If so, the lower 16-bits +written to the Doorbell register must be between 0 and the maximum number of +vectors the guest supports. The lower 16 bits written to the doorbell is the +MSI vector that will be raised in the destination guest. The number of MSI +vectors is configurable but it is set when the VM is started. + +The important thing to remember with MSI is that it is only a signal, no status +is set (since MSI interrupts are not shared). All information other than the +interrupt itself should be communicated via the shared memory region. Devices +supporting multiple MSI vectors can use different vectors to indicate different +events have occurred. The semantics of interrupt vectors are left to the +user's discretion. + + +Usage in the Guest +------------------ + +The shared memory device is intended to be used with the provided UIO driver. +Very little configuration is needed. The guest should map BAR0 to access the +registers (an array of 32-bit ints allows simple writing) and map BAR2 to +access the shared memory region itself. The size of the shared memory region +is specified when the guest (or shared memory server) is started. A guest may +map the whole shared memory region or only part of it. @@ -1722,8 +1722,8 @@ static QLIST_HEAD(memory_client_list, CPUPhysMemoryClient) memory_client_list = QLIST_HEAD_INITIALIZER(memory_client_list); static void cpu_notify_set_memory(target_phys_addr_t start_addr, - ram_addr_t size, - ram_addr_t phys_offset) + ram_addr_t size, + ram_addr_t phys_offset) { CPUPhysMemoryClient *client; QLIST_FOREACH(client, &memory_client_list, list) { @@ -1732,7 +1732,7 @@ static void cpu_notify_set_memory(target_phys_addr_t start_addr, } static int cpu_notify_sync_dirty_bitmap(target_phys_addr_t start, - target_phys_addr_t end) + target_phys_addr_t end) { CPUPhysMemoryClient *client; QLIST_FOREACH(client, &memory_client_list, list) { @@ -1819,17 +1819,17 @@ int cpu_str_to_log_mask(const char *str) p1 = strchr(p, ','); if (!p1) p1 = p + strlen(p); - if(cmp1(p,p1-p,"all")) { - for(item = cpu_log_items; item->mask != 0; item++) { - mask |= item->mask; - } - } else { - for(item = cpu_log_items; item->mask != 0; item++) { - if (cmp1(p, p1 - p, item->name)) - goto found; + if(cmp1(p,p1-p,"all")) { + for(item = cpu_log_items; item->mask != 0; item++) { + mask |= item->mask; + } + } else { + for(item = cpu_log_items; item->mask != 0; item++) { + if (cmp1(p, p1 - p, item->name)) + goto found; + } + return 0; } - return 0; - } found: mask |= item->mask; if (*p1 != ',') @@ -1923,11 +1923,11 @@ static inline void tlb_flush_jmp_cache(CPUState *env, target_ulong addr) overlap the flushed page. */ i = tb_jmp_cache_hash_page(addr - TARGET_PAGE_SIZE); memset (&env->tb_jmp_cache[i], 0, - TB_JMP_PAGE_SIZE * sizeof(TranslationBlock *)); + TB_JMP_PAGE_SIZE * sizeof(TranslationBlock *)); i = tb_jmp_cache_hash_page(addr); memset (&env->tb_jmp_cache[i], 0, - TB_JMP_PAGE_SIZE * sizeof(TranslationBlock *)); + TB_JMP_PAGE_SIZE * sizeof(TranslationBlock *)); } static CPUTLBEntry s_cputlb_empty_entry = { @@ -2703,16 +2703,16 @@ static long gethugepagesize(const char *path) int ret; do { - ret = statfs(path, &fs); + ret = statfs(path, &fs); } while (ret != 0 && errno == EINTR); if (ret != 0) { - perror(path); - return 0; + perror(path); + return 0; } if (fs.f_type != HUGETLBFS_MAGIC) - fprintf(stderr, "Warning: path not on HugeTLBFS: %s\n", path); + fprintf(stderr, "Warning: path not on HugeTLBFS: %s\n", path); return fs.f_bsize; } @@ -2731,7 +2731,7 @@ static void *file_ram_alloc(RAMBlock *block, hpagesize = gethugepagesize(path); if (!hpagesize) { - return NULL; + return NULL; } if (memory < hpagesize) { @@ -2744,14 +2744,14 @@ static void *file_ram_alloc(RAMBlock *block, } if (asprintf(&filename, "%s/qemu_back_mem.XXXXXX", path) == -1) { - return NULL; + return NULL; } fd = mkstemp(filename); if (fd < 0) { - perror("unable to create backing store for hugepages"); - free(filename); - return NULL; + perror("unable to create backing store for hugepages"); + free(filename); + return NULL; } unlink(filename); free(filename); @@ -2765,7 +2765,7 @@ static void *file_ram_alloc(RAMBlock *block, * mmap will fail. */ if (ftruncate(fd, memory)) - perror("ftruncate"); + perror("ftruncate"); #ifdef MAP_POPULATE /* NB: MAP_POPULATE won't exhaustively alloc all phys pages in the case @@ -2778,9 +2778,9 @@ static void *file_ram_alloc(RAMBlock *block, area = mmap(0, memory, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); #endif if (area == MAP_FAILED) { - perror("file_ram_alloc: can't mmap RAM pages"); - close(fd); - return (NULL); + perror("file_ram_alloc: can't mmap RAM pages"); + close(fd); + return (NULL); } block->fd = fd; return area; @@ -2825,49 +2825,7 @@ static ram_addr_t last_ram_offset(void) } ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const char *name, - ram_addr_t size, void *host) -{ - RAMBlock *new_block, *block; - - size = TARGET_PAGE_ALIGN(size); - new_block = qemu_mallocz(sizeof(*new_block)); - - if (dev && dev->parent_bus && dev->parent_bus->info->get_dev_path) { - char *id = dev->parent_bus->info->get_dev_path(dev); - if (id) { - snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id); - qemu_free(id); - } - } - pstrcat(new_block->idstr, sizeof(new_block->idstr), name); - - QLIST_FOREACH(block, &ram_list.blocks, next) { - if (!strcmp(block->idstr, new_block->idstr)) { - fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n", - new_block->idstr); - abort(); - } - } - - new_block->host = host; - - new_block->offset = find_ram_offset(size); - new_block->length = size; - - QLIST_INSERT_HEAD(&ram_list.blocks, new_block, next); - - ram_list.phys_dirty = qemu_realloc(ram_list.phys_dirty, - last_ram_offset() >> TARGET_PAGE_BITS); - memset(ram_list.phys_dirty + (new_block->offset >> TARGET_PAGE_BITS), - 0xff, size >> TARGET_PAGE_BITS); - - if (kvm_enabled()) - kvm_setup_guest_memory(new_block->host, size); - - return new_block->offset; -} - -ram_addr_t qemu_ram_alloc(DeviceState *dev, const char *name, ram_addr_t size) + ram_addr_t size, void *host) { RAMBlock *new_block, *block; @@ -2891,32 +2849,37 @@ ram_addr_t qemu_ram_alloc(DeviceState *dev, const char *name, ram_addr_t size) } } - if (mem_path) { + if (host) { + new_block->host = host; + } else { + if (mem_path) { #if defined (__linux__) && !defined(TARGET_S390X) - new_block->host = file_ram_alloc(new_block, size, mem_path); - if (!new_block->host) { - new_block->host = qemu_vmalloc(size); + new_block->host = file_ram_alloc(new_block, size, mem_path); + if (!new_block->host) { + new_block->host = qemu_vmalloc(size); #ifdef MADV_MERGEABLE - madvise(new_block->host, size, MADV_MERGEABLE); + madvise(new_block->host, size, MADV_MERGEABLE); #endif - } + } #else - fprintf(stderr, "-mem-path option unsupported\n"); - exit(1); + fprintf(stderr, "-mem-path option unsupported\n"); + exit(1); #endif - } else { + } else { #if defined(TARGET_S390X) && defined(CONFIG_KVM) - /* XXX S390 KVM requires the topmost vma of the RAM to be < 256GB */ - new_block->host = mmap((void*)0x1000000, size, - PROT_EXEC|PROT_READ|PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, -1, 0); + /* XXX S390 KVM requires the topmost vma of the RAM to be < 256GB */ + new_block->host = mmap((void*)0x1000000, size, + PROT_EXEC|PROT_READ|PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); #else - new_block->host = qemu_vmalloc(size); + new_block->host = qemu_vmalloc(size); #endif #ifdef MADV_MERGEABLE - madvise(new_block->host, size, MADV_MERGEABLE); + madvise(new_block->host, size, MADV_MERGEABLE); #endif + } } + new_block->offset = find_ram_offset(size); new_block->length = size; @@ -2946,6 +2909,11 @@ void qemu_ram_unmap(ram_addr_t addr) } } +ram_addr_t qemu_ram_alloc(DeviceState *dev, const char *name, ram_addr_t size) +{ + return qemu_ram_alloc_from_ptr(dev, name, size, NULL); +} + void qemu_ram_free(ram_addr_t addr) { RAMBlock *block; @@ -65,6 +65,8 @@ * 2006-Aug-10 Igor Kovalenko : Renamed KBDQueue to SERIOQueue, implemented * serial mouse queue. * Implemented serial mouse protocol. + * + * 2010-May-23 Artyom Tarasenko: Reworked IUS logic */ #ifdef DEBUG_SERIAL @@ -279,7 +281,7 @@ static uint32_t get_queue(void *opaque) static int escc_update_irq_chn(ChannelState *s) { - if ((((s->wregs[W_INTR] & INTR_TXINT) && s->txint == 1) || + if ((((s->wregs[W_INTR] & INTR_TXINT) && (s->txint == 1)) || // tx ints enabled, pending ((((s->wregs[W_INTR] & INTR_RXMODEMSK) == INTR_RXINT1ST) || ((s->wregs[W_INTR] & INTR_RXMODEMSK) == INTR_RXINTALL)) && @@ -342,24 +344,22 @@ static void escc_reset(DeviceState *d) static inline void set_rxint(ChannelState *s) { s->rxint = 1; - if (!s->txint_under_svc) { - s->rxint_under_svc = 1; - if (s->chn == chn_a) { - if (s->wregs[W_MINTR] & MINTR_STATUSHI) - s->otherchn->rregs[R_IVEC] = IVEC_HIRXINTA; - else - s->otherchn->rregs[R_IVEC] = IVEC_LORXINTA; - } else { - if (s->wregs[W_MINTR] & MINTR_STATUSHI) - s->rregs[R_IVEC] = IVEC_HIRXINTB; - else - s->rregs[R_IVEC] = IVEC_LORXINTB; - } - } - if (s->chn == chn_a) + /* XXX: missing daisy chainnig: chn_b rx should have a lower priority + than chn_a rx/tx/special_condition service*/ + s->rxint_under_svc = 1; + if (s->chn == chn_a) { s->rregs[R_INTR] |= INTR_RXINTA; - else + if (s->wregs[W_MINTR] & MINTR_STATUSHI) + s->otherchn->rregs[R_IVEC] = IVEC_HIRXINTA; + else + s->otherchn->rregs[R_IVEC] = IVEC_LORXINTA; + } else { s->otherchn->rregs[R_INTR] |= INTR_RXINTB; + if (s->wregs[W_MINTR] & MINTR_STATUSHI) + s->rregs[R_IVEC] = IVEC_HIRXINTB; + else + s->rregs[R_IVEC] = IVEC_LORXINTB; + } escc_update_irq(s); } @@ -369,19 +369,17 @@ static inline void set_txint(ChannelState *s) if (!s->rxint_under_svc) { s->txint_under_svc = 1; if (s->chn == chn_a) { + s->rregs[R_INTR] |= INTR_TXINTA; if (s->wregs[W_MINTR] & MINTR_STATUSHI) s->otherchn->rregs[R_IVEC] = IVEC_HITXINTA; else s->otherchn->rregs[R_IVEC] = IVEC_LOTXINTA; } else { s->rregs[R_IVEC] = IVEC_TXINTB; + s->otherchn->rregs[R_INTR] |= INTR_TXINTB; } - } - if (s->chn == chn_a) - s->rregs[R_INTR] |= INTR_TXINTA; - else - s->otherchn->rregs[R_INTR] |= INTR_TXINTB; escc_update_irq(s); + } } static inline void clr_rxint(ChannelState *s) @@ -417,6 +415,7 @@ static inline void clr_txint(ChannelState *s) s->otherchn->rregs[R_IVEC] = IVEC_LONOINT; s->rregs[R_INTR] &= ~INTR_TXINTA; } else { + s->otherchn->rregs[R_INTR] &= ~INTR_TXINTB; if (s->wregs[W_MINTR] & MINTR_STATUSHI) s->rregs[R_IVEC] = IVEC_HINOINT; else @@ -515,10 +514,15 @@ static void escc_mem_writeb(void *opaque, target_phys_addr_t addr, uint32_t val) clr_txint(s); break; case CMD_CLR_IUS: - if (s->rxint_under_svc) - clr_rxint(s); - else if (s->txint_under_svc) - clr_txint(s); + if (s->rxint_under_svc) { + s->rxint_under_svc = 0; + if (s->txint) { + set_txint(s); + } + } else if (s->txint_under_svc) { + s->txint_under_svc = 0; + } + escc_update_irq(s); break; default: break; @@ -264,6 +264,8 @@ int register_savevm_live(DeviceState *dev, void *opaque); void unregister_savevm(DeviceState *dev, const char *idstr, void *opaque); +void register_device_unmigratable(DeviceState *dev, const char *idstr, + void *opaque); typedef void QEMUResetHandler(void *opaque); diff --git a/hw/ivshmem.c b/hw/ivshmem.c new file mode 100644 index 000000000..bbb5cbaa1 --- /dev/null +++ b/hw/ivshmem.c @@ -0,0 +1,828 @@ +/* + * Inter-VM Shared Memory PCI device. + * + * Author: + * Cam Macdonell <cam@cs.ualberta.ca> + * + * Based On: cirrus_vga.c + * Copyright (c) 2004 Fabrice Bellard + * Copyright (c) 2004 Makoto Suzuki (suzu) + * + * and rtl8139.c + * Copyright (c) 2006 Igor Kovalenko + * + * This code is licensed under the GNU GPL v2. + */ +#include "hw.h" +#include "pc.h" +#include "pci.h" +#include "msix.h" +#include "kvm.h" + +#include <sys/mman.h> +#include <sys/types.h> + +#define IVSHMEM_IOEVENTFD 0 +#define IVSHMEM_MSI 1 + +#define IVSHMEM_PEER 0 +#define IVSHMEM_MASTER 1 + +#define IVSHMEM_REG_BAR_SIZE 0x100 + +//#define DEBUG_IVSHMEM +#ifdef DEBUG_IVSHMEM +#define IVSHMEM_DPRINTF(fmt, ...) \ + do {printf("IVSHMEM: " fmt, ## __VA_ARGS__); } while (0) +#else +#define IVSHMEM_DPRINTF(fmt, ...) +#endif + +typedef struct Peer { + int nb_eventfds; + int *eventfds; +} Peer; + +typedef struct EventfdEntry { + PCIDevice *pdev; + int vector; +} EventfdEntry; + +typedef struct IVShmemState { + PCIDevice dev; + uint32_t intrmask; + uint32_t intrstatus; + uint32_t doorbell; + + CharDriverState **eventfd_chr; + CharDriverState *server_chr; + int ivshmem_mmio_io_addr; + + pcibus_t mmio_addr; + pcibus_t shm_pci_addr; + uint64_t ivshmem_offset; + uint64_t ivshmem_size; /* size of shared memory region */ + int shm_fd; /* shared memory file descriptor */ + + Peer *peers; + int nb_peers; /* how many guests we have space for */ + int max_peer; /* maximum numbered peer */ + + int vm_id; + uint32_t vectors; + uint32_t features; + EventfdEntry *eventfd_table; + + char * shmobj; + char * sizearg; + char * role; + int role_val; /* scalar to avoid multiple string comparisons */ +} IVShmemState; + +/* registers for the Inter-VM shared memory device */ +enum ivshmem_registers { + INTRMASK = 0, + INTRSTATUS = 4, + IVPOSITION = 8, + DOORBELL = 12, +}; + +static inline uint32_t ivshmem_has_feature(IVShmemState *ivs, + unsigned int feature) { + return (ivs->features & (1 << feature)); +} + +static inline bool is_power_of_two(uint64_t x) { + return (x & (x - 1)) == 0; +} + +static void ivshmem_map(PCIDevice *pci_dev, int region_num, + pcibus_t addr, pcibus_t size, int type) +{ + IVShmemState *s = DO_UPCAST(IVShmemState, dev, pci_dev); + + s->shm_pci_addr = addr; + + if (s->ivshmem_offset > 0) { + cpu_register_physical_memory(s->shm_pci_addr, s->ivshmem_size, + s->ivshmem_offset); + } + + IVSHMEM_DPRINTF("guest pci addr = %" FMT_PCIBUS ", guest h/w addr = %" + PRIu64 ", size = %" FMT_PCIBUS "\n", addr, s->ivshmem_offset, size); + +} + +/* accessing registers - based on rtl8139 */ +static void ivshmem_update_irq(IVShmemState *s, int val) +{ + int isr; + isr = (s->intrstatus & s->intrmask) & 0xffffffff; + + /* don't print ISR resets */ + if (isr) { + IVSHMEM_DPRINTF("Set IRQ to %d (%04x %04x)\n", + isr ? 1 : 0, s->intrstatus, s->intrmask); + } + + qemu_set_irq(s->dev.irq[0], (isr != 0)); +} + +static void ivshmem_IntrMask_write(IVShmemState *s, uint32_t val) +{ + IVSHMEM_DPRINTF("IntrMask write(w) val = 0x%04x\n", val); + + s->intrmask = val; + + ivshmem_update_irq(s, val); +} + +static uint32_t ivshmem_IntrMask_read(IVShmemState *s) +{ + uint32_t ret = s->intrmask; + + IVSHMEM_DPRINTF("intrmask read(w) val = 0x%04x\n", ret); + + return ret; +} + +static void ivshmem_IntrStatus_write(IVShmemState *s, uint32_t val) +{ + IVSHMEM_DPRINTF("IntrStatus write(w) val = 0x%04x\n", val); + + s->intrstatus = val; + + ivshmem_update_irq(s, val); + return; +} + +static uint32_t ivshmem_IntrStatus_read(IVShmemState *s) +{ + uint32_t ret = s->intrstatus; + + /* reading ISR clears all interrupts */ + s->intrstatus = 0; + + ivshmem_update_irq(s, 0); + + return ret; +} + +static void ivshmem_io_writew(void *opaque, target_phys_addr_t addr, + uint32_t val) +{ + + IVSHMEM_DPRINTF("We shouldn't be writing words\n"); +} + +static void ivshmem_io_writel(void *opaque, target_phys_addr_t addr, + uint32_t val) +{ + IVShmemState *s = opaque; + + uint64_t write_one = 1; + uint16_t dest = val >> 16; + uint16_t vector = val & 0xff; + + addr &= 0xfc; + + IVSHMEM_DPRINTF("writing to addr " TARGET_FMT_plx "\n", addr); + switch (addr) + { + case INTRMASK: + ivshmem_IntrMask_write(s, val); + break; + + case INTRSTATUS: + ivshmem_IntrStatus_write(s, val); + break; + + case DOORBELL: + /* check that dest VM ID is reasonable */ + if ((dest < 0) || (dest > s->max_peer)) { + IVSHMEM_DPRINTF("Invalid destination VM ID (%d)\n", dest); + break; + } + + /* check doorbell range */ + if ((vector >= 0) && (vector < s->peers[dest].nb_eventfds)) { + IVSHMEM_DPRINTF("Writing %" PRId64 " to VM %d on vector %d\n", + write_one, dest, vector); + if (write(s->peers[dest].eventfds[vector], + &(write_one), 8) != 8) { + IVSHMEM_DPRINTF("error writing to eventfd\n"); + } + } + break; + default: + IVSHMEM_DPRINTF("Invalid VM Doorbell VM %d\n", dest); + } +} + +static void ivshmem_io_writeb(void *opaque, target_phys_addr_t addr, + uint32_t val) +{ + IVSHMEM_DPRINTF("We shouldn't be writing bytes\n"); +} + +static uint32_t ivshmem_io_readw(void *opaque, target_phys_addr_t addr) +{ + + IVSHMEM_DPRINTF("We shouldn't be reading words\n"); + return 0; +} + +static uint32_t ivshmem_io_readl(void *opaque, target_phys_addr_t addr) +{ + + IVShmemState *s = opaque; + uint32_t ret; + + switch (addr) + { + case INTRMASK: + ret = ivshmem_IntrMask_read(s); + break; + + case INTRSTATUS: + ret = ivshmem_IntrStatus_read(s); + break; + + case IVPOSITION: + /* return my VM ID if the memory is mapped */ + if (s->shm_fd > 0) { + ret = s->vm_id; + } else { + ret = -1; + } + break; + + default: + IVSHMEM_DPRINTF("why are we reading " TARGET_FMT_plx "\n", addr); + ret = 0; + } + + return ret; +} + +static uint32_t ivshmem_io_readb(void *opaque, target_phys_addr_t addr) +{ + IVSHMEM_DPRINTF("We shouldn't be reading bytes\n"); + + return 0; +} + +static CPUReadMemoryFunc * const ivshmem_mmio_read[3] = { + ivshmem_io_readb, + ivshmem_io_readw, + ivshmem_io_readl, +}; + +static CPUWriteMemoryFunc * const ivshmem_mmio_write[3] = { + ivshmem_io_writeb, + ivshmem_io_writew, + ivshmem_io_writel, +}; + +static void ivshmem_receive(void *opaque, const uint8_t *buf, int size) +{ + IVShmemState *s = opaque; + + ivshmem_IntrStatus_write(s, *buf); + + IVSHMEM_DPRINTF("ivshmem_receive 0x%02x\n", *buf); +} + +static int ivshmem_can_receive(void * opaque) +{ + return 8; +} + +static void ivshmem_event(void *opaque, int event) +{ + IVSHMEM_DPRINTF("ivshmem_event %d\n", event); +} + +static void fake_irqfd(void *opaque, const uint8_t *buf, int size) { + + EventfdEntry *entry = opaque; + PCIDevice *pdev = entry->pdev; + + IVSHMEM_DPRINTF("interrupt on vector %p %d\n", pdev, entry->vector); + msix_notify(pdev, entry->vector); +} + +static CharDriverState* create_eventfd_chr_device(void * opaque, int eventfd, + int vector) +{ + /* create a event character device based on the passed eventfd */ + IVShmemState *s = opaque; + CharDriverState * chr; + + chr = qemu_chr_open_eventfd(eventfd); + + if (chr == NULL) { + fprintf(stderr, "creating eventfd for eventfd %d failed\n", eventfd); + exit(-1); + } + + /* if MSI is supported we need multiple interrupts */ + if (ivshmem_has_feature(s, IVSHMEM_MSI)) { + s->eventfd_table[vector].pdev = &s->dev; + s->eventfd_table[vector].vector = vector; + + qemu_chr_add_handlers(chr, ivshmem_can_receive, fake_irqfd, + ivshmem_event, &s->eventfd_table[vector]); + } else { + qemu_chr_add_handlers(chr, ivshmem_can_receive, ivshmem_receive, + ivshmem_event, s); + } + + return chr; + +} + +static int check_shm_size(IVShmemState *s, int fd) { + /* check that the guest isn't going to try and map more memory than the + * the object has allocated return -1 to indicate error */ + + struct stat buf; + + fstat(fd, &buf); + + if (s->ivshmem_size > buf.st_size) { + fprintf(stderr, "IVSHMEM ERROR: Requested memory size greater"); + fprintf(stderr, " than shared object size (%" PRIu64 " > %ld)\n", + s->ivshmem_size, buf.st_size); + return -1; + } else { + return 0; + } +} + +/* create the shared memory BAR when we are not using the server, so we can + * create the BAR and map the memory immediately */ +static void create_shared_memory_BAR(IVShmemState *s, int fd) { + + void * ptr; + + s->shm_fd = fd; + + ptr = mmap(0, s->ivshmem_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + + s->ivshmem_offset = qemu_ram_alloc_from_ptr(&s->dev.qdev, "ivshmem.bar2", + s->ivshmem_size, ptr); + + /* region for shared memory */ + pci_register_bar(&s->dev, 2, s->ivshmem_size, + PCI_BASE_ADDRESS_SPACE_MEMORY, ivshmem_map); +} + +static void close_guest_eventfds(IVShmemState *s, int posn) +{ + int i, guest_curr_max; + + guest_curr_max = s->peers[posn].nb_eventfds; + + for (i = 0; i < guest_curr_max; i++) { + kvm_set_ioeventfd_mmio_long(s->peers[posn].eventfds[i], + s->mmio_addr + DOORBELL, (posn << 16) | i, 0); + close(s->peers[posn].eventfds[i]); + } + + qemu_free(s->peers[posn].eventfds); + s->peers[posn].nb_eventfds = 0; +} + +static void setup_ioeventfds(IVShmemState *s) { + + int i, j; + + for (i = 0; i <= s->max_peer; i++) { + for (j = 0; j < s->peers[i].nb_eventfds; j++) { + kvm_set_ioeventfd_mmio_long(s->peers[i].eventfds[j], + s->mmio_addr + DOORBELL, (i << 16) | j, 1); + } + } +} + +/* this function increase the dynamic storage need to store data about other + * guests */ +static void increase_dynamic_storage(IVShmemState *s, int new_min_size) { + + int j, old_nb_alloc; + + old_nb_alloc = s->nb_peers; + + while (new_min_size >= s->nb_peers) + s->nb_peers = s->nb_peers * 2; + + IVSHMEM_DPRINTF("bumping storage to %d guests\n", s->nb_peers); + s->peers = qemu_realloc(s->peers, s->nb_peers * sizeof(Peer)); + + /* zero out new pointers */ + for (j = old_nb_alloc; j < s->nb_peers; j++) { + s->peers[j].eventfds = NULL; + s->peers[j].nb_eventfds = 0; + } +} + +static void ivshmem_read(void *opaque, const uint8_t * buf, int flags) +{ + IVShmemState *s = opaque; + int incoming_fd, tmp_fd; + int guest_max_eventfd; + long incoming_posn; + + memcpy(&incoming_posn, buf, sizeof(long)); + /* pick off s->server_chr->msgfd and store it, posn should accompany msg */ + tmp_fd = qemu_chr_get_msgfd(s->server_chr); + IVSHMEM_DPRINTF("posn is %ld, fd is %d\n", incoming_posn, tmp_fd); + + /* make sure we have enough space for this guest */ + if (incoming_posn >= s->nb_peers) { + increase_dynamic_storage(s, incoming_posn); + } + + if (tmp_fd == -1) { + /* if posn is positive and unseen before then this is our posn*/ + if ((incoming_posn >= 0) && + (s->peers[incoming_posn].eventfds == NULL)) { + /* receive our posn */ + s->vm_id = incoming_posn; + return; + } else { + /* otherwise an fd == -1 means an existing guest has gone away */ + IVSHMEM_DPRINTF("posn %ld has gone away\n", incoming_posn); + close_guest_eventfds(s, incoming_posn); + return; + } + } + + /* because of the implementation of get_msgfd, we need a dup */ + incoming_fd = dup(tmp_fd); + + if (incoming_fd == -1) { + fprintf(stderr, "could not allocate file descriptor %s\n", + strerror(errno)); + return; + } + + /* if the position is -1, then it's shared memory region fd */ + if (incoming_posn == -1) { + + void * map_ptr; + + s->max_peer = 0; + + if (check_shm_size(s, incoming_fd) == -1) { + exit(-1); + } + + /* mmap the region and map into the BAR2 */ + map_ptr = mmap(0, s->ivshmem_size, PROT_READ|PROT_WRITE, MAP_SHARED, + incoming_fd, 0); + s->ivshmem_offset = qemu_ram_alloc_from_ptr(&s->dev.qdev, + "ivshmem.bar2", s->ivshmem_size, map_ptr); + + IVSHMEM_DPRINTF("guest pci addr = %" FMT_PCIBUS ", guest h/w addr = %" + PRIu64 ", size = %" PRIu64 "\n", s->shm_pci_addr, + s->ivshmem_offset, s->ivshmem_size); + + if (s->shm_pci_addr > 0) { + /* map memory into BAR2 */ + cpu_register_physical_memory(s->shm_pci_addr, s->ivshmem_size, + s->ivshmem_offset); + } + + /* only store the fd if it is successfully mapped */ + s->shm_fd = incoming_fd; + + return; + } + + /* each guest has an array of eventfds, and we keep track of how many + * guests for each VM */ + guest_max_eventfd = s->peers[incoming_posn].nb_eventfds; + + if (guest_max_eventfd == 0) { + /* one eventfd per MSI vector */ + s->peers[incoming_posn].eventfds = (int *) qemu_malloc(s->vectors * + sizeof(int)); + } + + /* this is an eventfd for a particular guest VM */ + IVSHMEM_DPRINTF("eventfds[%ld][%d] = %d\n", incoming_posn, + guest_max_eventfd, incoming_fd); + s->peers[incoming_posn].eventfds[guest_max_eventfd] = incoming_fd; + + /* increment count for particular guest */ + s->peers[incoming_posn].nb_eventfds++; + + /* keep track of the maximum VM ID */ + if (incoming_posn > s->max_peer) { + s->max_peer = incoming_posn; + } + + if (incoming_posn == s->vm_id) { + s->eventfd_chr[guest_max_eventfd] = create_eventfd_chr_device(s, + s->peers[s->vm_id].eventfds[guest_max_eventfd], + guest_max_eventfd); + } + + if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD)) { + if (kvm_set_ioeventfd_mmio_long(incoming_fd, s->mmio_addr + DOORBELL, + (incoming_posn << 16) | guest_max_eventfd, 1) < 0) { + fprintf(stderr, "ivshmem: ioeventfd not available\n"); + } + } + + return; +} + +static void ivshmem_reset(DeviceState *d) +{ + IVShmemState *s = DO_UPCAST(IVShmemState, dev.qdev, d); + + s->intrstatus = 0; + return; +} + +static void ivshmem_mmio_map(PCIDevice *pci_dev, int region_num, + pcibus_t addr, pcibus_t size, int type) +{ + IVShmemState *s = DO_UPCAST(IVShmemState, dev, pci_dev); + + s->mmio_addr = addr; + cpu_register_physical_memory(addr + 0, IVSHMEM_REG_BAR_SIZE, + s->ivshmem_mmio_io_addr); + + if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD)) { + setup_ioeventfds(s); + } +} + +static uint64_t ivshmem_get_size(IVShmemState * s) { + + uint64_t value; + char *ptr; + + value = strtoull(s->sizearg, &ptr, 10); + switch (*ptr) { + case 0: case 'M': case 'm': + value <<= 20; + break; + case 'G': case 'g': + value <<= 30; + break; + default: + fprintf(stderr, "qemu: invalid ram size: %s\n", s->sizearg); + exit(1); + } + + /* BARs must be a power of 2 */ + if (!is_power_of_two(value)) { + fprintf(stderr, "ivshmem: size must be power of 2\n"); + exit(1); + } + + return value; +} + +static void ivshmem_setup_msi(IVShmemState * s) { + + int i; + + /* allocate the MSI-X vectors */ + + if (!msix_init(&s->dev, s->vectors, 1, 0)) { + pci_register_bar(&s->dev, 1, + msix_bar_size(&s->dev), + PCI_BASE_ADDRESS_SPACE_MEMORY, + msix_mmio_map); + IVSHMEM_DPRINTF("msix initialized (%d vectors)\n", s->vectors); + } else { + IVSHMEM_DPRINTF("msix initialization failed\n"); + exit(1); + } + + /* 'activate' the vectors */ + for (i = 0; i < s->vectors; i++) { + msix_vector_use(&s->dev, i); + } + + /* allocate Qemu char devices for receiving interrupts */ + s->eventfd_table = qemu_mallocz(s->vectors * sizeof(EventfdEntry)); +} + +static void ivshmem_save(QEMUFile* f, void *opaque) +{ + IVShmemState *proxy = opaque; + + IVSHMEM_DPRINTF("ivshmem_save\n"); + pci_device_save(&proxy->dev, f); + + if (ivshmem_has_feature(proxy, IVSHMEM_MSI)) { + msix_save(&proxy->dev, f); + } else { + qemu_put_be32(f, proxy->intrstatus); + qemu_put_be32(f, proxy->intrmask); + } + +} + +static int ivshmem_load(QEMUFile* f, void *opaque, int version_id) +{ + IVSHMEM_DPRINTF("ivshmem_load\n"); + + IVShmemState *proxy = opaque; + int ret, i; + + if (version_id > 0) { + return -EINVAL; + } + + if (proxy->role_val == IVSHMEM_PEER) { + fprintf(stderr, "ivshmem: 'peer' devices are not migratable\n"); + return -EINVAL; + } + + ret = pci_device_load(&proxy->dev, f); + if (ret) { + return ret; + } + + if (ivshmem_has_feature(proxy, IVSHMEM_MSI)) { + msix_load(&proxy->dev, f); + for (i = 0; i < proxy->vectors; i++) { + msix_vector_use(&proxy->dev, i); + } + } else { + proxy->intrstatus = qemu_get_be32(f); + proxy->intrmask = qemu_get_be32(f); + } + + return 0; +} + +static int pci_ivshmem_init(PCIDevice *dev) +{ + IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev); + uint8_t *pci_conf; + + if (s->sizearg == NULL) + s->ivshmem_size = 4 << 20; /* 4 MB default */ + else { + s->ivshmem_size = ivshmem_get_size(s); + } + + register_savevm(&s->dev.qdev, "ivshmem", 0, 0, ivshmem_save, ivshmem_load, + dev); + + /* IRQFD requires MSI */ + if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD) && + !ivshmem_has_feature(s, IVSHMEM_MSI)) { + fprintf(stderr, "ivshmem: ioeventfd/irqfd requires MSI\n"); + exit(1); + } + + /* check that role is reasonable */ + if (s->role) { + if (strncmp(s->role, "peer", 5) == 0) { + s->role_val = IVSHMEM_PEER; + } else if (strncmp(s->role, "master", 7) == 0) { + s->role_val = IVSHMEM_MASTER; + } else { + fprintf(stderr, "ivshmem: 'role' must be 'peer' or 'master'\n"); + exit(1); + } + } else { + s->role_val = IVSHMEM_MASTER; /* default */ + } + + if (s->role_val == IVSHMEM_PEER) { + register_device_unmigratable(&s->dev.qdev, "ivshmem", s); + } + + pci_conf = s->dev.config; + pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT_QUMRANET); + pci_conf[0x02] = 0x10; + pci_conf[0x03] = 0x11; + pci_conf[PCI_COMMAND] = PCI_COMMAND_IO | PCI_COMMAND_MEMORY; + pci_config_set_class(pci_conf, PCI_CLASS_MEMORY_RAM); + pci_conf[PCI_HEADER_TYPE] = PCI_HEADER_TYPE_NORMAL; + + pci_config_set_interrupt_pin(pci_conf, 1); + + s->shm_pci_addr = 0; + s->ivshmem_offset = 0; + s->shm_fd = 0; + + s->ivshmem_mmio_io_addr = cpu_register_io_memory(ivshmem_mmio_read, + ivshmem_mmio_write, s); + /* region for registers*/ + pci_register_bar(&s->dev, 0, IVSHMEM_REG_BAR_SIZE, + PCI_BASE_ADDRESS_SPACE_MEMORY, ivshmem_mmio_map); + + if ((s->server_chr != NULL) && + (strncmp(s->server_chr->filename, "unix:", 5) == 0)) { + /* if we get a UNIX socket as the parameter we will talk + * to the ivshmem server to receive the memory region */ + + if (s->shmobj != NULL) { + fprintf(stderr, "WARNING: do not specify both 'chardev' " + "and 'shm' with ivshmem\n"); + } + + IVSHMEM_DPRINTF("using shared memory server (socket = %s)\n", + s->server_chr->filename); + + if (ivshmem_has_feature(s, IVSHMEM_MSI)) { + ivshmem_setup_msi(s); + } + + /* we allocate enough space for 16 guests and grow as needed */ + s->nb_peers = 16; + s->vm_id = -1; + + /* allocate/initialize space for interrupt handling */ + s->peers = qemu_mallocz(s->nb_peers * sizeof(Peer)); + + pci_register_bar(&s->dev, 2, s->ivshmem_size, + PCI_BASE_ADDRESS_SPACE_MEMORY, ivshmem_map); + + s->eventfd_chr = qemu_mallocz(s->vectors * sizeof(CharDriverState *)); + + qemu_chr_add_handlers(s->server_chr, ivshmem_can_receive, ivshmem_read, + ivshmem_event, s); + } else { + /* just map the file immediately, we're not using a server */ + int fd; + + if (s->shmobj == NULL) { + fprintf(stderr, "Must specify 'chardev' or 'shm' to ivshmem\n"); + } + + IVSHMEM_DPRINTF("using shm_open (shm object = %s)\n", s->shmobj); + + /* try opening with O_EXCL and if it succeeds zero the memory + * by truncating to 0 */ + if ((fd = shm_open(s->shmobj, O_CREAT|O_RDWR|O_EXCL, + S_IRWXU|S_IRWXG|S_IRWXO)) > 0) { + /* truncate file to length PCI device's memory */ + if (ftruncate(fd, s->ivshmem_size) != 0) { + fprintf(stderr, "ivshmem: could not truncate shared file\n"); + } + + } else if ((fd = shm_open(s->shmobj, O_CREAT|O_RDWR, + S_IRWXU|S_IRWXG|S_IRWXO)) < 0) { + fprintf(stderr, "ivshmem: could not open shared file\n"); + exit(-1); + + } + + if (check_shm_size(s, fd) == -1) { + exit(-1); + } + + create_shared_memory_BAR(s, fd); + + } + + return 0; +} + +static int pci_ivshmem_uninit(PCIDevice *dev) +{ + IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev); + + cpu_unregister_io_memory(s->ivshmem_mmio_io_addr); + unregister_savevm(&dev->qdev, "ivshmem", s); + + return 0; +} + +static PCIDeviceInfo ivshmem_info = { + .qdev.name = "ivshmem", + .qdev.size = sizeof(IVShmemState), + .qdev.reset = ivshmem_reset, + .init = pci_ivshmem_init, + .exit = pci_ivshmem_uninit, + .qdev.props = (Property[]) { + DEFINE_PROP_CHR("chardev", IVShmemState, server_chr), + DEFINE_PROP_STRING("size", IVShmemState, sizearg), + DEFINE_PROP_UINT32("vectors", IVShmemState, vectors, 1), + DEFINE_PROP_BIT("ioeventfd", IVShmemState, features, IVSHMEM_IOEVENTFD, false), + DEFINE_PROP_BIT("msi", IVShmemState, features, IVSHMEM_MSI, true), + DEFINE_PROP_STRING("shm", IVShmemState, shmobj), + DEFINE_PROP_STRING("role", IVShmemState, role), + DEFINE_PROP_END_OF_LIST(), + } +}; + +static void ivshmem_register_devices(void) +{ + pci_qdev_register(&ivshmem_info); +} + +device_init(ivshmem_register_devices) diff --git a/hw/pckbd.c b/hw/pckbd.c index 0533b1d9e..6e4e4062a 100644 --- a/hw/pckbd.c +++ b/hw/pckbd.c @@ -56,7 +56,9 @@ #define KBD_CCMD_WRITE_MOUSE 0xD4 /* Write the following byte to the mouse */ #define KBD_CCMD_DISABLE_A20 0xDD /* HP vectra only ? */ #define KBD_CCMD_ENABLE_A20 0xDF /* HP vectra only ? */ -#define KBD_CCMD_RESET 0xFE +#define KBD_CCMD_PULSE_BITS_3_0 0xF0 /* Pulse bits 3-0 of the output port P2. */ +#define KBD_CCMD_RESET 0xFE /* Pulse bit 0 of the output port P2 = CPU reset. */ +#define KBD_CCMD_NO_OP 0xFF /* Pulse no bits of the output port P2. */ /* Keyboard Commands */ #define KBD_CMD_SET_LEDS 0xED /* Set keyboard leds */ @@ -238,6 +240,21 @@ static void kbd_write_command(void *opaque, uint32_t addr, uint32_t val) KBDState *s = opaque; DPRINTF("kbd: write cmd=0x%02x\n", val); + + /* Bits 3-0 of the output port P2 of the keyboard controller may be pulsed + * low for approximately 6 micro seconds. Bits 3-0 of the KBD_CCMD_PULSE + * command specify the output port bits to be pulsed. + * 0: Bit should be pulsed. 1: Bit should not be modified. + * The only useful version of this command is pulsing bit 0, + * which does a CPU reset. + */ + if((val & KBD_CCMD_PULSE_BITS_3_0) == KBD_CCMD_PULSE_BITS_3_0) { + if(!(val & 1)) + val = KBD_CCMD_RESET; + else + val = KBD_CCMD_NO_OP; + } + switch(val) { case KBD_CCMD_READ_MODE: kbd_queue(s, s->mode, 0); @@ -294,8 +311,8 @@ static void kbd_write_command(void *opaque, uint32_t addr, uint32_t val) case KBD_CCMD_RESET: qemu_system_reset_request(); break; - case 0xff: - /* ignore that - I don't know what is its use */ + case KBD_CCMD_NO_OP: + /* ignore that */ break; default: fprintf(stderr, "qemu: unsupported keyboard cmd=0x%02x\n", val); diff --git a/hw/sh_intc.c b/hw/sh_intc.c index da36d32b1..d3f5ea57d 100644 --- a/hw/sh_intc.c +++ b/hw/sh_intc.c @@ -431,9 +431,8 @@ int sh_intc_init(struct intc_desc *desc, desc->nr_prio_regs = nr_prio_regs; i = sizeof(struct intc_source) * nr_sources; - desc->sources = qemu_malloc(i); + desc->sources = qemu_mallocz(i); - memset(desc->sources, 0, i); for (i = 0; i < desc->nr_sources; i++) { struct intc_source *source = desc->sources + i; diff --git a/hw/virtio-9p-local.c b/hw/virtio-9p-local.c index 04f7f6f50..43c03c188 100644 --- a/hw/virtio-9p-local.c +++ b/hw/virtio-9p-local.c @@ -426,9 +426,6 @@ static int local_rename(FsContext *ctx, const char *oldpath, int err; tmp = qemu_strdup(rpath(ctx, oldpath)); - if (tmp == NULL) { - return -1; - } err = rename(tmp, rpath(ctx, newpath)); if (err == -1) { diff --git a/hw/virtio-9p.c b/hw/virtio-9p.c index f8c85c3d2..047c7ea4e 100644 --- a/hw/virtio-9p.c +++ b/hw/virtio-9p.c @@ -1969,9 +1969,8 @@ static void v9fs_wstat_post_chown(V9fsState *s, V9fsWstatState *vs, int err) end = old_name; } - new_name = qemu_malloc(end - old_name + vs->v9stat.name.size + 1); + new_name = qemu_mallocz(end - old_name + vs->v9stat.name.size + 1); - memset(new_name, 0, end - old_name + vs->v9stat.name.size + 1); memcpy(new_name, old_name, end - old_name); memcpy(new_name + (end - old_name), vs->v9stat.name.data, vs->v9stat.name.size); @@ -1268,6 +1268,38 @@ int kvm_set_signal_mask(CPUState *env, const sigset_t *sigset) return r; } +int kvm_set_ioeventfd_mmio_long(int fd, uint32_t addr, uint32_t val, bool assign) +{ +#ifdef KVM_IOEVENTFD + int ret; + struct kvm_ioeventfd iofd; + + iofd.datamatch = val; + iofd.addr = addr; + iofd.len = 4; + iofd.flags = KVM_IOEVENTFD_FLAG_DATAMATCH; + iofd.fd = fd; + + if (!kvm_enabled()) { + return -ENOSYS; + } + + if (!assign) { + iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN; + } + + ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd); + + if (ret < 0) { + return -errno; + } + + return 0; +#else + return -ENOSYS; +#endif +} + int kvm_set_ioeventfd_pio_word(int fd, uint16_t addr, uint16_t val, bool assign) { #ifdef KVM_IOEVENTFD diff --git a/kvm-stub.c b/kvm-stub.c index 3378bd3b2..d45f9fa1d 100644 --- a/kvm-stub.c +++ b/kvm-stub.c @@ -136,3 +136,8 @@ int kvm_set_ioeventfd_pio_word(int fd, uint16_t addr, uint16_t val, bool assign) { return -ENOSYS; } + +int kvm_set_ioeventfd_mmio_long(int fd, uint32_t adr, uint32_t val, bool assign) +{ + return -ENOSYS; +} @@ -182,6 +182,7 @@ static inline void cpu_synchronize_post_init(CPUState *env) } #endif +int kvm_set_ioeventfd_mmio_long(int fd, uint32_t adr, uint32_t val, bool assign); #if defined(KVM_IRQFD) && defined(CONFIG_KVM) int kvm_set_irqfd(int gsi, int fd, bool assigned); diff --git a/migration.c b/migration.c index a160462df..468d51749 100644 --- a/migration.c +++ b/migration.c @@ -316,8 +316,14 @@ ssize_t migrate_fd_put_buffer(void *opaque, const void *data, size_t size) if (ret == -1) ret = -(s->get_error(s)); - if (ret == -EAGAIN) + if (ret == -EAGAIN) { qemu_set_fd_handler2(s->fd, NULL, NULL, migrate_fd_put_notify, s); + } else if (ret < 0) { + if (s->mon) { + monitor_resume(s->mon); + } + s->state = MIG_STATE_ERROR; + } return ret; } diff --git a/os-posix.c b/os-posix.c index 00133a0c7..6321e990c 100644 --- a/os-posix.c +++ b/os-posix.c @@ -110,7 +110,7 @@ char *os_find_datadir(const char *argv0) size_t len = sizeof(buf) - 1; *buf = '\0'; - if (!sysctl(mib, sizeof(mib)/sizeof(*mib), buf, &len, NULL, 0) && + if (!sysctl(mib, ARRAY_SIZE(mib), buf, &len, NULL, 0) && *buf) { buf[sizeof(buf) - 1] = '\0'; p = buf; diff --git a/os-win32.c b/os-win32.c index d98fd77c1..dd46bf459 100644 --- a/os-win32.c +++ b/os-win32.c @@ -34,6 +34,21 @@ #include "qemu-options.h" /***********************************************************/ +/* Functions missing in mingw */ + +int setenv(const char *name, const char *value, int overwrite) +{ + int result = 0; + if (overwrite || !getenv(name)) { + size_t length = strlen(name) + strlen(value) + 2; + char *string = qemu_malloc(length); + snprintf(string, length, "%s=%s", name, value); + result = putenv(string); + } + return result; +} + +/***********************************************************/ /* Polling handling */ typedef struct PollingEntry { @@ -95,6 +95,8 @@ int qemu_create_pidfile(const char *filename); #ifdef _WIN32 int ffs(int i); +int setenv(const char *name, const char *value, int overwrite); + typedef struct { long tv_sec; long tv_usec; diff --git a/qemu-char.c b/qemu-char.c index 9b69d928e..33f223754 100644 --- a/qemu-char.c +++ b/qemu-char.c @@ -2087,6 +2087,13 @@ static void tcp_chr_read(void *opaque) } } +#ifndef _WIN32 +CharDriverState *qemu_chr_open_eventfd(int eventfd) +{ + return qemu_chr_open_fd(eventfd, eventfd); +} +#endif + static void tcp_chr_connect(void *opaque) { CharDriverState *chr = opaque; diff --git a/qemu-char.h b/qemu-char.h index e3a07838a..6ea01ba17 100644 --- a/qemu-char.h +++ b/qemu-char.h @@ -94,6 +94,9 @@ void qemu_chr_info_print(Monitor *mon, const QObject *ret_data); void qemu_chr_info(Monitor *mon, QObject **ret_data); CharDriverState *qemu_chr_find(const char *name); +/* add an eventfd to the qemu devices that are polled */ +CharDriverState *qemu_chr_open_eventfd(int eventfd); + extern int term_escape_char; /* async I/O support */ diff --git a/qemu-config.c b/qemu-config.c index 08ee55375..562c91808 100644 --- a/qemu-config.c +++ b/qemu-config.c @@ -251,11 +251,9 @@ QemuOptsList qemu_rtc_opts = { },{ .name = "clock", .type = QEMU_OPT_STRING, -#ifdef TARGET_I386 },{ .name = "driftfix", .type = QEMU_OPT_STRING, -#endif }, { /* end if list */ } }, diff --git a/qemu-doc.texi b/qemu-doc.texi index e67bf44ff..55a966fe7 100644 --- a/qemu-doc.texi +++ b/qemu-doc.texi @@ -706,6 +706,49 @@ Using the @option{-net socket} option, it is possible to make VLANs that span several QEMU instances. See @ref{sec_invocation} to have a basic example. +@section Other Devices + +@subsection Inter-VM Shared Memory device + +With KVM enabled on a Linux host, a shared memory device is available. Guests +map a POSIX shared memory region into the guest as a PCI device that enables +zero-copy communication to the application level of the guests. The basic +syntax is: + +@example +qemu -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>] +@end example + +If desired, interrupts can be sent between guest VMs accessing the same shared +memory region. Interrupt support requires using a shared memory server and +using a chardev socket to connect to it. The code for the shared memory server +is qemu.git/contrib/ivshmem-server. An example syntax when using the shared +memory server is: + +@example +qemu -device ivshmem,size=<size in format accepted by -m>[,chardev=<id>] + [,msi=on][,ioeventfd=on][,vectors=n][,role=peer|master] +qemu -chardev socket,path=<path>,id=<id> +@end example + +When using the server, the guest will be assigned a VM ID (>=0) that allows guests +using the same server to communicate via interrupts. Guests can read their +VM ID from a device register (see example code). Since receiving the shared +memory region from the server is asynchronous, there is a (small) chance the +guest may boot before the shared memory is attached. To allow an application +to ensure shared memory is attached, the VM ID register will return -1 (an +invalid VM ID) until the memory is attached. Once the shared memory is +attached, the VM ID will return the guest's valid VM ID. With these semantics, +the guest application can check to ensure the shared memory is attached to the +guest before proceeding. + +The @option{role} argument can be set to either master or peer and will affect +how the shared memory is migrated. With @option{role=master}, the guest will +copy the shared memory on migration to the destination host. With +@option{role=peer}, the guest will not be able to migrate with the device attached. +With the @option{peer} case, the device should be detached and then reattached +after migration using the PCI hotplug support. + @node direct_linux_boot @section Direct Linux Boot @@ -1041,6 +1041,7 @@ typedef struct SaveStateEntry { const VMStateDescription *vmsd; void *opaque; CompatEntry *compat; + int no_migrate; } SaveStateEntry; @@ -1104,6 +1105,7 @@ int register_savevm_live(DeviceState *dev, se->load_state = load_state; se->opaque = opaque; se->vmsd = NULL; + se->no_migrate = 0; if (dev && dev->parent_bus && dev->parent_bus->info->get_dev_path) { char *id = dev->parent_bus->info->get_dev_path(dev); @@ -1170,6 +1172,31 @@ void unregister_savevm(DeviceState *dev, const char *idstr, void *opaque) } } +/* mark a device as not to be migrated, that is the device should be + unplugged before migration */ +void register_device_unmigratable(DeviceState *dev, const char *idstr, + void *opaque) +{ + SaveStateEntry *se; + char id[256] = ""; + + if (dev && dev->parent_bus && dev->parent_bus->info->get_dev_path) { + char *path = dev->parent_bus->info->get_dev_path(dev); + if (path) { + pstrcpy(id, sizeof(id), path); + pstrcat(id, sizeof(id), "/"); + qemu_free(path); + } + } + pstrcat(id, sizeof(id), idstr); + + QTAILQ_FOREACH(se, &savevm_handlers, entry) { + if (strcmp(se->idstr, id) == 0 && se->opaque == opaque) { + se->no_migrate = 1; + } + } +} + int vmstate_register_with_alias_id(DeviceState *dev, int instance_id, const VMStateDescription *vmsd, void *opaque, int alias_id, @@ -1376,13 +1403,19 @@ static int vmstate_load(QEMUFile *f, SaveStateEntry *se, int version_id) return vmstate_load_state(f, se->vmsd, se->opaque, version_id); } -static void vmstate_save(QEMUFile *f, SaveStateEntry *se) +static int vmstate_save(QEMUFile *f, SaveStateEntry *se) { + if (se->no_migrate) { + return -1; + } + if (!se->vmsd) { /* Old style */ se->save_state(f, se->opaque); - return; + return 0; } vmstate_save_state(f,se->vmsd, se->opaque); + + return 0; } #define QEMU_VM_FILE_MAGIC 0x5145564d @@ -1477,6 +1510,7 @@ int qemu_savevm_state_iterate(Monitor *mon, QEMUFile *f) int qemu_savevm_state_complete(Monitor *mon, QEMUFile *f) { SaveStateEntry *se; + int r; cpu_synchronize_all_states(); @@ -1509,7 +1543,11 @@ int qemu_savevm_state_complete(Monitor *mon, QEMUFile *f) qemu_put_be32(f, se->instance_id); qemu_put_be32(f, se->version_id); - vmstate_save(f, se); + r = vmstate_save(f, se); + if (r < 0) { + monitor_printf(mon, "cannot migrate with device '%s'\n", se->idstr); + return r; + } } qemu_put_byte(f, QEMU_VM_EOF); diff --git a/target-i386/cpuid.c b/target-i386/cpuid.c index c7ab1055c..8b2149d97 100644 --- a/target-i386/cpuid.c +++ b/target-i386/cpuid.c @@ -543,7 +543,7 @@ static int check_features_against_host(x86_def_t *guest_def) ~CPUID_EXT3_SVM, ext3_feature_name, 0x80000001}}; cpu_x86_fill_host(&host_def); - for (rv = 0, i = 0; i < sizeof (ft) / sizeof (ft[0]); ++i) + for (rv = 0, i = 0; i < ARRAY_SIZE(ft); ++i) for (mask = 1; mask; mask <<= 1) if (ft[i].check_feat & mask && *ft[i].guest_feat & mask && !(*ft[i].host_feat & mask)) { diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h index 53024282d..a1f85997c 100644 --- a/tcg/ppc/tcg-target.h +++ b/tcg/ppc/tcg-target.h @@ -65,7 +65,7 @@ enum { /* used for function call generation */ #define TCG_REG_CALL_STACK TCG_REG_R1 #define TCG_TARGET_STACK_ALIGN 16 -#if defined _CALL_DARWIN +#if defined _CALL_DARWIN || defined __APPLE__ #define TCG_TARGET_CALL_STACK_OFFSET 24 #elif defined _CALL_AIX #define TCG_TARGET_CALL_STACK_OFFSET 52 diff --git a/tcg/ppc64/tcg-target.c b/tcg/ppc64/tcg-target.c index 5ba5d053b..ebbee343f 100644 --- a/tcg/ppc64/tcg-target.c +++ b/tcg/ppc64/tcg-target.c @@ -746,7 +746,7 @@ static void tcg_out_qemu_ld (TCGContext *s, const TCGArg *args, int opc) else tcg_out32 (s, LDX | TAB (data_reg, rbase, r0)); #else if (bswap) { - tcg_out_movi32 (s, TCG_TYPE_I64, 0, 4); + tcg_out_movi32 (s, 0, 4); tcg_out32 (s, LWBRX | RT (data_reg) | RB (r0)); tcg_out32 (s, LWBRX | RT ( r1) | RA (r0)); tcg_out_rld (s, RLDIMI, data_reg, r1, 32, 0); @@ -21,6 +21,10 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ + +/* Avoid compiler warning because macro is redefined in SDL_syswm.h. */ +#undef WIN32_LEAN_AND_MEAN + #include <SDL.h> #include <SDL_syswm.h> @@ -2344,12 +2344,6 @@ int main(int argc, char **argv, char **envp) len += strlen(qemu_opt_get(opts, "security_model")); arg_fsdev = qemu_malloc((len + 1) * sizeof(*arg_fsdev)); - if (!arg_fsdev) { - fprintf(stderr, "No memory to parse -fsdev for %s\n", - optarg); - exit(1); - } - sprintf(arg_fsdev, "%s,id=%s,path=%s,security_model=%s", qemu_opt_get(opts, "fstype"), qemu_opt_get(opts, "mount_tag"), @@ -2360,12 +2354,6 @@ int main(int argc, char **argv, char **envp) len += 2*strlen(qemu_opt_get(opts, "mount_tag")); arg_9p = qemu_malloc((len + 1) * sizeof(*arg_9p)); - if (!arg_9p) { - fprintf(stderr, "No memory to parse -device for %s\n", - optarg); - exit(1); - } - sprintf(arg_9p, "virtio-9p-pci,fsdev=%s,mount_tag=%s", qemu_opt_get(opts, "mount_tag"), qemu_opt_get(opts, "mount_tag")); |