diff options
-rw-r--r-- | Makefile | 2 | ||||
-rw-r--r-- | block.c | 2 | ||||
-rw-r--r-- | block/io.c | 2 | ||||
-rw-r--r-- | docs/devel/index.rst | 1 | ||||
-rw-r--r-- | docs/devel/secure-coding-practices.rst | 106 | ||||
-rw-r--r-- | docs/security.texi | 131 | ||||
-rw-r--r-- | qemu-doc.texi | 3 | ||||
-rw-r--r-- | util/aio-posix.c | 12 | ||||
-rw-r--r-- | util/readline.c | 174 |
9 files changed, 347 insertions, 86 deletions
@@ -976,7 +976,7 @@ qemu-doc.html qemu-doc.info qemu-doc.pdf qemu-doc.txt: \ qemu-img.texi qemu-nbd.texi qemu-options.texi qemu-option-trace.texi \ qemu-deprecated.texi qemu-monitor.texi qemu-img-cmds.texi qemu-ga.texi \ qemu-monitor-info.texi docs/qemu-block-drivers.texi \ - docs/qemu-cpu-models.texi + docs/qemu-cpu-models.texi docs/security.texi docs/interop/qemu-ga-ref.dvi docs/interop/qemu-ga-ref.html \ docs/interop/qemu-ga-ref.info docs/interop/qemu-ga-ref.pdf \ @@ -4121,7 +4121,7 @@ typedef struct CheckCo { int ret; } CheckCo; -static void bdrv_check_co_entry(void *opaque) +static void coroutine_fn bdrv_check_co_entry(void *opaque) { CheckCo *cco = opaque; cco->ret = bdrv_co_check(cco->bs, cco->res, cco->fix); diff --git a/block/io.c b/block/io.c index dfc153b8d8..0412a51314 100644 --- a/block/io.c +++ b/block/io.c @@ -1516,7 +1516,7 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, assert(!bs->supported_zero_flags); } - if (ret == -ENOTSUP && !(flags & BDRV_REQ_NO_FALLBACK)) { + if (ret < 0 && !(flags & BDRV_REQ_NO_FALLBACK)) { /* Fall back to bounce buffer if write zeroes is unsupported */ BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE; diff --git a/docs/devel/index.rst b/docs/devel/index.rst index ebbab636ce..2a4ddf40ad 100644 --- a/docs/devel/index.rst +++ b/docs/devel/index.rst @@ -20,3 +20,4 @@ Contents: stable-process testing decodetree + secure-coding-practices diff --git a/docs/devel/secure-coding-practices.rst b/docs/devel/secure-coding-practices.rst new file mode 100644 index 0000000000..cbfc8af67e --- /dev/null +++ b/docs/devel/secure-coding-practices.rst @@ -0,0 +1,106 @@ +======================= +Secure Coding Practices +======================= +This document covers topics that both developers and security researchers must +be aware of so that they can develop safe code and audit existing code +properly. + +Reporting Security Bugs +----------------------- +For details on how to report security bugs or ask questions about potential +security bugs, see the `Security Process wiki page +<https://wiki.qemu.org/SecurityProcess>`_. + +General Secure C Coding Practices +--------------------------------- +Most CVEs (security bugs) reported against QEMU are not specific to +virtualization or emulation. They are simply C programming bugs. Therefore +it's critical to be aware of common classes of security bugs. + +There is a wide selection of resources available covering secure C coding. For +example, the `CERT C Coding Standard +<https://wiki.sei.cmu.edu/confluence/display/c/SEI+CERT+C+Coding+Standard>`_ +covers the most important classes of security bugs. + +Instead of describing them in detail here, only the names of the most important +classes of security bugs are mentioned: + +* Buffer overflows +* Use-after-free and double-free +* Integer overflows +* Format string vulnerabilities + +Some of these classes of bugs can be detected by analyzers. Static analysis is +performed regularly by Coverity and the most obvious of these bugs are even +reported by compilers. Dynamic analysis is possible with valgrind, tsan, and +asan. + +Input Validation +---------------- +Inputs from the guest or external sources (e.g. network, files) cannot be +trusted and may be invalid. Inputs must be checked before using them in a way +that could crash the program, expose host memory to the guest, or otherwise be +exploitable by an attacker. + +The most sensitive attack surface is device emulation. All hardware register +accesses and data read from guest memory must be validated. A typical example +is a device that contains multiple units that are selectable by the guest via +an index register:: + + typedef struct { + ProcessingUnit unit[2]; + ... + } MyDeviceState; + + static void mydev_writel(void *opaque, uint32_t addr, uint32_t val) + { + MyDeviceState *mydev = opaque; + ProcessingUnit *unit; + + switch (addr) { + case MYDEV_SELECT_UNIT: + unit = &mydev->unit[val]; <-- this input wasn't validated! + ... + } + } + +If ``val`` is not in range [0, 1] then an out-of-bounds memory access will take +place when ``unit`` is dereferenced. The code must check that ``val`` is 0 or +1 and handle the case where it is invalid. + +Unexpected Device Accesses +-------------------------- +The guest may access device registers in unusual orders or at unexpected +moments. Device emulation code must not assume that the guest follows the +typical "theory of operation" presented in driver writer manuals. The guest +may make nonsense accesses to device registers such as starting operations +before the device has been fully initialized. + +A related issue is that device emulation code must be prepared for unexpected +device register accesses while asynchronous operations are in progress. A +well-behaved guest might wait for a completion interrupt before accessing +certain device registers. Device emulation code must handle the case where the +guest overwrites registers or submits further requests before an ongoing +request completes. Unexpected accesses must not cause memory corruption or +leaks in QEMU. + +Invalid device register accesses can be reported with +``qemu_log_mask(LOG_GUEST_ERROR, ...)``. The ``-d guest_errors`` command-line +option enables these log messages. + +Live Migration +-------------- +Device state can be saved to disk image files and shared with other users. +Live migration code must validate inputs when loading device state so an +attacker cannot gain control by crafting invalid device states. Device state +is therefore considered untrusted even though it is typically generated by QEMU +itself. + +Guest Memory Access Races +------------------------- +Guests with multiple vCPUs may modify guest RAM while device emulation code is +running. Device emulation code must copy in descriptors and other guest RAM +structures and only process the local copy. This prevents +time-of-check-to-time-of-use (TOCTOU) race conditions that could cause QEMU to +crash when a vCPU thread modifies guest RAM while device emulation is +processing it. diff --git a/docs/security.texi b/docs/security.texi new file mode 100644 index 0000000000..927764f1e6 --- /dev/null +++ b/docs/security.texi @@ -0,0 +1,131 @@ +@node Security +@chapter Security + +@section Overview + +This chapter explains the security requirements that QEMU is designed to meet +and principles for securely deploying QEMU. + +@section Security Requirements + +QEMU supports many different use cases, some of which have stricter security +requirements than others. The community has agreed on the overall security +requirements that users may depend on. These requirements define what is +considered supported from a security perspective. + +@subsection Virtualization Use Case + +The virtualization use case covers cloud and virtual private server (VPS) +hosting, as well as traditional data center and desktop virtualization. These +use cases rely on hardware virtualization extensions to execute guest code +safely on the physical CPU at close-to-native speed. + +The following entities are untrusted, meaning that they may be buggy or +malicious: + +@itemize +@item Guest +@item User-facing interfaces (e.g. VNC, SPICE, WebSocket) +@item Network protocols (e.g. NBD, live migration) +@item User-supplied files (e.g. disk images, kernels, device trees) +@item Passthrough devices (e.g. PCI, USB) +@end itemize + +Bugs affecting these entities are evaluated on whether they can cause damage in +real-world use cases and treated as security bugs if this is the case. + +@subsection Non-virtualization Use Case + +The non-virtualization use case covers emulation using the Tiny Code Generator +(TCG). In principle the TCG and device emulation code used in conjunction with +the non-virtualization use case should meet the same security requirements as +the virtualization use case. However, for historical reasons much of the +non-virtualization use case code was not written with these security +requirements in mind. + +Bugs affecting the non-virtualization use case are not considered security +bugs at this time. Users with non-virtualization use cases must not rely on +QEMU to provide guest isolation or any security guarantees. + +@section Architecture + +This section describes the design principles that ensure the security +requirements are met. + +@subsection Guest Isolation + +Guest isolation is the confinement of guest code to the virtual machine. When +guest code gains control of execution on the host this is called escaping the +virtual machine. Isolation also includes resource limits such as throttling of +CPU, memory, disk, or network. Guests must be unable to exceed their resource +limits. + +QEMU presents an attack surface to the guest in the form of emulated devices. +The guest must not be able to gain control of QEMU. Bugs in emulated devices +could allow malicious guests to gain code execution in QEMU. At this point the +guest has escaped the virtual machine and is able to act in the context of the +QEMU process on the host. + +Guests often interact with other guests and share resources with them. A +malicious guest must not gain control of other guests or access their data. +Disk image files and network traffic must be protected from other guests unless +explicitly shared between them by the user. + +@subsection Principle of Least Privilege + +The principle of least privilege states that each component only has access to +the privileges necessary for its function. In the case of QEMU this means that +each process only has access to resources belonging to the guest. + +The QEMU process should not have access to any resources that are inaccessible +to the guest. This way the guest does not gain anything by escaping into the +QEMU process since it already has access to those same resources from within +the guest. + +Following the principle of least privilege immediately fulfills guest isolation +requirements. For example, guest A only has access to its own disk image file +@code{a.img} and not guest B's disk image file @code{b.img}. + +In reality certain resources are inaccessible to the guest but must be +available to QEMU to perform its function. For example, host system calls are +necessary for QEMU but are not exposed to guests. A guest that escapes into +the QEMU process can then begin invoking host system calls. + +New features must be designed to follow the principle of least privilege. +Should this not be possible for technical reasons, the security risk must be +clearly documented so users are aware of the trade-off of enabling the feature. + +@subsection Isolation mechanisms + +Several isolation mechanisms are available to realize this architecture of +guest isolation and the principle of least privilege. With the exception of +Linux seccomp, these mechanisms are all deployed by management tools that +launch QEMU, such as libvirt. They are also platform-specific so they are only +described briefly for Linux here. + +The fundamental isolation mechanism is that QEMU processes must run as +unprivileged users. Sometimes it seems more convenient to launch QEMU as +root to give it access to host devices (e.g. @code{/dev/net/tun}) but this poses a +huge security risk. File descriptor passing can be used to give an otherwise +unprivileged QEMU process access to host devices without running QEMU as root. +It is also possible to launch QEMU as a non-root user and configure UNIX groups +for access to @code{/dev/kvm}, @code{/dev/net/tun}, and other device nodes. +Some Linux distros already ship with UNIX groups for these devices by default. + +@itemize +@item SELinux and AppArmor make it possible to confine processes beyond the +traditional UNIX process and file permissions model. They restrict the QEMU +process from accessing processes and files on the host system that are not +needed by QEMU. + +@item Resource limits and cgroup controllers provide throughput and utilization +limits on key resources such as CPU time, memory, and I/O bandwidth. + +@item Linux namespaces can be used to make process, file system, and other system +resources unavailable to QEMU. A namespaced QEMU process is restricted to only +those resources that were granted to it. + +@item Linux seccomp is available via the QEMU @option{--sandbox} option. It disables +system calls that are not needed by QEMU, thereby reducing the host kernel +attack surface. +@end itemize diff --git a/qemu-doc.texi b/qemu-doc.texi index ae3c3f9632..577d1e8376 100644 --- a/qemu-doc.texi +++ b/qemu-doc.texi @@ -38,6 +38,7 @@ * QEMU Guest Agent:: * QEMU User space emulator:: * System requirements:: +* Security:: * Implementation notes:: * Deprecated features:: * Supported build platforms:: @@ -2878,6 +2879,8 @@ added with Linux 4.5 which is supported by the major distros. And even if RHEL7 has kernel 3.10, KVM there has the required functionality there to make it close to a 4.5 or newer kernel. +@include docs/security.texi + @include qemu-tech.texi @include qemu-deprecated.texi diff --git a/util/aio-posix.c b/util/aio-posix.c index 6fbfa7924f..db11021287 100644 --- a/util/aio-posix.c +++ b/util/aio-posix.c @@ -519,6 +519,10 @@ static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout) if (!node->deleted && node->io_poll && aio_node_check(ctx, node->is_external) && node->io_poll(node->opaque)) { + /* + * Polling was successful, exit try_poll_mode immediately + * to adjust the next polling time. + */ *timeout = 0; if (node->opaque != &ctx->notifier) { progress = true; @@ -558,8 +562,9 @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout) do { progress = run_poll_handlers_once(ctx, timeout); elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time; - } while (!progress && elapsed_time < max_ns - && !atomic_read(&ctx->poll_disable_cnt)); + max_ns = qemu_soonest_timeout(*timeout, max_ns); + assert(!(max_ns && progress)); + } while (elapsed_time < max_ns && !atomic_read(&ctx->poll_disable_cnt)); /* If time has passed with no successful polling, adjust *timeout to * keep the same ending time. @@ -585,8 +590,7 @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout) */ static bool try_poll_mode(AioContext *ctx, int64_t *timeout) { - /* See qemu_soonest_timeout() uint64_t hack */ - int64_t max_ns = MIN((uint64_t)*timeout, (uint64_t)ctx->poll_ns); + int64_t max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns); if (max_ns && !atomic_read(&ctx->poll_disable_cnt)) { poll_set_started(ctx, true); diff --git a/util/readline.c b/util/readline.c index ec91ee0fea..a7672b51c1 100644 --- a/util/readline.c +++ b/util/readline.c @@ -48,14 +48,15 @@ static void readline_update(ReadLineState *rs) if (rs->cmd_buf_size != rs->last_cmd_buf_size || memcmp(rs->cmd_buf, rs->last_cmd_buf, rs->cmd_buf_size) != 0) { - for(i = 0; i < rs->last_cmd_buf_index; i++) { + for (i = 0; i < rs->last_cmd_buf_index; i++) { rs->printf_func(rs->opaque, "\033[D"); } rs->cmd_buf[rs->cmd_buf_size] = '\0'; if (rs->read_password) { len = strlen(rs->cmd_buf); - for(i = 0; i < len; i++) + for (i = 0; i < len; i++) { rs->printf_func(rs->opaque, "*"); + } } else { rs->printf_func(rs->opaque, "%s", rs->cmd_buf); } @@ -67,12 +68,12 @@ static void readline_update(ReadLineState *rs) if (rs->cmd_buf_index != rs->last_cmd_buf_index) { delta = rs->cmd_buf_index - rs->last_cmd_buf_index; if (delta > 0) { - for(i = 0;i < delta; i++) { + for (i = 0; i < delta; i++) { rs->printf_func(rs->opaque, "\033[C"); } } else { delta = -delta; - for(i = 0;i < delta; i++) { + for (i = 0; i < delta; i++) { rs->printf_func(rs->opaque, "\033[D"); } } @@ -178,35 +179,38 @@ static void readline_up_char(ReadLineState *rs) { int idx; - if (rs->hist_entry == 0) - return; + if (rs->hist_entry == 0) { + return; + } if (rs->hist_entry == -1) { - /* Find latest entry */ - for (idx = 0; idx < READLINE_MAX_CMDS; idx++) { - if (rs->history[idx] == NULL) - break; - } - rs->hist_entry = idx; + /* Find latest entry */ + for (idx = 0; idx < READLINE_MAX_CMDS; idx++) { + if (rs->history[idx] == NULL) { + break; + } + } + rs->hist_entry = idx; } rs->hist_entry--; if (rs->hist_entry >= 0) { - pstrcpy(rs->cmd_buf, sizeof(rs->cmd_buf), + pstrcpy(rs->cmd_buf, sizeof(rs->cmd_buf), rs->history[rs->hist_entry]); - rs->cmd_buf_index = rs->cmd_buf_size = strlen(rs->cmd_buf); + rs->cmd_buf_index = rs->cmd_buf_size = strlen(rs->cmd_buf); } } static void readline_down_char(ReadLineState *rs) { - if (rs->hist_entry == -1) + if (rs->hist_entry == -1) { return; + } if (rs->hist_entry < READLINE_MAX_CMDS - 1 && rs->history[++rs->hist_entry] != NULL) { - pstrcpy(rs->cmd_buf, sizeof(rs->cmd_buf), + pstrcpy(rs->cmd_buf, sizeof(rs->cmd_buf), rs->history[rs->hist_entry]); } else { rs->cmd_buf[0] = 0; - rs->hist_entry = -1; + rs->hist_entry = -1; } rs->cmd_buf_index = rs->cmd_buf_size = strlen(rs->cmd_buf); } @@ -216,46 +220,50 @@ static void readline_hist_add(ReadLineState *rs, const char *cmdline) char *hist_entry, *new_entry; int idx; - if (cmdline[0] == '\0') - return; + if (cmdline[0] == '\0') { + return; + } new_entry = NULL; if (rs->hist_entry != -1) { - /* We were editing an existing history entry: replace it */ - hist_entry = rs->history[rs->hist_entry]; - idx = rs->hist_entry; - if (strcmp(hist_entry, cmdline) == 0) { - goto same_entry; - } + /* We were editing an existing history entry: replace it */ + hist_entry = rs->history[rs->hist_entry]; + idx = rs->hist_entry; + if (strcmp(hist_entry, cmdline) == 0) { + goto same_entry; + } } /* Search cmdline in history buffers */ for (idx = 0; idx < READLINE_MAX_CMDS; idx++) { - hist_entry = rs->history[idx]; - if (hist_entry == NULL) - break; - if (strcmp(hist_entry, cmdline) == 0) { - same_entry: - new_entry = hist_entry; - /* Put this entry at the end of history */ - memmove(&rs->history[idx], &rs->history[idx + 1], - (READLINE_MAX_CMDS - (idx + 1)) * sizeof(char *)); - rs->history[READLINE_MAX_CMDS - 1] = NULL; - for (; idx < READLINE_MAX_CMDS; idx++) { - if (rs->history[idx] == NULL) - break; - } - break; - } + hist_entry = rs->history[idx]; + if (hist_entry == NULL) { + break; + } + if (strcmp(hist_entry, cmdline) == 0) { + same_entry: + new_entry = hist_entry; + /* Put this entry at the end of history */ + memmove(&rs->history[idx], &rs->history[idx + 1], + (READLINE_MAX_CMDS - (idx + 1)) * sizeof(char *)); + rs->history[READLINE_MAX_CMDS - 1] = NULL; + for (; idx < READLINE_MAX_CMDS; idx++) { + if (rs->history[idx] == NULL) { + break; + } + } + break; + } } if (idx == READLINE_MAX_CMDS) { - /* Need to get one free slot */ + /* Need to get one free slot */ g_free(rs->history[0]); - memmove(rs->history, &rs->history[1], - (READLINE_MAX_CMDS - 1) * sizeof(char *)); - rs->history[READLINE_MAX_CMDS - 1] = NULL; - idx = READLINE_MAX_CMDS - 1; + memmove(rs->history, &rs->history[1], + (READLINE_MAX_CMDS - 1) * sizeof(char *)); + rs->history[READLINE_MAX_CMDS - 1] = NULL; + idx = READLINE_MAX_CMDS - 1; } - if (new_entry == NULL) + if (new_entry == NULL) { new_entry = g_strdup(cmdline); + } rs->history[idx] = new_entry; rs->hist_entry = -1; } @@ -297,49 +305,55 @@ static void readline_completion(ReadLineState *rs) g_free(cmdline); /* no completion found */ - if (rs->nb_completions <= 0) + if (rs->nb_completions <= 0) { return; + } if (rs->nb_completions == 1) { len = strlen(rs->completions[0]); - for(i = rs->completion_index; i < len; i++) { + for (i = rs->completion_index; i < len; i++) { readline_insert_char(rs, rs->completions[0][i]); } /* extra space for next argument. XXX: make it more generic */ - if (len > 0 && rs->completions[0][len - 1] != '/') + if (len > 0 && rs->completions[0][len - 1] != '/') { readline_insert_char(rs, ' '); + } } else { qsort(rs->completions, rs->nb_completions, sizeof(char *), completion_comp); rs->printf_func(rs->opaque, "\n"); max_width = 0; - max_prefix = 0; - for(i = 0; i < rs->nb_completions; i++) { + max_prefix = 0; + for (i = 0; i < rs->nb_completions; i++) { len = strlen(rs->completions[i]); - if (i==0) { + if (i == 0) { max_prefix = len; } else { - if (len < max_prefix) + if (len < max_prefix) { max_prefix = len; - for(j=0; j<max_prefix; j++) { - if (rs->completions[i][j] != rs->completions[0][j]) + } + for (j = 0; j < max_prefix; j++) { + if (rs->completions[i][j] != rs->completions[0][j]) { max_prefix = j; + } } } - if (len > max_width) + if (len > max_width) { max_width = len; + } } - if (max_prefix > 0) - for(i = rs->completion_index; i < max_prefix; i++) { + if (max_prefix > 0) + for (i = rs->completion_index; i < max_prefix; i++) { readline_insert_char(rs, rs->completions[0][i]); } max_width += 2; - if (max_width < 10) + if (max_width < 10) { max_width = 10; - else if (max_width > 80) + } else if (max_width > 80) { max_width = 80; + } nb_cols = 80 / max_width; j = 0; - for(i = 0; i < rs->nb_completions; i++) { + for (i = 0; i < rs->nb_completions; i++) { rs->printf_func(rs->opaque, "%-*s", max_width, rs->completions[i]); if (++j == nb_cols || i == (rs->nb_completions - 1)) { rs->printf_func(rs->opaque, "\n"); @@ -362,9 +376,9 @@ static void readline_clear_screen(ReadLineState *rs) /* return true if command handled */ void readline_handle_byte(ReadLineState *rs, int ch) { - switch(rs->esc_state) { + switch (rs->esc_state) { case IS_NORM: - switch(ch) { + switch (ch) { case 1: readline_bol(rs); break; @@ -383,8 +397,9 @@ void readline_handle_byte(ReadLineState *rs, int ch) case 10: case 13: rs->cmd_buf[rs->cmd_buf_size] = '\0'; - if (!rs->read_password) + if (!rs->read_password) { readline_hist_add(rs, rs->cmd_buf); + } rs->printf_func(rs->opaque, "\n"); rs->cmd_buf_index = 0; rs->cmd_buf_size = 0; @@ -403,9 +418,9 @@ void readline_handle_byte(ReadLineState *rs, int ch) case 8: readline_backspace(rs); break; - case 155: + case 155: rs->esc_state = IS_CSI; - break; + break; default: if (ch >= 32) { readline_insert_char(rs, ch); @@ -425,15 +440,15 @@ void readline_handle_byte(ReadLineState *rs, int ch) } break; case IS_CSI: - switch(ch) { - case 'A': - case 'F': - readline_up_char(rs); - break; - case 'B': - case 'E': - readline_down_char(rs); - break; + switch (ch) { + case 'A': + case 'F': + readline_up_char(rs); + break; + case 'B': + case 'E': + readline_down_char(rs); + break; case 'D': readline_backward_char(rs); break; @@ -444,7 +459,7 @@ void readline_handle_byte(ReadLineState *rs, int ch) rs->esc_param = rs->esc_param * 10 + (ch - '0'); goto the_end; case '~': - switch(rs->esc_param) { + switch (rs->esc_param) { case 1: readline_bol(rs); break; @@ -463,7 +478,7 @@ void readline_handle_byte(ReadLineState *rs, int ch) the_end: break; case IS_SS3: - switch(ch) { + switch (ch) { case 'F': readline_eol(rs); break; @@ -495,8 +510,9 @@ void readline_restart(ReadLineState *rs) const char *readline_get_history(ReadLineState *rs, unsigned int index) { - if (index >= READLINE_MAX_CMDS) + if (index >= READLINE_MAX_CMDS) { return NULL; + } return rs->history[index]; } |