From 4c37ef022381e777251d7084591978a4dc622efe Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 21 Jan 2013 17:09:39 +0100 Subject: host-utils: add ffsl We can provide fast versions based on the other functions defined by host-utils.h. Some care is required on glibc, which provides ffsl already. Reviewed-by: Eric Blake Signed-off-by: Paolo Bonzini Signed-off-by: Kevin Wolf --- include/qemu/host-utils.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include') diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h index 81c9a754ae..2a32be4cc0 100644 --- a/include/qemu/host-utils.h +++ b/include/qemu/host-utils.h @@ -26,6 +26,7 @@ #define HOST_UTILS_H 1 #include "qemu/compiler.h" /* QEMU_GNUC_PREREQ */ +#include /* ffsl */ #if defined(__x86_64__) #define __HAVE_FAST_MULU64__ @@ -237,4 +238,29 @@ static inline int ctpop64(uint64_t val) #endif } +/* glibc does not provide an inline version of ffsl, so always define + * ours. We need to give it a different name, however. + */ +#ifdef __GLIBC__ +#define ffsl qemu_ffsl +#endif +static inline int ffsl(long val) +{ + if (!val) { + return 0; + } + +#if QEMU_GNUC_PREREQ(3, 4) + return __builtin_ctzl(val) + 1; +#else + if (sizeof(long) == 4) { + return ctz32(val) + 1; + } else if (sizeof(long) == 8) { + return ctz64(val) + 1; + } else { + abort(); + } +#endif +} + #endif -- cgit v1.2.3 From e7c033c3fa22a1e42d9ba57fed6ddecfbce3a01c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 21 Jan 2013 17:09:40 +0100 Subject: add hierarchical bitmap data type and test cases HBitmaps provides an array of bits. The bits are stored as usual in an array of unsigned longs, but HBitmap is also optimized to provide fast iteration over set bits; going from one bit to the next is O(logB n) worst case, with B = sizeof(long) * CHAR_BIT: the result is low enough that the number of levels is in fact fixed. In order to do this, it stacks multiple bitmaps with progressively coarser granularity; in all levels except the last, bit N is set iff the N-th unsigned long is nonzero in the immediately next level. When iteration completes on the last level it can examine the 2nd-last level to quickly skip entire words, and even do so recursively to skip blocks of 64 words or powers thereof (32 on 32-bit machines). Given an index in the bitmap, it can be split in group of bits like this (for the 64-bit case): bits 0-57 => word in the last bitmap | bits 58-63 => bit in the word bits 0-51 => word in the 2nd-last bitmap | bits 52-57 => bit in the word bits 0-45 => word in the 3rd-last bitmap | bits 46-51 => bit in the word So it is easy to move up simply by shifting the index right by log2(BITS_PER_LONG) bits. To move down, you shift the index left similarly, and add the word index within the group. Iteration uses ffs (find first set bit) to find the next word to examine; this operation can be done in constant time in most current architectures. Setting or clearing a range of m bits on all levels, the work to perform is O(m + m/W + m/W^2 + ...), which is O(m) like on a regular bitmap. When iterating on a bitmap, each bit (on any level) is only visited once. Hence, The total cost of visiting a bitmap with m bits in it is the number of bits that are set in all bitmaps. Unless the bitmap is extremely sparse, this is also O(m + m/W + m/W^2 + ...), so the amortized cost of advancing from one bit to the next is usually constant. Reviewed-by: Laszlo Ersek Reviewed-by: Eric Blake Signed-off-by: Paolo Bonzini Signed-off-by: Kevin Wolf --- include/qemu/hbitmap.h | 207 +++++++++++++++++++++++++ tests/Makefile | 3 + tests/test-hbitmap.c | 408 +++++++++++++++++++++++++++++++++++++++++++++++++ trace-events | 5 + util/Makefile.objs | 2 +- util/hbitmap.c | 400 ++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 1024 insertions(+), 1 deletion(-) create mode 100644 include/qemu/hbitmap.h create mode 100644 tests/test-hbitmap.c create mode 100644 util/hbitmap.c (limited to 'include') diff --git a/include/qemu/hbitmap.h b/include/qemu/hbitmap.h new file mode 100644 index 0000000000..7ddfb66808 --- /dev/null +++ b/include/qemu/hbitmap.h @@ -0,0 +1,207 @@ +/* + * Hierarchical Bitmap Data Type + * + * Copyright Red Hat, Inc., 2012 + * + * Author: Paolo Bonzini + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + */ + +#ifndef HBITMAP_H +#define HBITMAP_H 1 + +#include +#include +#include +#include "bitops.h" + +typedef struct HBitmap HBitmap; +typedef struct HBitmapIter HBitmapIter; + +#define BITS_PER_LEVEL (BITS_PER_LONG == 32 ? 5 : 6) + +/* For 32-bit, the largest that fits in a 4 GiB address space. + * For 64-bit, the number of sectors in 1 PiB. Good luck, in + * either case... :) + */ +#define HBITMAP_LOG_MAX_SIZE (BITS_PER_LONG == 32 ? 34 : 41) + +/* We need to place a sentinel in level 0 to speed up iteration. Thus, + * we do this instead of HBITMAP_LOG_MAX_SIZE / BITS_PER_LEVEL. The + * difference is that it allocates an extra level when HBITMAP_LOG_MAX_SIZE + * is an exact multiple of BITS_PER_LEVEL. + */ +#define HBITMAP_LEVELS ((HBITMAP_LOG_MAX_SIZE / BITS_PER_LEVEL) + 1) + +struct HBitmapIter { + const HBitmap *hb; + + /* Copied from hb for access in the inline functions (hb is opaque). */ + int granularity; + + /* Entry offset into the last-level array of longs. */ + size_t pos; + + /* The currently-active path in the tree. Each item of cur[i] stores + * the bits (i.e. the subtrees) yet to be processed under that node. + */ + unsigned long cur[HBITMAP_LEVELS]; +}; + +/** + * hbitmap_alloc: + * @size: Number of bits in the bitmap. + * @granularity: Granularity of the bitmap. Aligned groups of 2^@granularity + * bits will be represented by a single bit. Each operation on a + * range of bits first rounds the bits to determine which group they land + * in, and then affect the entire set; iteration will only visit the first + * bit of each group. + * + * Allocate a new HBitmap. + */ +HBitmap *hbitmap_alloc(uint64_t size, int granularity); + +/** + * hbitmap_empty: + * @hb: HBitmap to operate on. + * + * Return whether the bitmap is empty. + */ +bool hbitmap_empty(const HBitmap *hb); + +/** + * hbitmap_granularity: + * @hb: HBitmap to operate on. + * + * Return the granularity of the HBitmap. + */ +int hbitmap_granularity(const HBitmap *hb); + +/** + * hbitmap_count: + * @hb: HBitmap to operate on. + * + * Return the number of bits set in the HBitmap. + */ +uint64_t hbitmap_count(const HBitmap *hb); + +/** + * hbitmap_set: + * @hb: HBitmap to operate on. + * @start: First bit to set (0-based). + * @count: Number of bits to set. + * + * Set a consecutive range of bits in an HBitmap. + */ +void hbitmap_set(HBitmap *hb, uint64_t start, uint64_t count); + +/** + * hbitmap_reset: + * @hb: HBitmap to operate on. + * @start: First bit to reset (0-based). + * @count: Number of bits to reset. + * + * Reset a consecutive range of bits in an HBitmap. + */ +void hbitmap_reset(HBitmap *hb, uint64_t start, uint64_t count); + +/** + * hbitmap_get: + * @hb: HBitmap to operate on. + * @item: Bit to query (0-based). + * + * Return whether the @item-th bit in an HBitmap is set. + */ +bool hbitmap_get(const HBitmap *hb, uint64_t item); + +/** + * hbitmap_free: + * @hb: HBitmap to operate on. + * + * Free an HBitmap and all of its associated memory. + */ +void hbitmap_free(HBitmap *hb); + +/** + * hbitmap_iter_init: + * @hbi: HBitmapIter to initialize. + * @hb: HBitmap to iterate on. + * @first: First bit to visit (0-based). + * + * Set up @hbi to iterate on the HBitmap @hb. hbitmap_iter_next will return + * the lowest-numbered bit that is set in @hb, starting at @first. + * + * Concurrent setting of bits is acceptable, and will at worst cause the + * iteration to miss some of those bits. Resetting bits before the current + * position of the iterator is also okay. However, concurrent resetting of + * bits can lead to unexpected behavior if the iterator has not yet reached + * those bits. + */ +void hbitmap_iter_init(HBitmapIter *hbi, const HBitmap *hb, uint64_t first); + +/* hbitmap_iter_skip_words: + * @hbi: HBitmapIter to operate on. + * + * Internal function used by hbitmap_iter_next and hbitmap_iter_next_word. + */ +unsigned long hbitmap_iter_skip_words(HBitmapIter *hbi); + +/** + * hbitmap_iter_next: + * @hbi: HBitmapIter to operate on. + * + * Return the next bit that is set in @hbi's associated HBitmap, + * or -1 if all remaining bits are zero. + */ +static inline int64_t hbitmap_iter_next(HBitmapIter *hbi) +{ + unsigned long cur = hbi->cur[HBITMAP_LEVELS - 1]; + int64_t item; + + if (cur == 0) { + cur = hbitmap_iter_skip_words(hbi); + if (cur == 0) { + return -1; + } + } + + /* The next call will resume work from the next bit. */ + hbi->cur[HBITMAP_LEVELS - 1] = cur & (cur - 1); + item = ((uint64_t)hbi->pos << BITS_PER_LEVEL) + ffsl(cur) - 1; + + return item << hbi->granularity; +} + +/** + * hbitmap_iter_next_word: + * @hbi: HBitmapIter to operate on. + * @p_cur: Location where to store the next non-zero word. + * + * Return the index of the next nonzero word that is set in @hbi's + * associated HBitmap, and set *p_cur to the content of that word + * (bits before the index that was passed to hbitmap_iter_init are + * trimmed on the first call). Return -1, and set *p_cur to zero, + * if all remaining words are zero. + */ +static inline size_t hbitmap_iter_next_word(HBitmapIter *hbi, unsigned long *p_cur) +{ + unsigned long cur = hbi->cur[HBITMAP_LEVELS - 1]; + + if (cur == 0) { + cur = hbitmap_iter_skip_words(hbi); + if (cur == 0) { + *p_cur = 0; + return -1; + } + } + + /* The next call will resume work from the next word. */ + hbi->cur[HBITMAP_LEVELS - 1] = 0; + *p_cur = cur; + return hbi->pos; +} + + +#endif diff --git a/tests/Makefile b/tests/Makefile index d86e95a400..b3a6d8697b 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -45,6 +45,8 @@ gcov-files-test-aio-$(CONFIG_WIN32) = aio-win32.c gcov-files-test-aio-$(CONFIG_POSIX) = aio-posix.c check-unit-y += tests/test-thread-pool$(EXESUF) gcov-files-test-thread-pool-y = thread-pool.c +gcov-files-test-hbitmap-y = util/hbitmap.c +check-unit-y += tests/test-hbitmap$(EXESUF) check-block-$(CONFIG_POSIX) += tests/qemu-iotests-quick.sh @@ -86,6 +88,7 @@ tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(block-obj-y) libqemuutil tests/test-aio$(EXESUF): tests/test-aio.o $(block-obj-y) libqemuutil.a libqemustub.a tests/test-thread-pool$(EXESUF): tests/test-thread-pool.o $(block-obj-y) libqemuutil.a libqemustub.a tests/test-iov$(EXESUF): tests/test-iov.o libqemuutil.a +tests/test-hbitmap$(EXESUF): tests/test-hbitmap.o libqemuutil.a libqemustub.a tests/test-qapi-types.c tests/test-qapi-types.h :\ $(SRC_PATH)/qapi-schema-test.json $(SRC_PATH)/scripts/qapi-types.py diff --git a/tests/test-hbitmap.c b/tests/test-hbitmap.c new file mode 100644 index 0000000000..fcc6a00fc3 --- /dev/null +++ b/tests/test-hbitmap.c @@ -0,0 +1,408 @@ +/* + * Hierarchical bitmap unit-tests. + * + * Copyright (C) 2012 Red Hat Inc. + * + * Author: Paolo Bonzini + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include +#include +#include "qemu/hbitmap.h" + +#define LOG_BITS_PER_LONG (BITS_PER_LONG == 32 ? 5 : 6) + +#define L1 BITS_PER_LONG +#define L2 (BITS_PER_LONG * L1) +#define L3 (BITS_PER_LONG * L2) + +typedef struct TestHBitmapData { + HBitmap *hb; + unsigned long *bits; + size_t size; + int granularity; +} TestHBitmapData; + + +/* Check that the HBitmap and the shadow bitmap contain the same data, + * ignoring the same "first" bits. + */ +static void hbitmap_test_check(TestHBitmapData *data, + uint64_t first) +{ + uint64_t count = 0; + size_t pos; + int bit; + HBitmapIter hbi; + int64_t i, next; + + hbitmap_iter_init(&hbi, data->hb, first); + + i = first; + for (;;) { + next = hbitmap_iter_next(&hbi); + if (next < 0) { + next = data->size; + } + + while (i < next) { + pos = i >> LOG_BITS_PER_LONG; + bit = i & (BITS_PER_LONG - 1); + i++; + g_assert_cmpint(data->bits[pos] & (1UL << bit), ==, 0); + } + + if (next == data->size) { + break; + } + + pos = i >> LOG_BITS_PER_LONG; + bit = i & (BITS_PER_LONG - 1); + i++; + count++; + g_assert_cmpint(data->bits[pos] & (1UL << bit), !=, 0); + } + + if (first == 0) { + g_assert_cmpint(count << data->granularity, ==, hbitmap_count(data->hb)); + } +} + +/* This is provided instead of a test setup function so that the sizes + are kept in the test functions (and not in main()) */ +static void hbitmap_test_init(TestHBitmapData *data, + uint64_t size, int granularity) +{ + size_t n; + data->hb = hbitmap_alloc(size, granularity); + + n = (size + BITS_PER_LONG - 1) / BITS_PER_LONG; + if (n == 0) { + n = 1; + } + data->bits = g_new0(unsigned long, n); + data->size = size; + data->granularity = granularity; + hbitmap_test_check(data, 0); +} + +static void hbitmap_test_teardown(TestHBitmapData *data, + const void *unused) +{ + if (data->hb) { + hbitmap_free(data->hb); + data->hb = NULL; + } + if (data->bits) { + g_free(data->bits); + data->bits = NULL; + } +} + +/* Set a range in the HBitmap and in the shadow "simple" bitmap. + * The two bitmaps are then tested against each other. + */ +static void hbitmap_test_set(TestHBitmapData *data, + uint64_t first, uint64_t count) +{ + hbitmap_set(data->hb, first, count); + while (count-- != 0) { + size_t pos = first >> LOG_BITS_PER_LONG; + int bit = first & (BITS_PER_LONG - 1); + first++; + + data->bits[pos] |= 1UL << bit; + } + + if (data->granularity == 0) { + hbitmap_test_check(data, 0); + } +} + +/* Reset a range in the HBitmap and in the shadow "simple" bitmap. + */ +static void hbitmap_test_reset(TestHBitmapData *data, + uint64_t first, uint64_t count) +{ + hbitmap_reset(data->hb, first, count); + while (count-- != 0) { + size_t pos = first >> LOG_BITS_PER_LONG; + int bit = first & (BITS_PER_LONG - 1); + first++; + + data->bits[pos] &= ~(1UL << bit); + } + + if (data->granularity == 0) { + hbitmap_test_check(data, 0); + } +} + +static void hbitmap_test_check_get(TestHBitmapData *data) +{ + uint64_t count = 0; + uint64_t i; + + for (i = 0; i < data->size; i++) { + size_t pos = i >> LOG_BITS_PER_LONG; + int bit = i & (BITS_PER_LONG - 1); + unsigned long val = data->bits[pos] & (1UL << bit); + count += hbitmap_get(data->hb, i); + g_assert_cmpint(hbitmap_get(data->hb, i), ==, val != 0); + } + g_assert_cmpint(count, ==, hbitmap_count(data->hb)); +} + +static void test_hbitmap_zero(TestHBitmapData *data, + const void *unused) +{ + hbitmap_test_init(data, 0, 0); +} + +static void test_hbitmap_unaligned(TestHBitmapData *data, + const void *unused) +{ + hbitmap_test_init(data, L3 + 23, 0); + hbitmap_test_set(data, 0, 1); + hbitmap_test_set(data, L3 + 22, 1); +} + +static void test_hbitmap_iter_empty(TestHBitmapData *data, + const void *unused) +{ + hbitmap_test_init(data, L1, 0); +} + +static void test_hbitmap_iter_partial(TestHBitmapData *data, + const void *unused) +{ + hbitmap_test_init(data, L3, 0); + hbitmap_test_set(data, 0, L3); + hbitmap_test_check(data, 1); + hbitmap_test_check(data, L1 - 1); + hbitmap_test_check(data, L1); + hbitmap_test_check(data, L1 * 2 - 1); + hbitmap_test_check(data, L2 - 1); + hbitmap_test_check(data, L2); + hbitmap_test_check(data, L2 + 1); + hbitmap_test_check(data, L2 + L1); + hbitmap_test_check(data, L2 + L1 * 2 - 1); + hbitmap_test_check(data, L2 * 2 - 1); + hbitmap_test_check(data, L2 * 2); + hbitmap_test_check(data, L2 * 2 + 1); + hbitmap_test_check(data, L2 * 2 + L1); + hbitmap_test_check(data, L2 * 2 + L1 * 2 - 1); + hbitmap_test_check(data, L3 / 2); +} + +static void test_hbitmap_iter_past(TestHBitmapData *data, + const void *unused) +{ + hbitmap_test_init(data, L3, 0); + hbitmap_test_set(data, 0, L3); + hbitmap_test_check(data, L3); +} + +static void test_hbitmap_set_all(TestHBitmapData *data, + const void *unused) +{ + hbitmap_test_init(data, L3, 0); + hbitmap_test_set(data, 0, L3); +} + +static void test_hbitmap_get_all(TestHBitmapData *data, + const void *unused) +{ + hbitmap_test_init(data, L3, 0); + hbitmap_test_set(data, 0, L3); + hbitmap_test_check_get(data); +} + +static void test_hbitmap_get_some(TestHBitmapData *data, + const void *unused) +{ + hbitmap_test_init(data, 2 * L2, 0); + hbitmap_test_set(data, 10, 1); + hbitmap_test_check_get(data); + hbitmap_test_set(data, L1 - 1, 1); + hbitmap_test_check_get(data); + hbitmap_test_set(data, L1, 1); + hbitmap_test_check_get(data); + hbitmap_test_set(data, L2 - 1, 1); + hbitmap_test_check_get(data); + hbitmap_test_set(data, L2, 1); + hbitmap_test_check_get(data); +} + +static void test_hbitmap_set_one(TestHBitmapData *data, + const void *unused) +{ + hbitmap_test_init(data, 2 * L2, 0); + hbitmap_test_set(data, 10, 1); + hbitmap_test_set(data, L1 - 1, 1); + hbitmap_test_set(data, L1, 1); + hbitmap_test_set(data, L2 - 1, 1); + hbitmap_test_set(data, L2, 1); +} + +static void test_hbitmap_set_two_elem(TestHBitmapData *data, + const void *unused) +{ + hbitmap_test_init(data, 2 * L2, 0); + hbitmap_test_set(data, L1 - 1, 2); + hbitmap_test_set(data, L1 * 2 - 1, 4); + hbitmap_test_set(data, L1 * 4, L1 + 1); + hbitmap_test_set(data, L1 * 8 - 1, L1 + 1); + hbitmap_test_set(data, L2 - 1, 2); + hbitmap_test_set(data, L2 + L1 - 1, 8); + hbitmap_test_set(data, L2 + L1 * 4, L1 + 1); + hbitmap_test_set(data, L2 + L1 * 8 - 1, L1 + 1); +} + +static void test_hbitmap_set(TestHBitmapData *data, + const void *unused) +{ + hbitmap_test_init(data, L3 * 2, 0); + hbitmap_test_set(data, L1 - 1, L1 + 2); + hbitmap_test_set(data, L1 * 3 - 1, L1 + 2); + hbitmap_test_set(data, L1 * 5, L1 * 2 + 1); + hbitmap_test_set(data, L1 * 8 - 1, L1 * 2 + 1); + hbitmap_test_set(data, L2 - 1, L1 + 2); + hbitmap_test_set(data, L2 + L1 * 2 - 1, L1 + 2); + hbitmap_test_set(data, L2 + L1 * 4, L1 * 2 + 1); + hbitmap_test_set(data, L2 + L1 * 7 - 1, L1 * 2 + 1); + hbitmap_test_set(data, L2 * 2 - 1, L3 * 2 - L2 * 2); +} + +static void test_hbitmap_set_twice(TestHBitmapData *data, + const void *unused) +{ + hbitmap_test_init(data, L1 * 3, 0); + hbitmap_test_set(data, 0, L1 * 3); + hbitmap_test_set(data, L1, 1); +} + +static void test_hbitmap_set_overlap(TestHBitmapData *data, + const void *unused) +{ + hbitmap_test_init(data, L3 * 2, 0); + hbitmap_test_set(data, L1 - 1, L1 + 2); + hbitmap_test_set(data, L1 * 2 - 1, L1 * 2 + 2); + hbitmap_test_set(data, 0, L1 * 3); + hbitmap_test_set(data, L1 * 8 - 1, L2); + hbitmap_test_set(data, L2, L1); + hbitmap_test_set(data, L2 - L1 - 1, L1 * 8 + 2); + hbitmap_test_set(data, L2, L3 - L2 + 1); + hbitmap_test_set(data, L3 - L1, L1 * 3); + hbitmap_test_set(data, L3 - 1, 3); + hbitmap_test_set(data, L3 - 1, L2); +} + +static void test_hbitmap_reset_empty(TestHBitmapData *data, + const void *unused) +{ + hbitmap_test_init(data, L3, 0); + hbitmap_test_reset(data, 0, L3); +} + +static void test_hbitmap_reset(TestHBitmapData *data, + const void *unused) +{ + hbitmap_test_init(data, L3 * 2, 0); + hbitmap_test_set(data, L1 - 1, L1 + 2); + hbitmap_test_reset(data, L1 * 2 - 1, L1 * 2 + 2); + hbitmap_test_set(data, 0, L1 * 3); + hbitmap_test_reset(data, L1 * 8 - 1, L2); + hbitmap_test_set(data, L2, L1); + hbitmap_test_reset(data, L2 - L1 - 1, L1 * 8 + 2); + hbitmap_test_set(data, L2, L3 - L2 + 1); + hbitmap_test_reset(data, L3 - L1, L1 * 3); + hbitmap_test_set(data, L3 - 1, 3); + hbitmap_test_reset(data, L3 - 1, L2); + hbitmap_test_set(data, 0, L3 * 2); + hbitmap_test_reset(data, 0, L1); + hbitmap_test_reset(data, 0, L2); + hbitmap_test_reset(data, L3, L3); + hbitmap_test_set(data, L3 / 2, L3); +} + +static void test_hbitmap_granularity(TestHBitmapData *data, + const void *unused) +{ + /* Note that hbitmap_test_check has to be invoked manually in this test. */ + hbitmap_test_init(data, L1, 1); + hbitmap_test_set(data, 0, 1); + g_assert_cmpint(hbitmap_count(data->hb), ==, 2); + hbitmap_test_check(data, 0); + hbitmap_test_set(data, 2, 1); + g_assert_cmpint(hbitmap_count(data->hb), ==, 4); + hbitmap_test_check(data, 0); + hbitmap_test_set(data, 0, 3); + g_assert_cmpint(hbitmap_count(data->hb), ==, 4); + hbitmap_test_reset(data, 0, 1); + g_assert_cmpint(hbitmap_count(data->hb), ==, 2); +} + +static void test_hbitmap_iter_granularity(TestHBitmapData *data, + const void *unused) +{ + HBitmapIter hbi; + + /* Note that hbitmap_test_check has to be invoked manually in this test. */ + hbitmap_test_init(data, 131072 << 7, 7); + hbitmap_iter_init(&hbi, data->hb, 0); + g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0); + + hbitmap_test_set(data, ((L2 + L1 + 1) << 7) + 8, 8); + hbitmap_iter_init(&hbi, data->hb, 0); + g_assert_cmpint(hbitmap_iter_next(&hbi), ==, (L2 + L1 + 1) << 7); + g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0); + + hbitmap_iter_init(&hbi, data->hb, (L2 + L1 + 2) << 7); + g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0); + + hbitmap_test_set(data, (131072 << 7) - 8, 8); + hbitmap_iter_init(&hbi, data->hb, 0); + g_assert_cmpint(hbitmap_iter_next(&hbi), ==, (L2 + L1 + 1) << 7); + g_assert_cmpint(hbitmap_iter_next(&hbi), ==, 131071 << 7); + g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0); + + hbitmap_iter_init(&hbi, data->hb, (L2 + L1 + 2) << 7); + g_assert_cmpint(hbitmap_iter_next(&hbi), ==, 131071 << 7); + g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0); +} + +static void hbitmap_test_add(const char *testpath, + void (*test_func)(TestHBitmapData *data, const void *user_data)) +{ + g_test_add(testpath, TestHBitmapData, NULL, NULL, test_func, + hbitmap_test_teardown); +} + +int main(int argc, char **argv) +{ + g_test_init(&argc, &argv, NULL); + hbitmap_test_add("/hbitmap/size/0", test_hbitmap_zero); + hbitmap_test_add("/hbitmap/size/unaligned", test_hbitmap_unaligned); + hbitmap_test_add("/hbitmap/iter/empty", test_hbitmap_iter_empty); + hbitmap_test_add("/hbitmap/iter/past", test_hbitmap_iter_past); + hbitmap_test_add("/hbitmap/iter/partial", test_hbitmap_iter_partial); + hbitmap_test_add("/hbitmap/iter/granularity", test_hbitmap_iter_granularity); + hbitmap_test_add("/hbitmap/get/all", test_hbitmap_get_all); + hbitmap_test_add("/hbitmap/get/some", test_hbitmap_get_some); + hbitmap_test_add("/hbitmap/set/all", test_hbitmap_set_all); + hbitmap_test_add("/hbitmap/set/one", test_hbitmap_set_one); + hbitmap_test_add("/hbitmap/set/two-elem", test_hbitmap_set_two_elem); + hbitmap_test_add("/hbitmap/set/general", test_hbitmap_set); + hbitmap_test_add("/hbitmap/set/twice", test_hbitmap_set_twice); + hbitmap_test_add("/hbitmap/set/overlap", test_hbitmap_set_overlap); + hbitmap_test_add("/hbitmap/reset/empty", test_hbitmap_reset_empty); + hbitmap_test_add("/hbitmap/reset/general", test_hbitmap_reset); + hbitmap_test_add("/hbitmap/granularity", test_hbitmap_granularity); + g_test_run(); + + return 0; +} diff --git a/trace-events b/trace-events index 09091e6d17..732cb12a00 100644 --- a/trace-events +++ b/trace-events @@ -1060,3 +1060,8 @@ xics_set_irq_lsi(int srcno, int nr) "set_irq_lsi: srcno %d [irq %#x]" xics_ics_write_xive(int nr, int srcno, int server, uint8_t priority) "ics_write_xive: irq %#x [src %d] server %#x prio %#x" xics_ics_reject(int nr, int srcno) "reject irq %#x [src %d]" xics_ics_eoi(int nr) "ics_eoi: irq %#x" + +# hbitmap.c +hbitmap_iter_skip_words(const void *hb, void *hbi, uint64_t pos, unsigned long cur) "hb %p hbi %p pos %"PRId64" cur 0x%lx" +hbitmap_reset(void *hb, uint64_t start, uint64_t count, uint64_t sbit, uint64_t ebit) "hb %p items %"PRIu64",%"PRIu64" bits %"PRIu64"..%"PRIu64 +hbitmap_set(void *hb, uint64_t start, uint64_t count, uint64_t sbit, uint64_t ebit) "hb %p items %"PRIu64",%"PRIu64" bits %"PRIu64"..%"PRIu64 diff --git a/util/Makefile.objs b/util/Makefile.objs index 5baeb53af6..495a178557 100644 --- a/util/Makefile.objs +++ b/util/Makefile.objs @@ -2,7 +2,7 @@ util-obj-y = osdep.o cutils.o qemu-timer-common.o util-obj-$(CONFIG_WIN32) += oslib-win32.o qemu-thread-win32.o event_notifier-win32.o util-obj-$(CONFIG_POSIX) += oslib-posix.o qemu-thread-posix.o event_notifier-posix.o util-obj-y += envlist.o path.o host-utils.o cache-utils.o module.o -util-obj-y += bitmap.o bitops.o +util-obj-y += bitmap.o bitops.o hbitmap.o util-obj-y += acl.o util-obj-y += error.o qemu-error.o util-obj-$(CONFIG_POSIX) += compatfd.o diff --git a/util/hbitmap.c b/util/hbitmap.c new file mode 100644 index 0000000000..fb7e01e8c5 --- /dev/null +++ b/util/hbitmap.c @@ -0,0 +1,400 @@ +/* + * Hierarchical Bitmap Data Type + * + * Copyright Red Hat, Inc., 2012 + * + * Author: Paolo Bonzini + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + */ + +#include +#include +#include +#include "qemu/osdep.h" +#include "qemu/hbitmap.h" +#include "qemu/host-utils.h" +#include "trace.h" + +/* HBitmaps provides an array of bits. The bits are stored as usual in an + * array of unsigned longs, but HBitmap is also optimized to provide fast + * iteration over set bits; going from one bit to the next is O(logB n) + * worst case, with B = sizeof(long) * CHAR_BIT: the result is low enough + * that the number of levels is in fact fixed. + * + * In order to do this, it stacks multiple bitmaps with progressively coarser + * granularity; in all levels except the last, bit N is set iff the N-th + * unsigned long is nonzero in the immediately next level. When iteration + * completes on the last level it can examine the 2nd-last level to quickly + * skip entire words, and even do so recursively to skip blocks of 64 words or + * powers thereof (32 on 32-bit machines). + * + * Given an index in the bitmap, it can be split in group of bits like + * this (for the 64-bit case): + * + * bits 0-57 => word in the last bitmap | bits 58-63 => bit in the word + * bits 0-51 => word in the 2nd-last bitmap | bits 52-57 => bit in the word + * bits 0-45 => word in the 3rd-last bitmap | bits 46-51 => bit in the word + * + * So it is easy to move up simply by shifting the index right by + * log2(BITS_PER_LONG) bits. To move down, you shift the index left + * similarly, and add the word index within the group. Iteration uses + * ffs (find first set bit) to find the next word to examine; this + * operation can be done in constant time in most current architectures. + * + * Setting or clearing a range of m bits on all levels, the work to perform + * is O(m + m/W + m/W^2 + ...), which is O(m) like on a regular bitmap. + * + * When iterating on a bitmap, each bit (on any level) is only visited + * once. Hence, The total cost of visiting a bitmap with m bits in it is + * the number of bits that are set in all bitmaps. Unless the bitmap is + * extremely sparse, this is also O(m + m/W + m/W^2 + ...), so the amortized + * cost of advancing from one bit to the next is usually constant (worst case + * O(logB n) as in the non-amortized complexity). + */ + +struct HBitmap { + /* Number of total bits in the bottom level. */ + uint64_t size; + + /* Number of set bits in the bottom level. */ + uint64_t count; + + /* A scaling factor. Given a granularity of G, each bit in the bitmap will + * will actually represent a group of 2^G elements. Each operation on a + * range of bits first rounds the bits to determine which group they land + * in, and then affect the entire page; iteration will only visit the first + * bit of each group. Here is an example of operations in a size-16, + * granularity-1 HBitmap: + * + * initial state 00000000 + * set(start=0, count=9) 11111000 (iter: 0, 2, 4, 6, 8) + * reset(start=1, count=3) 00111000 (iter: 4, 6, 8) + * set(start=9, count=2) 00111100 (iter: 4, 6, 8, 10) + * reset(start=5, count=5) 00000000 + * + * From an implementation point of view, when setting or resetting bits, + * the bitmap will scale bit numbers right by this amount of bits. When + * iterating, the bitmap will scale bit numbers left by this amount of + * bits. + */ + int granularity; + + /* A number of progressively less coarse bitmaps (i.e. level 0 is the + * coarsest). Each bit in level N represents a word in level N+1 that + * has a set bit, except the last level where each bit represents the + * actual bitmap. + * + * Note that all bitmaps have the same number of levels. Even a 1-bit + * bitmap will still allocate HBITMAP_LEVELS arrays. + */ + unsigned long *levels[HBITMAP_LEVELS]; +}; + +static inline int popcountl(unsigned long l) +{ + return BITS_PER_LONG == 32 ? ctpop32(l) : ctpop64(l); +} + +/* Advance hbi to the next nonzero word and return it. hbi->pos + * is updated. Returns zero if we reach the end of the bitmap. + */ +unsigned long hbitmap_iter_skip_words(HBitmapIter *hbi) +{ + size_t pos = hbi->pos; + const HBitmap *hb = hbi->hb; + unsigned i = HBITMAP_LEVELS - 1; + + unsigned long cur; + do { + cur = hbi->cur[--i]; + pos >>= BITS_PER_LEVEL; + } while (cur == 0); + + /* Check for end of iteration. We always use fewer than BITS_PER_LONG + * bits in the level 0 bitmap; thus we can repurpose the most significant + * bit as a sentinel. The sentinel is set in hbitmap_alloc and ensures + * that the above loop ends even without an explicit check on i. + */ + + if (i == 0 && cur == (1UL << (BITS_PER_LONG - 1))) { + return 0; + } + for (; i < HBITMAP_LEVELS - 1; i++) { + /* Shift back pos to the left, matching the right shifts above. + * The index of this word's least significant set bit provides + * the low-order bits. + */ + pos = (pos << BITS_PER_LEVEL) + ffsl(cur) - 1; + hbi->cur[i] = cur & (cur - 1); + + /* Set up next level for iteration. */ + cur = hb->levels[i + 1][pos]; + } + + hbi->pos = pos; + trace_hbitmap_iter_skip_words(hbi->hb, hbi, pos, cur); + + assert(cur); + return cur; +} + +void hbitmap_iter_init(HBitmapIter *hbi, const HBitmap *hb, uint64_t first) +{ + unsigned i, bit; + uint64_t pos; + + hbi->hb = hb; + pos = first >> hb->granularity; + hbi->pos = pos >> BITS_PER_LEVEL; + hbi->granularity = hb->granularity; + + for (i = HBITMAP_LEVELS; i-- > 0; ) { + bit = pos & (BITS_PER_LONG - 1); + pos >>= BITS_PER_LEVEL; + + /* Drop bits representing items before first. */ + hbi->cur[i] = hb->levels[i][pos] & ~((1UL << bit) - 1); + + /* We have already added level i+1, so the lowest set bit has + * been processed. Clear it. + */ + if (i != HBITMAP_LEVELS - 1) { + hbi->cur[i] &= ~(1UL << bit); + } + } +} + +bool hbitmap_empty(const HBitmap *hb) +{ + return hb->count == 0; +} + +int hbitmap_granularity(const HBitmap *hb) +{ + return hb->granularity; +} + +uint64_t hbitmap_count(const HBitmap *hb) +{ + return hb->count << hb->granularity; +} + +/* Count the number of set bits between start and end, not accounting for + * the granularity. Also an example of how to use hbitmap_iter_next_word. + */ +static uint64_t hb_count_between(HBitmap *hb, uint64_t start, uint64_t last) +{ + HBitmapIter hbi; + uint64_t count = 0; + uint64_t end = last + 1; + unsigned long cur; + size_t pos; + + hbitmap_iter_init(&hbi, hb, start << hb->granularity); + for (;;) { + pos = hbitmap_iter_next_word(&hbi, &cur); + if (pos >= (end >> BITS_PER_LEVEL)) { + break; + } + count += popcountl(cur); + } + + if (pos == (end >> BITS_PER_LEVEL)) { + /* Drop bits representing the END-th and subsequent items. */ + int bit = end & (BITS_PER_LONG - 1); + cur &= (1UL << bit) - 1; + count += popcountl(cur); + } + + return count; +} + +/* Setting starts at the last layer and propagates up if an element + * changes from zero to non-zero. + */ +static inline bool hb_set_elem(unsigned long *elem, uint64_t start, uint64_t last) +{ + unsigned long mask; + bool changed; + + assert((last >> BITS_PER_LEVEL) == (start >> BITS_PER_LEVEL)); + assert(start <= last); + + mask = 2UL << (last & (BITS_PER_LONG - 1)); + mask -= 1UL << (start & (BITS_PER_LONG - 1)); + changed = (*elem == 0); + *elem |= mask; + return changed; +} + +/* The recursive workhorse (the depth is limited to HBITMAP_LEVELS)... */ +static void hb_set_between(HBitmap *hb, int level, uint64_t start, uint64_t last) +{ + size_t pos = start >> BITS_PER_LEVEL; + size_t lastpos = last >> BITS_PER_LEVEL; + bool changed = false; + size_t i; + + i = pos; + if (i < lastpos) { + uint64_t next = (start | (BITS_PER_LONG - 1)) + 1; + changed |= hb_set_elem(&hb->levels[level][i], start, next - 1); + for (;;) { + start = next; + next += BITS_PER_LONG; + if (++i == lastpos) { + break; + } + changed |= (hb->levels[level][i] == 0); + hb->levels[level][i] = ~0UL; + } + } + changed |= hb_set_elem(&hb->levels[level][i], start, last); + + /* If there was any change in this layer, we may have to update + * the one above. + */ + if (level > 0 && changed) { + hb_set_between(hb, level - 1, pos, lastpos); + } +} + +void hbitmap_set(HBitmap *hb, uint64_t start, uint64_t count) +{ + /* Compute range in the last layer. */ + uint64_t last = start + count - 1; + + trace_hbitmap_set(hb, start, count, + start >> hb->granularity, last >> hb->granularity); + + start >>= hb->granularity; + last >>= hb->granularity; + count = last - start + 1; + + hb->count += count - hb_count_between(hb, start, last); + hb_set_between(hb, HBITMAP_LEVELS - 1, start, last); +} + +/* Resetting works the other way round: propagate up if the new + * value is zero. + */ +static inline bool hb_reset_elem(unsigned long *elem, uint64_t start, uint64_t last) +{ + unsigned long mask; + bool blanked; + + assert((last >> BITS_PER_LEVEL) == (start >> BITS_PER_LEVEL)); + assert(start <= last); + + mask = 2UL << (last & (BITS_PER_LONG - 1)); + mask -= 1UL << (start & (BITS_PER_LONG - 1)); + blanked = *elem != 0 && ((*elem & ~mask) == 0); + *elem &= ~mask; + return blanked; +} + +/* The recursive workhorse (the depth is limited to HBITMAP_LEVELS)... */ +static void hb_reset_between(HBitmap *hb, int level, uint64_t start, uint64_t last) +{ + size_t pos = start >> BITS_PER_LEVEL; + size_t lastpos = last >> BITS_PER_LEVEL; + bool changed = false; + size_t i; + + i = pos; + if (i < lastpos) { + uint64_t next = (start | (BITS_PER_LONG - 1)) + 1; + + /* Here we need a more complex test than when setting bits. Even if + * something was changed, we must not blank bits in the upper level + * unless the lower-level word became entirely zero. So, remove pos + * from the upper-level range if bits remain set. + */ + if (hb_reset_elem(&hb->levels[level][i], start, next - 1)) { + changed = true; + } else { + pos++; + } + + for (;;) { + start = next; + next += BITS_PER_LONG; + if (++i == lastpos) { + break; + } + changed |= (hb->levels[level][i] != 0); + hb->levels[level][i] = 0UL; + } + } + + /* Same as above, this time for lastpos. */ + if (hb_reset_elem(&hb->levels[level][i], start, last)) { + changed = true; + } else { + lastpos--; + } + + if (level > 0 && changed) { + hb_reset_between(hb, level - 1, pos, lastpos); + } +} + +void hbitmap_reset(HBitmap *hb, uint64_t start, uint64_t count) +{ + /* Compute range in the last layer. */ + uint64_t last = start + count - 1; + + trace_hbitmap_reset(hb, start, count, + start >> hb->granularity, last >> hb->granularity); + + start >>= hb->granularity; + last >>= hb->granularity; + + hb->count -= hb_count_between(hb, start, last); + hb_reset_between(hb, HBITMAP_LEVELS - 1, start, last); +} + +bool hbitmap_get(const HBitmap *hb, uint64_t item) +{ + /* Compute position and bit in the last layer. */ + uint64_t pos = item >> hb->granularity; + unsigned long bit = 1UL << (pos & (BITS_PER_LONG - 1)); + + return (hb->levels[HBITMAP_LEVELS - 1][pos >> BITS_PER_LEVEL] & bit) != 0; +} + +void hbitmap_free(HBitmap *hb) +{ + unsigned i; + for (i = HBITMAP_LEVELS; i-- > 0; ) { + g_free(hb->levels[i]); + } + g_free(hb); +} + +HBitmap *hbitmap_alloc(uint64_t size, int granularity) +{ + HBitmap *hb = g_malloc0(sizeof (struct HBitmap)); + unsigned i; + + assert(granularity >= 0 && granularity < 64); + size = (size + (1ULL << granularity) - 1) >> granularity; + assert(size <= ((uint64_t)1 << HBITMAP_LOG_MAX_SIZE)); + + hb->size = size; + hb->granularity = granularity; + for (i = HBITMAP_LEVELS; i-- > 0; ) { + size = MAX((size + BITS_PER_LONG - 1) >> BITS_PER_LEVEL, 1); + hb->levels[i] = g_malloc0(size * sizeof(unsigned long)); + } + + /* We necessarily have free bits in level 0 due to the definition + * of HBITMAP_LEVELS, so use one for a sentinel. This speeds up + * hbitmap_iter_skip_words. + */ + assert(size == 1); + hb->levels[0][0] |= 1UL << (BITS_PER_LONG - 1); + return hb; +} -- cgit v1.2.3 From 8f0720ecbc3677e13fc7531588fc3831cc972ee4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 21 Jan 2013 17:09:41 +0100 Subject: block: implement dirty bitmap using HBitmap This actually uses the dirty bitmap in the block layer, and converts mirroring to use an HBitmapIter. Reviewed-by: Laszlo Ersek (except block/mirror.c parts) Reviewed-by: Eric Blake Signed-off-by: Paolo Bonzini Signed-off-by: Kevin Wolf --- block.c | 96 ++++++++--------------------------------------- block/mirror.c | 12 +++++- include/block/block.h | 6 ++- include/block/block_int.h | 4 +- trace-events | 1 + 5 files changed, 33 insertions(+), 86 deletions(-) (limited to 'include') diff --git a/block.c b/block.c index 6fa7c90144..e0ff736403 100644 --- a/block.c +++ b/block.c @@ -1286,7 +1286,6 @@ static void bdrv_move_feature_fields(BlockDriverState *bs_dest, bs_dest->iostatus = bs_src->iostatus; /* dirty bitmap */ - bs_dest->dirty_count = bs_src->dirty_count; bs_dest->dirty_bitmap = bs_src->dirty_bitmap; /* job */ @@ -2035,36 +2034,6 @@ int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, return ret; } -#define BITS_PER_LONG (sizeof(unsigned long) * 8) - -static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int dirty) -{ - int64_t start, end; - unsigned long val, idx, bit; - - start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK; - end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK; - - for (; start <= end; start++) { - idx = start / BITS_PER_LONG; - bit = start % BITS_PER_LONG; - val = bs->dirty_bitmap[idx]; - if (dirty) { - if (!(val & (1UL << bit))) { - bs->dirty_count++; - val |= 1UL << bit; - } - } else { - if (val & (1UL << bit)) { - bs->dirty_count--; - val &= ~(1UL << bit); - } - } - bs->dirty_bitmap[idx] = val; - } -} - /* Return < 0 if error. Important errors are: -EIO generic I/O error (may happen for all errors) -ENOMEDIUM No media inserted. @@ -4173,7 +4142,7 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, } if (bs->dirty_bitmap) { - set_dirty_bitmap(bs, sector_num, nb_sectors, 0); + bdrv_reset_dirty(bs, sector_num, nb_sectors); } if (bs->drv->bdrv_co_discard) { @@ -4335,18 +4304,15 @@ void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable) { int64_t bitmap_size; - bs->dirty_count = 0; if (enable) { if (!bs->dirty_bitmap) { - bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) + - BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG - 1; - bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG; - - bs->dirty_bitmap = g_new0(unsigned long, bitmap_size); + bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS); + bs->dirty_bitmap = hbitmap_alloc(bitmap_size, + BDRV_LOG_SECTORS_PER_DIRTY_CHUNK); } } else { if (bs->dirty_bitmap) { - g_free(bs->dirty_bitmap); + hbitmap_free(bs->dirty_bitmap); bs->dirty_bitmap = NULL; } } @@ -4354,67 +4320,37 @@ void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable) int bdrv_get_dirty(BlockDriverState *bs, int64_t sector) { - int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK; - - if (bs->dirty_bitmap && - (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) { - return !!(bs->dirty_bitmap[chunk / BITS_PER_LONG] & - (1UL << (chunk % BITS_PER_LONG))); + if (bs->dirty_bitmap) { + return hbitmap_get(bs->dirty_bitmap, sector); } else { return 0; } } -int64_t bdrv_get_next_dirty(BlockDriverState *bs, int64_t sector) +void bdrv_dirty_iter_init(BlockDriverState *bs, HBitmapIter *hbi) { - int64_t chunk; - int bit, elem; - - /* Avoid an infinite loop. */ - assert(bs->dirty_count > 0); - - sector = (sector | (BDRV_SECTORS_PER_DIRTY_CHUNK - 1)) + 1; - chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK; - - QEMU_BUILD_BUG_ON(sizeof(bs->dirty_bitmap[0]) * 8 != BITS_PER_LONG); - elem = chunk / BITS_PER_LONG; - bit = chunk % BITS_PER_LONG; - for (;;) { - if (sector >= bs->total_sectors) { - sector = 0; - bit = elem = 0; - } - if (bit == 0 && bs->dirty_bitmap[elem] == 0) { - sector += BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG; - elem++; - } else { - if (bs->dirty_bitmap[elem] & (1UL << bit)) { - return sector; - } - sector += BDRV_SECTORS_PER_DIRTY_CHUNK; - if (++bit == BITS_PER_LONG) { - bit = 0; - elem++; - } - } - } + hbitmap_iter_init(hbi, bs->dirty_bitmap, 0); } void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors) { - set_dirty_bitmap(bs, cur_sector, nr_sectors, 1); + hbitmap_set(bs->dirty_bitmap, cur_sector, nr_sectors); } void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors) { - set_dirty_bitmap(bs, cur_sector, nr_sectors, 0); + hbitmap_reset(bs->dirty_bitmap, cur_sector, nr_sectors); } int64_t bdrv_get_dirty_count(BlockDriverState *bs) { - return bs->dirty_count; + if (bs->dirty_bitmap) { + return hbitmap_count(bs->dirty_bitmap) >> BDRV_LOG_SECTORS_PER_DIRTY_CHUNK; + } else { + return 0; + } } void bdrv_set_in_use(BlockDriverState *bs, int in_use) diff --git a/block/mirror.c b/block/mirror.c index 6180aa30e5..20cb1e777f 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -36,6 +36,7 @@ typedef struct MirrorBlockJob { bool synced; bool should_complete; int64_t sector_num; + HBitmapIter hbi; uint8_t *buf; } MirrorBlockJob; @@ -62,8 +63,15 @@ static int coroutine_fn mirror_iteration(MirrorBlockJob *s, int64_t end; struct iovec iov; + s->sector_num = hbitmap_iter_next(&s->hbi); + if (s->sector_num < 0) { + bdrv_dirty_iter_init(source, &s->hbi); + s->sector_num = hbitmap_iter_next(&s->hbi); + trace_mirror_restart_iter(s, bdrv_get_dirty_count(source)); + assert(s->sector_num >= 0); + } + end = s->common.len >> BDRV_SECTOR_BITS; - s->sector_num = bdrv_get_next_dirty(source, s->sector_num); nb_sectors = MIN(BDRV_SECTORS_PER_DIRTY_CHUNK, end - s->sector_num); bdrv_reset_dirty(source, s->sector_num, nb_sectors); @@ -136,7 +144,7 @@ static void coroutine_fn mirror_run(void *opaque) } } - s->sector_num = -1; + bdrv_dirty_iter_init(bs, &s->hbi); for (;;) { uint64_t delay_ns; int64_t cnt; diff --git a/include/block/block.h b/include/block/block.h index ffd193637d..678fc60dc1 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -351,13 +351,15 @@ void bdrv_set_buffer_alignment(BlockDriverState *bs, int align); void *qemu_blockalign(BlockDriverState *bs, size_t size); bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov); -#define BDRV_SECTORS_PER_DIRTY_CHUNK 2048 +#define BDRV_SECTORS_PER_DIRTY_CHUNK (1 << BDRV_LOG_SECTORS_PER_DIRTY_CHUNK) +#define BDRV_LOG_SECTORS_PER_DIRTY_CHUNK 11 +struct HBitmapIter; void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable); int bdrv_get_dirty(BlockDriverState *bs, int64_t sector); void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors); void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors); -int64_t bdrv_get_next_dirty(BlockDriverState *bs, int64_t sector); +void bdrv_dirty_iter_init(BlockDriverState *bs, struct HBitmapIter *hbi); int64_t bdrv_get_dirty_count(BlockDriverState *bs); void bdrv_enable_copy_on_read(BlockDriverState *bs); diff --git a/include/block/block_int.h b/include/block/block_int.h index f83ffb8a08..b81c061cd9 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -32,6 +32,7 @@ #include "qapi-types.h" #include "qapi/qmp/qerror.h" #include "monitor/monitor.h" +#include "qemu/hbitmap.h" #define BLOCK_FLAG_ENCRYPT 1 #define BLOCK_FLAG_COMPAT6 4 @@ -275,8 +276,7 @@ struct BlockDriverState { bool iostatus_enabled; BlockDeviceIoStatus iostatus; char device_name[32]; - unsigned long *dirty_bitmap; - int64_t dirty_count; + HBitmap *dirty_bitmap; int in_use; /* users other than guest access, eg. block migration */ QTAILQ_ENTRY(BlockDriverState) list; diff --git a/trace-events b/trace-events index 732cb12a00..61ed349765 100644 --- a/trace-events +++ b/trace-events @@ -79,6 +79,7 @@ commit_start(void *bs, void *base, void *top, void *s, void *co, void *opaque) " # block/mirror.c mirror_start(void *bs, void *s, void *co, void *opaque) "bs %p s %p co %p opaque %p" +mirror_restart_iter(void *s, int64_t cnt) "s %p dirty count %"PRId64 mirror_before_flush(void *s) "s %p" mirror_before_drain(void *s, int64_t cnt) "s %p dirty count %"PRId64 mirror_before_sleep(void *s, int64_t cnt, int synced) "s %p dirty count %"PRId64" synced %d" -- cgit v1.2.3 From 343bded4ecfc467012e2ab675da75749f1d90f70 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 21 Jan 2013 17:09:42 +0100 Subject: block: make round_to_clusters public This is needed in the following patch. Reviewed-by: Laszlo Ersek Reviewed-by: Eric Blake Signed-off-by: Paolo Bonzini Signed-off-by: Kevin Wolf --- block.c | 16 ++++++++-------- include/block/block.h | 4 ++++ 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/block.c b/block.c index e0ff736403..4a4ab162f7 100644 --- a/block.c +++ b/block.c @@ -1673,10 +1673,10 @@ static void tracked_request_begin(BdrvTrackedRequest *req, /** * Round a region to cluster boundaries */ -static void round_to_clusters(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - int64_t *cluster_sector_num, - int *cluster_nb_sectors) +void bdrv_round_to_clusters(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + int64_t *cluster_sector_num, + int *cluster_nb_sectors) { BlockDriverInfo bdi; @@ -1718,8 +1718,8 @@ static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs, * CoR read and write operations are atomic and guest writes cannot * interleave between them. */ - round_to_clusters(bs, sector_num, nb_sectors, - &cluster_sector_num, &cluster_nb_sectors); + bdrv_round_to_clusters(bs, sector_num, nb_sectors, + &cluster_sector_num, &cluster_nb_sectors); do { retry = false; @@ -2185,8 +2185,8 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, /* Cover entire cluster so no additional backing file I/O is required when * allocating cluster in the image file. */ - round_to_clusters(bs, sector_num, nb_sectors, - &cluster_sector_num, &cluster_nb_sectors); + bdrv_round_to_clusters(bs, sector_num, nb_sectors, + &cluster_sector_num, &cluster_nb_sectors); trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, cluster_sector_num, cluster_nb_sectors); diff --git a/include/block/block.h b/include/block/block.h index 678fc60dc1..9ee9068aec 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -309,6 +309,10 @@ int bdrv_get_flags(BlockDriverState *bs); int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, const uint8_t *buf, int nb_sectors); int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi); +void bdrv_round_to_clusters(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + int64_t *cluster_sector_num, + int *cluster_nb_sectors); const char *bdrv_get_encrypted_filename(BlockDriverState *bs); void bdrv_get_backing_filename(BlockDriverState *bs, -- cgit v1.2.3 From 50717e941b9f306a45292621999eeafbaa954418 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 21 Jan 2013 17:09:45 +0100 Subject: block: allow customizing the granularity of the dirty bitmap Reviewed-by: Eric Blake Signed-off-by: Paolo Bonzini Signed-off-by: Kevin Wolf --- block-migration.c | 5 +++-- block.c | 17 ++++++++++------- block/mirror.c | 14 ++++---------- include/block/block.h | 5 +---- qapi-schema.json | 4 +++- 5 files changed, 21 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/block-migration.c b/block-migration.c index 9d0b037029..9ac7de6d78 100644 --- a/block-migration.c +++ b/block-migration.c @@ -23,7 +23,8 @@ #include "sysemu/blockdev.h" #include -#define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS) +#define BLOCK_SIZE (1 << 20) +#define BDRV_SECTORS_PER_DIRTY_CHUNK (BLOCK_SIZE >> BDRV_SECTOR_BITS) #define BLK_MIG_FLAG_DEVICE_BLOCK 0x01 #define BLK_MIG_FLAG_EOS 0x02 @@ -254,7 +255,7 @@ static void set_dirty_tracking(int enable) BlkMigDevState *bmds; QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { - bdrv_set_dirty_tracking(bmds->bs, enable); + bdrv_set_dirty_tracking(bmds->bs, enable ? BLOCK_SIZE : 0); } } diff --git a/block.c b/block.c index a27454453e..ba67c0def2 100644 --- a/block.c +++ b/block.c @@ -2833,6 +2833,8 @@ BlockInfo *bdrv_query_info(BlockDriverState *bs) info->has_dirty = true; info->dirty = g_malloc0(sizeof(*info->dirty)); info->dirty->count = bdrv_get_dirty_count(bs) * BDRV_SECTOR_SIZE; + info->dirty->granularity = + ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bs->dirty_bitmap)); } if (bs->drv) { @@ -4299,16 +4301,17 @@ bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) return true; } -void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable) +void bdrv_set_dirty_tracking(BlockDriverState *bs, int granularity) { int64_t bitmap_size; - if (enable) { - if (!bs->dirty_bitmap) { - bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS); - bs->dirty_bitmap = hbitmap_alloc(bitmap_size, - BDRV_LOG_SECTORS_PER_DIRTY_CHUNK); - } + assert((granularity & (granularity - 1)) == 0); + + if (granularity) { + granularity >>= BDRV_SECTOR_BITS; + assert(!bs->dirty_bitmap); + bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS); + bs->dirty_bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1); } else { if (bs->dirty_bitmap) { hbitmap_free(bs->dirty_bitmap); diff --git a/block/mirror.c b/block/mirror.c index 7884b3bc78..e425927ee5 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -17,14 +17,8 @@ #include "qemu/ratelimit.h" #include "qemu/bitmap.h" -enum { - /* - * Size of data buffer for populating the image file. This should be large - * enough to process multiple clusters in a single call, so that populating - * contiguous regions of the image is efficient. - */ - BLOCK_SIZE = 512 * BDRV_SECTORS_PER_DIRTY_CHUNK, /* in bytes */ -}; +#define BLOCK_SIZE (1 << 20) +#define BDRV_SECTORS_PER_DIRTY_CHUNK (BLOCK_SIZE >> BDRV_SECTOR_BITS) #define SLICE_TIME 100000000ULL /* ns */ @@ -276,7 +270,7 @@ static void coroutine_fn mirror_run(void *opaque) immediate_exit: qemu_vfree(s->buf); g_free(s->cow_bitmap); - bdrv_set_dirty_tracking(bs, false); + bdrv_set_dirty_tracking(bs, 0); bdrv_iostatus_disable(s->target); if (s->should_complete && ret == 0) { if (bdrv_get_flags(s->target) != bdrv_get_flags(s->common.bs)) { @@ -364,7 +358,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, s->mode = mode; s->buf_size = BLOCK_SIZE; - bdrv_set_dirty_tracking(bs, true); + bdrv_set_dirty_tracking(bs, BLOCK_SIZE); bdrv_set_enable_write_cache(s->target, true); bdrv_set_on_error(s->target, on_target_error, on_target_error); bdrv_iostatus_enable(s->target); diff --git a/include/block/block.h b/include/block/block.h index 9ee9068aec..5c3b911c1b 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -355,11 +355,8 @@ void bdrv_set_buffer_alignment(BlockDriverState *bs, int align); void *qemu_blockalign(BlockDriverState *bs, size_t size); bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov); -#define BDRV_SECTORS_PER_DIRTY_CHUNK (1 << BDRV_LOG_SECTORS_PER_DIRTY_CHUNK) -#define BDRV_LOG_SECTORS_PER_DIRTY_CHUNK 11 - struct HBitmapIter; -void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable); +void bdrv_set_dirty_tracking(BlockDriverState *bs, int granularity); int bdrv_get_dirty(BlockDriverState *bs, int64_t sector); void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors); void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors); diff --git a/qapi-schema.json b/qapi-schema.json index 6d7252b9e8..ce4f901dc6 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -667,10 +667,12 @@ # # @count: number of dirty bytes according to the dirty bitmap # +# @granularity: granularity of the dirty bitmap in bytes (since 1.4) +# # Since: 1.3 ## { 'type': 'BlockDirtyInfo', - 'data': {'count': 'int'} } + 'data': {'count': 'int', 'granularity': 'int'} } ## # @BlockInfo: -- cgit v1.2.3 From eee13dfe302833944d1176677d12a6ea421a94ea Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 21 Jan 2013 17:09:46 +0100 Subject: mirror: allow customizing the granularity The desired granularity may be very different depending on the kind of operation (e.g. continuous replication vs. collapse-to-raw) and whether the VM is expected to perform lots of I/O while mirroring is in progress. Allow the user to customize it, while providing a sane default so that in general there will be no extra allocated space in the target compared to the source. Reviewed-by: Eric Blake Signed-off-by: Paolo Bonzini Signed-off-by: Kevin Wolf --- block/mirror.c | 52 ++++++++++++++++++++++++++++++----------------- blockdev.c | 15 +++++++++++++- hmp.c | 2 +- include/block/block_int.h | 3 ++- qapi-schema.json | 8 +++++++- qmp-commands.hx | 8 +++++++- 6 files changed, 64 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/block/mirror.c b/block/mirror.c index e425927ee5..0fecb40c10 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -17,9 +17,6 @@ #include "qemu/ratelimit.h" #include "qemu/bitmap.h" -#define BLOCK_SIZE (1 << 20) -#define BDRV_SECTORS_PER_DIRTY_CHUNK (BLOCK_SIZE >> BDRV_SECTOR_BITS) - #define SLICE_TIME 100000000ULL /* ns */ typedef struct MirrorBlockJob { @@ -31,6 +28,7 @@ typedef struct MirrorBlockJob { bool synced; bool should_complete; int64_t sector_num; + int64_t granularity; size_t buf_size; unsigned long *cow_bitmap; HBitmapIter hbi; @@ -56,7 +54,7 @@ static int coroutine_fn mirror_iteration(MirrorBlockJob *s, BlockDriverState *source = s->common.bs; BlockDriverState *target = s->target; QEMUIOVector qiov; - int ret, nb_sectors; + int ret, nb_sectors, sectors_per_chunk; int64_t end, sector_num, chunk_num; struct iovec iov; @@ -72,16 +70,16 @@ static int coroutine_fn mirror_iteration(MirrorBlockJob *s, * is very large, we need to do COW ourselves. The first time a cluster is * copied, copy it entirely. * - * Because both BDRV_SECTORS_PER_DIRTY_CHUNK and the cluster size are - * powers of two, the number of sectors to copy cannot exceed one cluster. + * Because both the granularity and the cluster size are powers of two, the + * number of sectors to copy cannot exceed one cluster. */ sector_num = s->sector_num; - nb_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK; - chunk_num = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK; + sectors_per_chunk = nb_sectors = s->granularity >> BDRV_SECTOR_BITS; + chunk_num = sector_num / sectors_per_chunk; if (s->cow_bitmap && !test_bit(chunk_num, s->cow_bitmap)) { trace_mirror_cow(s, sector_num); bdrv_round_to_clusters(s->target, - sector_num, BDRV_SECTORS_PER_DIRTY_CHUNK, + sector_num, sectors_per_chunk, §or_num, &nb_sectors); } @@ -107,8 +105,8 @@ static int coroutine_fn mirror_iteration(MirrorBlockJob *s, goto fail; } if (s->cow_bitmap) { - bitmap_set(s->cow_bitmap, sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK, - nb_sectors / BDRV_SECTORS_PER_DIRTY_CHUNK); + bitmap_set(s->cow_bitmap, sector_num / sectors_per_chunk, + nb_sectors / sectors_per_chunk); } return 0; @@ -122,7 +120,7 @@ static void coroutine_fn mirror_run(void *opaque) { MirrorBlockJob *s = opaque; BlockDriverState *bs = s->common.bs; - int64_t sector_num, end, length; + int64_t sector_num, end, sectors_per_chunk, length; BlockDriverInfo bdi; char backing_filename[1024]; int ret = 0; @@ -146,22 +144,23 @@ static void coroutine_fn mirror_run(void *opaque) sizeof(backing_filename)); if (backing_filename[0] && !s->target->backing_hd) { bdrv_get_info(s->target, &bdi); - if (s->buf_size < bdi.cluster_size) { + if (s->granularity < bdi.cluster_size) { s->buf_size = bdi.cluster_size; - length = (bdrv_getlength(bs) + BLOCK_SIZE - 1) / BLOCK_SIZE; + length = (bdrv_getlength(bs) + s->granularity - 1) / s->granularity; s->cow_bitmap = bitmap_new(length); } } end = s->common.len >> BDRV_SECTOR_BITS; s->buf = qemu_blockalign(bs, s->buf_size); + sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; if (s->mode != MIRROR_SYNC_MODE_NONE) { /* First part, loop on the sectors and initialize the dirty bitmap. */ BlockDriverState *base; base = s->mode == MIRROR_SYNC_MODE_FULL ? NULL : bs->backing_hd; for (sector_num = 0; sector_num < end; ) { - int64_t next = (sector_num | (BDRV_SECTORS_PER_DIRTY_CHUNK - 1)) + 1; + int64_t next = (sector_num | (sectors_per_chunk - 1)) + 1; ret = bdrv_co_is_allocated_above(bs, base, sector_num, next - sector_num, &n); @@ -242,7 +241,7 @@ static void coroutine_fn mirror_run(void *opaque) s->common.offset = (end - cnt) * BDRV_SECTOR_SIZE; if (s->common.speed) { - delay_ns = ratelimit_calculate_delay(&s->limit, BDRV_SECTORS_PER_DIRTY_CHUNK); + delay_ns = ratelimit_calculate_delay(&s->limit, sectors_per_chunk); } else { delay_ns = 0; } @@ -332,7 +331,7 @@ static BlockJobType mirror_job_type = { }; void mirror_start(BlockDriverState *bs, BlockDriverState *target, - int64_t speed, MirrorSyncMode mode, + int64_t speed, int64_t granularity, MirrorSyncMode mode, BlockdevOnError on_source_error, BlockdevOnError on_target_error, BlockDriverCompletionFunc *cb, @@ -340,6 +339,20 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, { MirrorBlockJob *s; + if (granularity == 0) { + /* Choose the default granularity based on the target file's cluster + * size, clamped between 4k and 64k. */ + BlockDriverInfo bdi; + if (bdrv_get_info(target, &bdi) >= 0 && bdi.cluster_size != 0) { + granularity = MAX(4096, bdi.cluster_size); + granularity = MIN(65536, granularity); + } else { + granularity = 65536; + } + } + + assert ((granularity & (granularity - 1)) == 0); + if ((on_source_error == BLOCKDEV_ON_ERROR_STOP || on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) && !bdrv_iostatus_is_enabled(bs)) { @@ -356,9 +369,10 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, s->on_target_error = on_target_error; s->target = target; s->mode = mode; - s->buf_size = BLOCK_SIZE; + s->granularity = granularity; + s->buf_size = granularity; - bdrv_set_dirty_tracking(bs, BLOCK_SIZE); + bdrv_set_dirty_tracking(bs, granularity); bdrv_set_enable_write_cache(s->target, true); bdrv_set_on_error(s->target, on_target_error, on_target_error); bdrv_iostatus_enable(s->target); diff --git a/blockdev.c b/blockdev.c index 1eb62b637c..07fd3273ed 100644 --- a/blockdev.c +++ b/blockdev.c @@ -1193,6 +1193,7 @@ void qmp_drive_mirror(const char *device, const char *target, enum MirrorSyncMode sync, bool has_mode, enum NewImageMode mode, bool has_speed, int64_t speed, + bool has_granularity, uint32_t granularity, bool has_on_source_error, BlockdevOnError on_source_error, bool has_on_target_error, BlockdevOnError on_target_error, Error **errp) @@ -1218,6 +1219,17 @@ void qmp_drive_mirror(const char *device, const char *target, if (!has_mode) { mode = NEW_IMAGE_MODE_ABSOLUTE_PATHS; } + if (!has_granularity) { + granularity = 0; + } + if (granularity != 0 && (granularity < 512 || granularity > 1048576 * 64)) { + error_set(errp, QERR_INVALID_PARAMETER, device); + return; + } + if (granularity & (granularity - 1)) { + error_set(errp, QERR_INVALID_PARAMETER, device); + return; + } bs = bdrv_find(device); if (!bs) { @@ -1299,7 +1311,8 @@ void qmp_drive_mirror(const char *device, const char *target, return; } - mirror_start(bs, target_bs, speed, sync, on_source_error, on_target_error, + mirror_start(bs, target_bs, speed, granularity, sync, + on_source_error, on_target_error, block_job_cb, bs, &local_err); if (local_err != NULL) { bdrv_delete(target_bs); diff --git a/hmp.c b/hmp.c index c7b6ba02fc..0f3347dd76 100644 --- a/hmp.c +++ b/hmp.c @@ -796,7 +796,7 @@ void hmp_drive_mirror(Monitor *mon, const QDict *qdict) qmp_drive_mirror(device, filename, !!format, format, full ? MIRROR_SYNC_MODE_FULL : MIRROR_SYNC_MODE_TOP, - true, mode, false, 0, + true, mode, false, 0, false, 0, false, 0, false, 0, &errp); hmp_handle_error(mon, &errp); } diff --git a/include/block/block_int.h b/include/block/block_int.h index b81c061cd9..1165339fd1 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -344,6 +344,7 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base, * @bs: Block device to operate on. * @target: Block device to write to. * @speed: The maximum speed, in bytes per second, or 0 for unlimited. + * @granularity: The chosen granularity for the dirty bitmap. * @mode: Whether to collapse all images in the chain to the target. * @on_source_error: The action to take upon error reading from the source. * @on_target_error: The action to take upon error writing to the target. @@ -357,7 +358,7 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base, * @bs will be switched to read from @target. */ void mirror_start(BlockDriverState *bs, BlockDriverState *target, - int64_t speed, MirrorSyncMode mode, + int64_t speed, int64_t granularity, MirrorSyncMode mode, BlockdevOnError on_source_error, BlockdevOnError on_target_error, BlockDriverCompletionFunc *cb, diff --git a/qapi-schema.json b/qapi-schema.json index ce4f901dc6..fd5ec93c03 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -1636,6 +1636,11 @@ # (all the disk, only the sectors allocated in the topmost image, or # only new I/O). # +# @granularity: #optional granularity of the dirty bitmap, default is 64K +# if the image format doesn't have clusters, 4K if the clusters +# are smaller than that, else the cluster size. Must be a +# power of 2 between 512 and 64M (since 1.4). +# # @on-source-error: #optional the action to take on an error on the source, # default 'report'. 'stop' and 'enospc' can only be used # if the block device supports io-status (see BlockInfo). @@ -1652,7 +1657,8 @@ { 'command': 'drive-mirror', 'data': { 'device': 'str', 'target': 'str', '*format': 'str', 'sync': 'MirrorSyncMode', '*mode': 'NewImageMode', - '*speed': 'int', '*on-source-error': 'BlockdevOnError', + '*speed': 'int', '*granularity': 'uint32', + '*on-source-error': 'BlockdevOnError', '*on-target-error': 'BlockdevOnError' } } ## diff --git a/qmp-commands.hx b/qmp-commands.hx index cbf12804be..835ea26e9d 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -938,7 +938,8 @@ EQMP { .name = "drive-mirror", .args_type = "sync:s,device:B,target:s,speed:i?,mode:s?,format:s?," - "on-source-error:s?,on-target-error:s?", + "on-source-error:s?,on-target-error:s?," + "granularity:i?", .mhandler.cmd_new = qmp_marshal_input_drive_mirror, }, @@ -962,6 +963,7 @@ Arguments: file/device (NewImageMode, optional, default 'absolute-paths') - "speed": maximum speed of the streaming job, in bytes per second (json-int) +- "granularity": granularity of the dirty bitmap, in bytes (json-int, optional) - "sync": what parts of the disk image should be copied to the destination; possibilities include "full" for all the disk, "top" for only the sectors allocated in the topmost image, or "none" to only replicate new I/O @@ -971,6 +973,10 @@ Arguments: - "on-target-error": the action to take on an error on the target (BlockdevOnError, default 'report') +The default value of the granularity is the image cluster size clamped +between 4096 and 65536, if the image format defines one. If the format +does not define a cluster size, the default value of the granularity +is 65536. Example: -- cgit v1.2.3 From 08e4ed6cdeeee7912072cf14aa8ab6c60dacb4fb Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 22 Jan 2013 09:03:13 +0100 Subject: mirror: add buf-size argument to drive-mirror This makes sense when the next commit starts using the extra buffer space to perform many I/O operations asynchronously. Signed-off-by: Paolo Bonzini Signed-off-by: Kevin Wolf --- block/mirror.c | 8 ++++---- blockdev.c | 9 ++++++++- hmp.c | 2 +- include/block/block_int.h | 5 +++-- qapi-schema.json | 5 ++++- qmp-commands.hx | 4 +++- tests/qemu-iotests/041 | 31 +++++++++++++++++++++++++++++++ tests/qemu-iotests/041.out | 4 ++-- 8 files changed, 56 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/block/mirror.c b/block/mirror.c index fc6b9b7118..896972c297 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -207,7 +207,7 @@ static void coroutine_fn mirror_run(void *opaque) if (backing_filename[0] && !s->target->backing_hd) { bdrv_get_info(s->target, &bdi); if (s->granularity < bdi.cluster_size) { - s->buf_size = bdi.cluster_size; + s->buf_size = MAX(s->buf_size, bdi.cluster_size); length = (bdrv_getlength(bs) + s->granularity - 1) / s->granularity; s->cow_bitmap = bitmap_new(length); } @@ -416,8 +416,8 @@ static BlockJobType mirror_job_type = { }; void mirror_start(BlockDriverState *bs, BlockDriverState *target, - int64_t speed, int64_t granularity, MirrorSyncMode mode, - BlockdevOnError on_source_error, + int64_t speed, int64_t granularity, int64_t buf_size, + MirrorSyncMode mode, BlockdevOnError on_source_error, BlockdevOnError on_target_error, BlockDriverCompletionFunc *cb, void *opaque, Error **errp) @@ -455,7 +455,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, s->target = target; s->mode = mode; s->granularity = granularity; - s->buf_size = granularity; + s->buf_size = MAX(buf_size, granularity); bdrv_set_dirty_tracking(bs, granularity); bdrv_set_enable_write_cache(s->target, true); diff --git a/blockdev.c b/blockdev.c index 07fd3273ed..ad25b9b86e 100644 --- a/blockdev.c +++ b/blockdev.c @@ -1188,12 +1188,15 @@ void qmp_block_commit(const char *device, drive_get_ref(drive_get_by_blockdev(bs)); } +#define DEFAULT_MIRROR_BUF_SIZE (10 << 20) + void qmp_drive_mirror(const char *device, const char *target, bool has_format, const char *format, enum MirrorSyncMode sync, bool has_mode, enum NewImageMode mode, bool has_speed, int64_t speed, bool has_granularity, uint32_t granularity, + bool has_buf_size, int64_t buf_size, bool has_on_source_error, BlockdevOnError on_source_error, bool has_on_target_error, BlockdevOnError on_target_error, Error **errp) @@ -1222,6 +1225,10 @@ void qmp_drive_mirror(const char *device, const char *target, if (!has_granularity) { granularity = 0; } + if (!has_buf_size) { + buf_size = DEFAULT_MIRROR_BUF_SIZE; + } + if (granularity != 0 && (granularity < 512 || granularity > 1048576 * 64)) { error_set(errp, QERR_INVALID_PARAMETER, device); return; @@ -1311,7 +1318,7 @@ void qmp_drive_mirror(const char *device, const char *target, return; } - mirror_start(bs, target_bs, speed, granularity, sync, + mirror_start(bs, target_bs, speed, granularity, buf_size, sync, on_source_error, on_target_error, block_job_cb, bs, &local_err); if (local_err != NULL) { diff --git a/hmp.c b/hmp.c index 0f3347dd76..99fd89206b 100644 --- a/hmp.c +++ b/hmp.c @@ -796,7 +796,7 @@ void hmp_drive_mirror(Monitor *mon, const QDict *qdict) qmp_drive_mirror(device, filename, !!format, format, full ? MIRROR_SYNC_MODE_FULL : MIRROR_SYNC_MODE_TOP, - true, mode, false, 0, false, 0, + true, mode, false, 0, false, 0, false, 0, false, 0, false, 0, &errp); hmp_handle_error(mon, &errp); } diff --git a/include/block/block_int.h b/include/block/block_int.h index 1165339fd1..f7279b978a 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -345,6 +345,7 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base, * @target: Block device to write to. * @speed: The maximum speed, in bytes per second, or 0 for unlimited. * @granularity: The chosen granularity for the dirty bitmap. + * @buf_size: The amount of data that can be in flight at one time. * @mode: Whether to collapse all images in the chain to the target. * @on_source_error: The action to take upon error reading from the source. * @on_target_error: The action to take upon error writing to the target. @@ -358,8 +359,8 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base, * @bs will be switched to read from @target. */ void mirror_start(BlockDriverState *bs, BlockDriverState *target, - int64_t speed, int64_t granularity, MirrorSyncMode mode, - BlockdevOnError on_source_error, + int64_t speed, int64_t granularity, int64_t buf_size, + MirrorSyncMode mode, BlockdevOnError on_source_error, BlockdevOnError on_target_error, BlockDriverCompletionFunc *cb, void *opaque, Error **errp); diff --git a/qapi-schema.json b/qapi-schema.json index fd5ec93c03..ba75c4de12 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -1641,6 +1641,9 @@ # are smaller than that, else the cluster size. Must be a # power of 2 between 512 and 64M (since 1.4). # +# @buf-size: #optional maximum amount of data in flight from source to +# target (since 1.4). +# # @on-source-error: #optional the action to take on an error on the source, # default 'report'. 'stop' and 'enospc' can only be used # if the block device supports io-status (see BlockInfo). @@ -1658,7 +1661,7 @@ 'data': { 'device': 'str', 'target': 'str', '*format': 'str', 'sync': 'MirrorSyncMode', '*mode': 'NewImageMode', '*speed': 'int', '*granularity': 'uint32', - '*on-source-error': 'BlockdevOnError', + '*buf-size': 'int', '*on-source-error': 'BlockdevOnError', '*on-target-error': 'BlockdevOnError' } } ## diff --git a/qmp-commands.hx b/qmp-commands.hx index 835ea26e9d..273b4a67ba 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -939,7 +939,7 @@ EQMP .name = "drive-mirror", .args_type = "sync:s,device:B,target:s,speed:i?,mode:s?,format:s?," "on-source-error:s?,on-target-error:s?," - "granularity:i?", + "granularity:i?,buf-size:i?", .mhandler.cmd_new = qmp_marshal_input_drive_mirror, }, @@ -964,6 +964,8 @@ Arguments: - "speed": maximum speed of the streaming job, in bytes per second (json-int) - "granularity": granularity of the dirty bitmap, in bytes (json-int, optional) +- "buf_size": maximum amount of data in flight from source to target, in bytes + (json-int, default 10M) - "sync": what parts of the disk image should be copied to the destination; possibilities include "full" for all the disk, "top" for only the sectors allocated in the topmost image, or "none" to only replicate new I/O diff --git a/tests/qemu-iotests/041 b/tests/qemu-iotests/041 index a1299b348e..b040820c51 100755 --- a/tests/qemu-iotests/041 +++ b/tests/qemu-iotests/041 @@ -207,6 +207,37 @@ class TestSingleDrive(ImageMirroringTestCase): self.assertTrue(self.compare_images(test_img, target_img), 'target image does not match source after mirroring') + def test_small_buffer(self): + self.assert_no_active_mirrors() + + # A small buffer is rounded up automatically + result = self.vm.qmp('drive-mirror', device='drive0', sync='full', + buf_size=4096, target=target_img) + self.assert_qmp(result, 'return', {}) + + self.complete_and_wait() + result = self.vm.qmp('query-block') + self.assert_qmp(result, 'return[0]/inserted/file', target_img) + self.vm.shutdown() + self.assertTrue(self.compare_images(test_img, target_img), + 'target image does not match source after mirroring') + + def test_small_buffer2(self): + self.assert_no_active_mirrors() + + qemu_img('create', '-f', iotests.imgfmt, '-o', 'cluster_size=%d,size=%d' + % (TestSingleDrive.image_len, TestSingleDrive.image_len), target_img) + result = self.vm.qmp('drive-mirror', device='drive0', sync='full', + buf_size=65536, mode='existing', target=target_img) + self.assert_qmp(result, 'return', {}) + + self.complete_and_wait() + result = self.vm.qmp('query-block') + self.assert_qmp(result, 'return[0]/inserted/file', target_img) + self.vm.shutdown() + self.assertTrue(self.compare_images(test_img, target_img), + 'target image does not match source after mirroring') + def test_large_cluster(self): self.assert_no_active_mirrors() diff --git a/tests/qemu-iotests/041.out b/tests/qemu-iotests/041.out index 3a89159833..84bfd63fba 100644 --- a/tests/qemu-iotests/041.out +++ b/tests/qemu-iotests/041.out @@ -1,5 +1,5 @@ -.................... +...................... ---------------------------------------------------------------------- -Ran 20 tests +Ran 22 tests OK -- cgit v1.2.3 From 02582abd48aa3d860015e9a8fcd0d7ec1c34ec62 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Thu, 17 Jan 2013 21:45:24 +0100 Subject: block: Add special error code for wrong format The block drivers need a special error code for "wrong format". From the available error codes EMEDIUMTYPE fits best. It is not available on all platforms, so a definition in qemu-common.h and a specific error report are needed. Signed-off-by: Stefan Weil Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- blockdev.c | 9 +++++++-- include/qemu-common.h | 3 +++ 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/blockdev.c b/blockdev.c index ad25b9b86e..ac396f3c94 100644 --- a/blockdev.c +++ b/blockdev.c @@ -617,8 +617,13 @@ DriveInfo *drive_init(QemuOpts *opts, BlockInterfaceType block_default_type) ret = bdrv_open(dinfo->bdrv, file, bdrv_flags, drv); if (ret < 0) { - error_report("could not open disk image %s: %s", - file, strerror(-ret)); + if (ret == -EMEDIUMTYPE) { + error_report("could not open disk image %s: not in %s format", + file, drv->format_name); + } else { + error_report("could not open disk image %s: %s", + file, strerror(-ret)); + } goto err; } diff --git a/include/qemu-common.h b/include/qemu-common.h index ca464bb367..af2379ff38 100644 --- a/include/qemu-common.h +++ b/include/qemu-common.h @@ -68,6 +68,9 @@ #if !defined(ECANCELED) #define ECANCELED 4097 #endif +#if !defined(EMEDIUMTYPE) +#define EMEDIUMTYPE 4098 +#endif #ifndef TIME_MAX #define TIME_MAX LONG_MAX #endif -- cgit v1.2.3 From 1b0952445522af73b0e78420a9078b3653923703 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 22 Jan 2013 15:01:12 +0100 Subject: hbitmap: add assertion on hbitmap_iter_init hbitmap_iter_init causes an out-of-bounds access when the "first" argument is or greater than or equal to the size of the bitmap. Forbid this with an assertion, and remove the failing testcase. Reported-by: Kevin Wolf Signed-off-by: Paolo Bonzini Reviewed-by: Eric Blake Reviewed-by: Laszlo Ersek Signed-off-by: Kevin Wolf --- include/qemu/hbitmap.h | 3 ++- tests/test-hbitmap.c | 13 +++---------- util/hbitmap.c | 1 + 3 files changed, 6 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/qemu/hbitmap.h b/include/qemu/hbitmap.h index 7ddfb66808..73f5d1d8d3 100644 --- a/include/qemu/hbitmap.h +++ b/include/qemu/hbitmap.h @@ -128,7 +128,8 @@ void hbitmap_free(HBitmap *hb); * hbitmap_iter_init: * @hbi: HBitmapIter to initialize. * @hb: HBitmap to iterate on. - * @first: First bit to visit (0-based). + * @first: First bit to visit (0-based, must be strictly less than the + * size of the bitmap). * * Set up @hbi to iterate on the HBitmap @hb. hbitmap_iter_next will return * the lowest-numbered bit that is set in @hb, starting at @first. diff --git a/tests/test-hbitmap.c b/tests/test-hbitmap.c index fcc6a00fc3..8c902f2055 100644 --- a/tests/test-hbitmap.c +++ b/tests/test-hbitmap.c @@ -86,7 +86,9 @@ static void hbitmap_test_init(TestHBitmapData *data, data->bits = g_new0(unsigned long, n); data->size = size; data->granularity = granularity; - hbitmap_test_check(data, 0); + if (size) { + hbitmap_test_check(data, 0); + } } static void hbitmap_test_teardown(TestHBitmapData *data, @@ -198,14 +200,6 @@ static void test_hbitmap_iter_partial(TestHBitmapData *data, hbitmap_test_check(data, L3 / 2); } -static void test_hbitmap_iter_past(TestHBitmapData *data, - const void *unused) -{ - hbitmap_test_init(data, L3, 0); - hbitmap_test_set(data, 0, L3); - hbitmap_test_check(data, L3); -} - static void test_hbitmap_set_all(TestHBitmapData *data, const void *unused) { @@ -388,7 +382,6 @@ int main(int argc, char **argv) hbitmap_test_add("/hbitmap/size/0", test_hbitmap_zero); hbitmap_test_add("/hbitmap/size/unaligned", test_hbitmap_unaligned); hbitmap_test_add("/hbitmap/iter/empty", test_hbitmap_iter_empty); - hbitmap_test_add("/hbitmap/iter/past", test_hbitmap_iter_past); hbitmap_test_add("/hbitmap/iter/partial", test_hbitmap_iter_partial); hbitmap_test_add("/hbitmap/iter/granularity", test_hbitmap_iter_granularity); hbitmap_test_add("/hbitmap/get/all", test_hbitmap_get_all); diff --git a/util/hbitmap.c b/util/hbitmap.c index fb7e01e8c5..2aa487db74 100644 --- a/util/hbitmap.c +++ b/util/hbitmap.c @@ -147,6 +147,7 @@ void hbitmap_iter_init(HBitmapIter *hbi, const HBitmap *hb, uint64_t first) hbi->hb = hb; pos = first >> hb->granularity; + assert(pos < hb->size); hbi->pos = pos >> BITS_PER_LEVEL; hbi->granularity = hb->granularity; -- cgit v1.2.3