From 05caf8dbc1880415df3378cfd114d832c9618b60 Mon Sep 17 00:00:00 2001 From: "Zhang, Yanmin" Date: Thu, 22 May 2008 15:13:29 +0200 Subject: block: Move the second call to get_request to the end of the loop In function get_request_wait, the second call to get_request could be moved to the end of the while loop, because if the first call to get_request fails, the second call will fail without sleep. Signed-off-by: Zhang Yanmin Signed-off-by: Jens Axboe --- block/blk-core.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 6a9cc0d22a61..1905aaba49fb 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -806,35 +806,32 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags, rq = get_request(q, rw_flags, bio, GFP_NOIO); while (!rq) { DEFINE_WAIT(wait); + struct io_context *ioc; struct request_list *rl = &q->rq; prepare_to_wait_exclusive(&rl->wait[rw], &wait, TASK_UNINTERRUPTIBLE); - rq = get_request(q, rw_flags, bio, GFP_NOIO); - - if (!rq) { - struct io_context *ioc; + blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ); - blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ); - - __generic_unplug_device(q); - spin_unlock_irq(q->queue_lock); - io_schedule(); + __generic_unplug_device(q); + spin_unlock_irq(q->queue_lock); + io_schedule(); - /* - * After sleeping, we become a "batching" process and - * will be able to allocate at least one request, and - * up to a big batch of them for a small period time. - * See ioc_batching, ioc_set_batching - */ - ioc = current_io_context(GFP_NOIO, q->node); - ioc_set_batching(q, ioc); + /* + * After sleeping, we become a "batching" process and + * will be able to allocate at least one request, and + * up to a big batch of them for a small period time. + * See ioc_batching, ioc_set_batching + */ + ioc = current_io_context(GFP_NOIO, q->node); + ioc_set_batching(q, ioc); - spin_lock_irq(q->queue_lock); - } + spin_lock_irq(q->queue_lock); finish_wait(&rl->wait[rw], &wait); - } + + rq = get_request(q, rw_flags, bio, GFP_NOIO); + }; return rq; } -- cgit v1.2.3 From be754d2c2161c0cce11d62727016985ecb76831b Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Fri, 23 May 2008 06:52:00 +0200 Subject: block: reorder cfq_queue to save space on 64bit builds saves 8 bytes of padding & increases objects/slab from 30 to 32 on my AMD64 config Signed-off-by: Richard Kennedy Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index b399c62936e0..4df3f0522435 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -124,6 +124,8 @@ struct cfq_data { struct cfq_queue { /* reference count */ atomic_t ref; + /* various state flags, see below */ + unsigned int flags; /* parent cfq_data */ struct cfq_data *cfqd; /* service_tree member */ @@ -138,14 +140,14 @@ struct cfq_queue { int queued[2]; /* currently allocated requests */ int allocated[2]; - /* pending metadata requests */ - int meta_pending; /* fifo list of requests in sort_list */ struct list_head fifo; unsigned long slice_end; long slice_resid; + /* pending metadata requests */ + int meta_pending; /* number of requests that are on the dispatch list or inside driver */ int dispatched; @@ -153,8 +155,6 @@ struct cfq_queue { unsigned short ioprio, org_ioprio; unsigned short ioprio_class, org_ioprio_class; - /* various state flags, see below */ - unsigned int flags; }; enum cfqq_state_flags { -- cgit v1.2.3 From 9d5f09a424a67ddb959829894efb4c71cbf6d600 Mon Sep 17 00:00:00 2001 From: "Alan D. Brunelle" Date: Tue, 27 May 2008 14:54:41 +0200 Subject: Added in MESSAGE notes for blktraces Allows messages to be inserted into blktrace streams. Signed-off-by: Alan D. Brunelle Signed-off-by: Jens Axboe --- block/blktrace.c | 14 ++++++++++++++ include/linux/blktrace_api.h | 25 +++++++++++++++++++++++++ 2 files changed, 39 insertions(+) (limited to 'block') diff --git a/block/blktrace.c b/block/blktrace.c index b2cbb4e5d767..20e11f354f11 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -75,6 +75,20 @@ static void trace_note_time(struct blk_trace *bt) local_irq_restore(flags); } +void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) +{ + int n; + va_list args; + static char bt_msg_buf[BLK_TN_MAX_MSG]; + + va_start(args, fmt); + n = vscnprintf(bt_msg_buf, BLK_TN_MAX_MSG, fmt, args); + va_end(args); + + trace_note(bt, 0, BLK_TN_MESSAGE, bt_msg_buf, n); +} +EXPORT_SYMBOL_GPL(__trace_note_message); + static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, pid_t pid) { diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index cfc3147e5cf9..b7cd8f1eedbe 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -55,6 +55,7 @@ enum blktrace_act { enum blktrace_notify { __BLK_TN_PROCESS = 0, /* establish pid/name mapping */ __BLK_TN_TIMESTAMP, /* include system clock */ + __BLK_TN_MESSAGE, /* Character string message */ }; @@ -79,6 +80,7 @@ enum blktrace_notify { #define BLK_TN_PROCESS (__BLK_TN_PROCESS | BLK_TC_ACT(BLK_TC_NOTIFY)) #define BLK_TN_TIMESTAMP (__BLK_TN_TIMESTAMP | BLK_TC_ACT(BLK_TC_NOTIFY)) +#define BLK_TN_MESSAGE (__BLK_TN_MESSAGE | BLK_TC_ACT(BLK_TC_NOTIFY)) #define BLK_IO_TRACE_MAGIC 0x65617400 #define BLK_IO_TRACE_VERSION 0x07 @@ -149,7 +151,28 @@ extern void blk_trace_shutdown(struct request_queue *); extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *); extern int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct blk_user_trace_setup *buts); +extern void __trace_note_message(struct blk_trace *, const char *fmt, ...); +/** + * blk_add_trace_msg - Add a (simple) message to the blktrace stream + * @q: queue the io is for + * @fmt: format to print message in + * args... Variable argument list for format + * + * Description: + * Records a (simple) message onto the blktrace stream. + * + * NOTE: BLK_TN_MAX_MSG characters are output at most. + * NOTE: Can not use 'static inline' due to presence of var args... + * + **/ +#define blk_add_trace_msg(q, fmt, ...) \ + do { \ + struct blk_trace *bt = (q)->blk_trace; \ + if (unlikely(bt)) \ + __trace_note_message(bt, fmt, ##__VA_ARGS__); \ + } while (0) +#define BLK_TN_MAX_MSG 1024 /** * blk_add_trace_rq - Add a trace for a request oriented action @@ -299,6 +322,8 @@ extern int blk_trace_remove(struct request_queue *q); #define blk_trace_setup(q, name, dev, arg) (-ENOTTY) #define blk_trace_startstop(q, start) (-ENOTTY) #define blk_trace_remove(q) (-ENOTTY) +#define blk_add_trace_msg(q, fmt, ...) do { } while (0) + #endif /* CONFIG_BLK_DEV_IO_TRACE */ #endif /* __KERNEL__ */ #endif -- cgit v1.2.3 From 4722dc52a891ab6cb2d637ddb87233e0ce277827 Mon Sep 17 00:00:00 2001 From: "Alan D. Brunelle" Date: Tue, 27 May 2008 14:55:00 +0200 Subject: Added in elevator switch message to blktrace stream Signed-off-by: Alan D. Brunelle Signed-off-by: Jens Axboe --- block/elevator.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 980f8ae147b4..902dd1344d56 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -1110,6 +1110,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); spin_unlock_irq(q->queue_lock); + blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name); + return 1; fail_register: -- cgit v1.2.3 From 64565911cdb57c2f512a9715b985b5617402cc67 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 28 May 2008 14:45:33 +0200 Subject: block: make blktrace use per-cpu buffers for message notes Currently it uses a single static char array, but that risks being corrupted when multiple users issue message notes at the same time. Make the buffers dynamically allocated when the trace is setup and make them per-cpu instead. The default max message size of 1k is also very large, the interface is mainly for small text notes. So shrink it to 128 bytes. Signed-off-by: Jens Axboe --- block/blktrace.c | 15 ++++++++++++--- include/linux/blktrace_api.h | 3 ++- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blktrace.c b/block/blktrace.c index 20e11f354f11..7ae87cc4a163 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -79,13 +79,16 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) { int n; va_list args; - static char bt_msg_buf[BLK_TN_MAX_MSG]; + char *buf; + preempt_disable(); + buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); va_start(args, fmt); - n = vscnprintf(bt_msg_buf, BLK_TN_MAX_MSG, fmt, args); + n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); va_end(args); - trace_note(bt, 0, BLK_TN_MESSAGE, bt_msg_buf, n); + trace_note(bt, 0, BLK_TN_MESSAGE, buf, n); + preempt_enable(); } EXPORT_SYMBOL_GPL(__trace_note_message); @@ -246,6 +249,7 @@ static void blk_trace_cleanup(struct blk_trace *bt) debugfs_remove(bt->dropped_file); blk_remove_tree(bt->dir); free_percpu(bt->sequence); + free_percpu(bt->msg_data); kfree(bt); } @@ -360,6 +364,10 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->sequence) goto err; + bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG); + if (!bt->msg_data) + goto err; + ret = -ENOENT; dir = blk_create_tree(buts->name); if (!dir) @@ -406,6 +414,7 @@ err: if (bt->dropped_file) debugfs_remove(bt->dropped_file); free_percpu(bt->sequence); + free_percpu(bt->msg_data); if (bt->rchan) relay_close(bt->rchan); kfree(bt); diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index b7cd8f1eedbe..e3ef903aae88 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -121,6 +121,7 @@ struct blk_trace { int trace_state; struct rchan *rchan; unsigned long *sequence; + unsigned char *msg_data; u16 act_mask; u64 start_lba; u64 end_lba; @@ -172,7 +173,7 @@ extern void __trace_note_message(struct blk_trace *, const char *fmt, ...); if (unlikely(bt)) \ __trace_note_message(bt, fmt, ##__VA_ARGS__); \ } while (0) -#define BLK_TN_MAX_MSG 1024 +#define BLK_TN_MAX_MSG 128 /** * blk_add_trace_rq - Add a trace for a request oriented action -- cgit v1.2.3 From d6de8be711b28049a5cb93c954722c311c7d3f7f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 28 May 2008 14:46:59 +0200 Subject: cfq-iosched: fix RCU problem in cfq_cic_lookup() cfq_cic_lookup() needs to properly protect ioc->ioc_data before dereferencing it and also exclude updaters of ioc->ioc_data as well. Also add a number of comments documenting why the existing RCU usage is OK. Thanks a lot to "Paul E. McKenney" for review and comments! Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 4df3f0522435..d01b411c72f0 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1142,6 +1142,9 @@ static void cfq_put_queue(struct cfq_queue *cfqq) kmem_cache_free(cfq_pool, cfqq); } +/* + * Must always be called with the rcu_read_lock() held + */ static void __call_for_each_cic(struct io_context *ioc, void (*func)(struct io_context *, struct cfq_io_context *)) @@ -1197,6 +1200,11 @@ static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic) cfq_cic_free(cic); } +/* + * Must be called with rcu_read_lock() held or preemption otherwise disabled. + * Only two callers of this - ->dtor() which is called with the rcu_read_lock(), + * and ->trim() which is called with the task lock held + */ static void cfq_free_io_context(struct io_context *ioc) { /* @@ -1502,20 +1510,24 @@ static struct cfq_io_context * cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc) { struct cfq_io_context *cic; + unsigned long flags; void *k; if (unlikely(!ioc)) return NULL; + rcu_read_lock(); + /* * we maintain a last-hit cache, to avoid browsing over the tree */ cic = rcu_dereference(ioc->ioc_data); - if (cic && cic->key == cfqd) + if (cic && cic->key == cfqd) { + rcu_read_unlock(); return cic; + } do { - rcu_read_lock(); cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd); rcu_read_unlock(); if (!cic) @@ -1524,10 +1536,13 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc) k = cic->key; if (unlikely(!k)) { cfq_drop_dead_cic(cfqd, ioc, cic); + rcu_read_lock(); continue; } + spin_lock_irqsave(&ioc->lock, flags); rcu_assign_pointer(ioc->ioc_data, cic); + spin_unlock_irqrestore(&ioc->lock, flags); break; } while (1); @@ -2134,6 +2149,10 @@ static void *cfq_init_queue(struct request_queue *q) static void cfq_slab_kill(void) { + /* + * Caller already ensured that pending RCU callbacks are completed, + * so we should have no busy allocations at this point. + */ if (cfq_pool) kmem_cache_destroy(cfq_pool); if (cfq_ioc_pool) @@ -2292,6 +2311,11 @@ static void __exit cfq_exit(void) ioc_gone = &all_gone; /* ioc_gone's update must be visible before reading ioc_count */ smp_wmb(); + + /* + * this also protects us from entering cfq_slab_kill() with + * pending RCU callbacks + */ if (elv_ioc_count_read(ioc_count)) wait_for_completion(ioc_gone); cfq_slab_kill(); -- cgit v1.2.3 From d5791d13b1d45542895104edf4b09476d5ad24b0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 9 Jun 2008 10:06:24 -0700 Subject: Fix invalid access errors in blk_lookup_devt Commit 30f2f0eb4bd2c43d10a8b0d872c6e5ad8f31c9a0 ("block: do_mounts - accept root=") extended blk_lookup_devt() to be able to look up partitions that had not yet been registered, but in the process made the assumption that the '&block_class.devices' list only contains disk devices and that you can do 'dev_to_disk(dev)' on them. That isn't actually true. The block_class device list also contains the partitions we've discovered so far, and you can't just do a 'dev_to_disk()' on those. So make sure to only work on devices that block/genhd.c has registered itself, something we can test by checking the 'dev->type' member. This makes the loop in blk_lookup_devt() match the other such loops in this file. [ We may want to do an alternate version that knows to handle _either_ whole-disk devices or partitions, but for now this is the minimal fix for a series of crashes reported by Mariusz Kozlowski in http://lkml.org/lkml/2008/5/25/25 and Ingo in http://lkml.org/lkml/2008/6/9/39 ] Reported-by: Mariusz Kozlowski Reported-by: Ingo Molnar Cc: Neil Brown Cc: Joao Luis Meloni Assirati Acked-by: Kay Sievers Cc: Greg Kroah-Hartman Signed-off-by: Linus Torvalds --- block/genhd.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 129ad939f9dd..b922d4801c87 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -660,6 +660,8 @@ dev_t blk_lookup_devt(const char *name, int part) mutex_lock(&block_class_lock); list_for_each_entry(dev, &block_class.devices, node) { + if (dev->type != &disk_type) + continue; if (strcmp(dev->bus_id, name) == 0) { struct gendisk *disk = dev_to_disk(dev); -- cgit v1.2.3 From 14a73f54798f39854e521fb596da7d50b7566bbd Mon Sep 17 00:00:00 2001 From: Carl Henrik Lunde Date: Thu, 12 Jun 2008 20:13:58 +0200 Subject: block: disable IRQs until data is written to relay channel As we may run relay_reserve from interrupt context we must always disable IRQs. This is because a call to relay_reserve may expose previously written data to use space. Updated new message code and an old but related comment. Signed-off-by: Carl Henrik Lunde Signed-off-by: Jens Axboe Signed-off-by: Linus Torvalds --- block/blktrace.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blktrace.c b/block/blktrace.c index 7ae87cc4a163..8d3a27780260 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -79,16 +79,17 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) { int n; va_list args; + unsigned long flags; char *buf; - preempt_disable(); + local_irq_save(flags); buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); va_start(args, fmt); n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); va_end(args); trace_note(bt, 0, BLK_TN_MESSAGE, buf, n); - preempt_enable(); + local_irq_restore(flags); } EXPORT_SYMBOL_GPL(__trace_note_message); @@ -158,10 +159,7 @@ void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, /* * A word about the locking here - we disable interrupts to reserve * some space in the relay per-cpu buffer, to prevent an irq - * from coming in and stepping on our toes. Once reserved, it's - * enough to get preemption disabled to prevent read of this data - * before we are through filling it. get_cpu()/put_cpu() does this - * for us + * from coming in and stepping on our toes. */ local_irq_save(flags); -- cgit v1.2.3