diff options
60 files changed, 7965 insertions, 3749 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index 45d2717760fc..b8a717c4f863 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3665,13 +3665,12 @@ F: drivers/scsi/dpt* F: drivers/scsi/dpt/ DRBD DRIVER -P: Philipp Reisner -P: Lars Ellenberg -M: drbd-dev@lists.linbit.com -L: drbd-user@lists.linbit.com +M: Philipp Reisner <philipp.reisner@linbit.com> +M: Lars Ellenberg <lars.ellenberg@linbit.com> +L: drbd-dev@lists.linbit.com W: http://www.drbd.org -T: git git://git.drbd.org/linux-2.6-drbd.git drbd -T: git git://git.drbd.org/drbd-8.3.git +T: git git://git.linbit.com/linux-drbd.git +T: git git://git.linbit.com/drbd-8.4.git S: Supported F: drivers/block/drbd/ F: lib/lru_cache.c diff --git a/block/bio-integrity.c b/block/bio-integrity.c index f6325d573c10..711e4d8de6fa 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -66,7 +66,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, } if (unlikely(!bip)) - return NULL; + return ERR_PTR(-ENOMEM); memset(bip, 0, sizeof(*bip)); @@ -89,7 +89,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, return bip; err: mempool_free(bip, bs->bio_integrity_pool); - return NULL; + return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL(bio_integrity_alloc); @@ -298,10 +298,10 @@ int bio_integrity_prep(struct bio *bio) /* Allocate bio integrity payload and integrity vectors */ bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages); - if (unlikely(bip == NULL)) { + if (IS_ERR(bip)) { printk(KERN_ERR "could not allocate data integrity bioset\n"); kfree(buf); - return -EIO; + return PTR_ERR(bip); } bip->bip_flags |= BIP_BLOCK_INTEGRITY; @@ -465,9 +465,8 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, BUG_ON(bip_src == NULL); bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt); - - if (bip == NULL) - return -EIO; + if (IS_ERR(bip)) + return PTR_ERR(bip); memcpy(bip->bip_vec, bip_src->bip_vec, bip_src->bip_vcnt * sizeof(struct bio_vec)); diff --git a/block/blk-core.c b/block/blk-core.c index 476244d59309..ab51685988c2 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -680,6 +680,13 @@ static void blk_queue_usage_counter_release(struct percpu_ref *ref) wake_up_all(&q->mq_freeze_wq); } +static void blk_rq_timed_out_timer(unsigned long data) +{ + struct request_queue *q = (struct request_queue *)data; + + kblockd_schedule_work(&q->timeout_work); +} + struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) { struct request_queue *q; @@ -841,6 +848,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, if (blk_init_rl(&q->root_rl, q, GFP_KERNEL)) goto fail; + INIT_WORK(&q->timeout_work, blk_timeout_work); q->request_fn = rfn; q->prep_rq_fn = NULL; q->unprep_rq_fn = NULL; diff --git a/block/blk-mq.c b/block/blk-mq.c index 6889d7183a2a..4c0622fae413 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -603,8 +603,6 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, blk_mq_complete_request(rq, -EIO); return; } - if (rq->cmd_flags & REQ_NO_TIMEOUT) - return; if (time_after_eq(jiffies, rq->deadline)) { if (!blk_mark_rq_complete(rq)) @@ -615,15 +613,19 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, } } -static void blk_mq_rq_timer(unsigned long priv) +static void blk_mq_timeout_work(struct work_struct *work) { - struct request_queue *q = (struct request_queue *)priv; + struct request_queue *q = + container_of(work, struct request_queue, timeout_work); struct blk_mq_timeout_data data = { .next = 0, .next_set = 0, }; int i; + if (blk_queue_enter(q, true)) + return; + blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data); if (data.next_set) { @@ -638,6 +640,7 @@ static void blk_mq_rq_timer(unsigned long priv) blk_mq_tag_idle(hctx); } } + blk_queue_exit(q); } /* @@ -2008,7 +2011,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, hctxs[i]->queue_num = i; } - setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); + INIT_WORK(&q->timeout_work, blk_mq_timeout_work); blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); q->nr_queues = nr_cpu_ids; diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 3610af561748..a30441a200c0 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -127,13 +127,16 @@ static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout } } -void blk_rq_timed_out_timer(unsigned long data) +void blk_timeout_work(struct work_struct *work) { - struct request_queue *q = (struct request_queue *) data; + struct request_queue *q = + container_of(work, struct request_queue, timeout_work); unsigned long flags, next = 0; struct request *rq, *tmp; int next_set = 0; + if (blk_queue_enter(q, true)) + return; spin_lock_irqsave(q->queue_lock, flags); list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) @@ -143,6 +146,7 @@ void blk_rq_timed_out_timer(unsigned long data) mod_timer(&q->timeout, round_jiffies_up(next)); spin_unlock_irqrestore(q->queue_lock, flags); + blk_queue_exit(q); } /** @@ -193,9 +197,6 @@ void blk_add_timer(struct request *req) struct request_queue *q = req->q; unsigned long expiry; - if (req->cmd_flags & REQ_NO_TIMEOUT) - return; - /* blk-mq has its own handler, so we don't need ->rq_timed_out_fn */ if (!q->mq_ops && !q->rq_timed_out_fn) return; diff --git a/block/blk.h b/block/blk.h index c43926d3d74d..70e4aee9cdcb 100644 --- a/block/blk.h +++ b/block/blk.h @@ -93,7 +93,7 @@ static inline void blk_flush_integrity(void) } #endif -void blk_rq_timed_out_timer(unsigned long data); +void blk_timeout_work(struct work_struct *work); unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); void blk_delete_timer(struct request *); diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index b3868e7a1ffd..10459a145062 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -288,7 +288,162 @@ bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval * return need_transaction; } -static int al_write_transaction(struct drbd_device *device); +#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT) +/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT + * are still coupled, or assume too much about their relation. + * Code below will not work if this is violated. + * Will be cleaned up with some followup patch. + */ +# error FIXME +#endif + +static unsigned int al_extent_to_bm_page(unsigned int al_enr) +{ + return al_enr >> + /* bit to page */ + ((PAGE_SHIFT + 3) - + /* al extent number to bit */ + (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)); +} + +static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device) +{ + const unsigned int stripes = device->ldev->md.al_stripes; + const unsigned int stripe_size_4kB = device->ldev->md.al_stripe_size_4k; + + /* transaction number, modulo on-disk ring buffer wrap around */ + unsigned int t = device->al_tr_number % (device->ldev->md.al_size_4k); + + /* ... to aligned 4k on disk block */ + t = ((t % stripes) * stripe_size_4kB) + t/stripes; + + /* ... to 512 byte sector in activity log */ + t *= 8; + + /* ... plus offset to the on disk position */ + return device->ldev->md.md_offset + device->ldev->md.al_offset + t; +} + +static int __al_write_transaction(struct drbd_device *device, struct al_transaction_on_disk *buffer) +{ + struct lc_element *e; + sector_t sector; + int i, mx; + unsigned extent_nr; + unsigned crc = 0; + int err = 0; + + memset(buffer, 0, sizeof(*buffer)); + buffer->magic = cpu_to_be32(DRBD_AL_MAGIC); + buffer->tr_number = cpu_to_be32(device->al_tr_number); + + i = 0; + + /* Even though no one can start to change this list + * once we set the LC_LOCKED -- from drbd_al_begin_io(), + * lc_try_lock_for_transaction() --, someone may still + * be in the process of changing it. */ + spin_lock_irq(&device->al_lock); + list_for_each_entry(e, &device->act_log->to_be_changed, list) { + if (i == AL_UPDATES_PER_TRANSACTION) { + i++; + break; + } + buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index); + buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number); + if (e->lc_number != LC_FREE) + drbd_bm_mark_for_writeout(device, + al_extent_to_bm_page(e->lc_number)); + i++; + } + spin_unlock_irq(&device->al_lock); + BUG_ON(i > AL_UPDATES_PER_TRANSACTION); + + buffer->n_updates = cpu_to_be16(i); + for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) { + buffer->update_slot_nr[i] = cpu_to_be16(-1); + buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE); + } + + buffer->context_size = cpu_to_be16(device->act_log->nr_elements); + buffer->context_start_slot_nr = cpu_to_be16(device->al_tr_cycle); + + mx = min_t(int, AL_CONTEXT_PER_TRANSACTION, + device->act_log->nr_elements - device->al_tr_cycle); + for (i = 0; i < mx; i++) { + unsigned idx = device->al_tr_cycle + i; + extent_nr = lc_element_by_index(device->act_log, idx)->lc_number; + buffer->context[i] = cpu_to_be32(extent_nr); + } + for (; i < AL_CONTEXT_PER_TRANSACTION; i++) + buffer->context[i] = cpu_to_be32(LC_FREE); + + device->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION; + if (device->al_tr_cycle >= device->act_log->nr_elements) + device->al_tr_cycle = 0; + + sector = al_tr_number_to_on_disk_sector(device); + + crc = crc32c(0, buffer, 4096); + buffer->crc32c = cpu_to_be32(crc); + + if (drbd_bm_write_hinted(device)) + err = -EIO; + else { + bool write_al_updates; + rcu_read_lock(); + write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates; + rcu_read_unlock(); + if (write_al_updates) { + if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) { + err = -EIO; + drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR); + } else { + device->al_tr_number++; + device->al_writ_cnt++; + } + } + } + + return err; +} + +static int al_write_transaction(struct drbd_device *device) +{ + struct al_transaction_on_disk *buffer; + int err; + + if (!get_ldev(device)) { + drbd_err(device, "disk is %s, cannot start al transaction\n", + drbd_disk_str(device->state.disk)); + return -EIO; + } + + /* The bitmap write may have failed, causing a state change. */ + if (device->state.disk < D_INCONSISTENT) { + drbd_err(device, + "disk is %s, cannot write al transaction\n", + drbd_disk_str(device->state.disk)); + put_ldev(device); + return -EIO; + } + + /* protects md_io_buffer, al_tr_cycle, ... */ + buffer = drbd_md_get_buffer(device, __func__); + if (!buffer) { + drbd_err(device, "disk failed while waiting for md_io buffer\n"); + put_ldev(device); + return -ENODEV; + } + + err = __al_write_transaction(device, buffer); + + drbd_md_put_buffer(device); + put_ldev(device); + + return err; +} + void drbd_al_begin_io_commit(struct drbd_device *device) { @@ -420,153 +575,6 @@ void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i) wake_up(&device->al_wait); } -#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT) -/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT - * are still coupled, or assume too much about their relation. - * Code below will not work if this is violated. - * Will be cleaned up with some followup patch. - */ -# error FIXME -#endif - -static unsigned int al_extent_to_bm_page(unsigned int al_enr) -{ - return al_enr >> - /* bit to page */ - ((PAGE_SHIFT + 3) - - /* al extent number to bit */ - (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)); -} - -static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device) -{ - const unsigned int stripes = device->ldev->md.al_stripes; - const unsigned int stripe_size_4kB = device->ldev->md.al_stripe_size_4k; - - /* transaction number, modulo on-disk ring buffer wrap around */ - unsigned int t = device->al_tr_number % (device->ldev->md.al_size_4k); - - /* ... to aligned 4k on disk block */ - t = ((t % stripes) * stripe_size_4kB) + t/stripes; - - /* ... to 512 byte sector in activity log */ - t *= 8; - - /* ... plus offset to the on disk position */ - return device->ldev->md.md_offset + device->ldev->md.al_offset + t; -} - -int al_write_transaction(struct drbd_device *device) -{ - struct al_transaction_on_disk *buffer; - struct lc_element *e; - sector_t sector; - int i, mx; - unsigned extent_nr; - unsigned crc = 0; - int err = 0; - - if (!get_ldev(device)) { - drbd_err(device, "disk is %s, cannot start al transaction\n", - drbd_disk_str(device->state.disk)); - return -EIO; - } - - /* The bitmap write may have failed, causing a state change. */ - if (device->state.disk < D_INCONSISTENT) { - drbd_err(device, - "disk is %s, cannot write al transaction\n", - drbd_disk_str(device->state.disk)); - put_ldev(device); - return -EIO; - } - - /* protects md_io_buffer, al_tr_cycle, ... */ - buffer = drbd_md_get_buffer(device, __func__); - if (!buffer) { - drbd_err(device, "disk failed while waiting for md_io buffer\n"); - put_ldev(device); - return -ENODEV; - } - - memset(buffer, 0, sizeof(*buffer)); - buffer->magic = cpu_to_be32(DRBD_AL_MAGIC); - buffer->tr_number = cpu_to_be32(device->al_tr_number); - - i = 0; - - /* Even though no one can start to change this list - * once we set the LC_LOCKED -- from drbd_al_begin_io(), - * lc_try_lock_for_transaction() --, someone may still - * be in the process of changing it. */ - spin_lock_irq(&device->al_lock); - list_for_each_entry(e, &device->act_log->to_be_changed, list) { - if (i == AL_UPDATES_PER_TRANSACTION) { - i++; - break; - } - buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index); - buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number); - if (e->lc_number != LC_FREE) - drbd_bm_mark_for_writeout(device, - al_extent_to_bm_page(e->lc_number)); - i++; - } - spin_unlock_irq(&device->al_lock); - BUG_ON(i > AL_UPDATES_PER_TRANSACTION); - - buffer->n_updates = cpu_to_be16(i); - for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) { - buffer->update_slot_nr[i] = cpu_to_be16(-1); - buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE); - } - - buffer->context_size = cpu_to_be16(device->act_log->nr_elements); - buffer->context_start_slot_nr = cpu_to_be16(device->al_tr_cycle); - - mx = min_t(int, AL_CONTEXT_PER_TRANSACTION, - device->act_log->nr_elements - device->al_tr_cycle); - for (i = 0; i < mx; i++) { - unsigned idx = device->al_tr_cycle + i; - extent_nr = lc_element_by_index(device->act_log, idx)->lc_number; - buffer->context[i] = cpu_to_be32(extent_nr); - } - for (; i < AL_CONTEXT_PER_TRANSACTION; i++) - buffer->context[i] = cpu_to_be32(LC_FREE); - - device->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION; - if (device->al_tr_cycle >= device->act_log->nr_elements) - device->al_tr_cycle = 0; - - sector = al_tr_number_to_on_disk_sector(device); - - crc = crc32c(0, buffer, 4096); - buffer->crc32c = cpu_to_be32(crc); - - if (drbd_bm_write_hinted(device)) - err = -EIO; - else { - bool write_al_updates; - rcu_read_lock(); - write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates; - rcu_read_unlock(); - if (write_al_updates) { - if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) { - err = -EIO; - drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR); - } else { - device->al_tr_number++; - device->al_writ_cnt++; - } - } - } - - drbd_md_put_buffer(device); - put_ldev(device); - - return err; -} - static int _try_lc_del(struct drbd_device *device, struct lc_element *al_ext) { int rv; @@ -606,21 +614,24 @@ void drbd_al_shrink(struct drbd_device *device) wake_up(&device->al_wait); } -int drbd_initialize_al(struct drbd_device *device, void *buffer) +int drbd_al_initialize(struct drbd_device *device, void *buffer) { struct al_transaction_on_disk *al = buffer; struct drbd_md *md = &device->ldev->md; - sector_t al_base = md->md_offset + md->al_offset; int al_size_4k = md->al_stripes * md->al_stripe_size_4k; int i; - memset(al, 0, 4096); - al->magic = cpu_to_be32(DRBD_AL_MAGIC); - al->transaction_type = cpu_to_be16(AL_TR_INITIALIZED); - al->crc32c = cpu_to_be32(crc32c(0, al, 4096)); + __al_write_transaction(device, al); + /* There may or may not have been a pending transaction. */ + spin_lock_irq(&device->al_lock); + lc_committed(device->act_log); + spin_unlock_irq(&device->al_lock); - for (i = 0; i < al_size_4k; i++) { - int err = drbd_md_sync_page_io(device, device->ldev, al_base + i * 8, WRITE); + /* The rest of the transactions will have an empty "updates" list, and + * are written out only to provide the context, and to initialize the + * on-disk ring buffer. */ + for (i = 1; i < al_size_4k; i++) { + int err = __al_write_transaction(device, al); if (err) return err; } diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 9462d2752850..0dabc9b93725 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -24,7 +24,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/bitops.h> +#include <linux/bitmap.h> #include <linux/vmalloc.h> #include <linux/string.h> #include <linux/drbd.h> @@ -479,8 +479,14 @@ void drbd_bm_cleanup(struct drbd_device *device) * this masks out the remaining bits. * Returns the number of bits cleared. */ +#ifndef BITS_PER_PAGE #define BITS_PER_PAGE (1UL << (PAGE_SHIFT + 3)) #define BITS_PER_PAGE_MASK (BITS_PER_PAGE - 1) +#else +# if BITS_PER_PAGE != (1UL << (PAGE_SHIFT + 3)) +# error "ambiguous BITS_PER_PAGE" +# endif +#endif #define BITS_PER_LONG_MASK (BITS_PER_LONG - 1) static int bm_clear_surplus(struct drbd_bitmap *b) { @@ -559,21 +565,19 @@ static unsigned long bm_count_bits(struct drbd_bitmap *b) unsigned long *p_addr; unsigned long bits = 0; unsigned long mask = (1UL << (b->bm_bits & BITS_PER_LONG_MASK)) -1; - int idx, i, last_word; + int idx, last_word; /* all but last page */ for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) { p_addr = __bm_map_pidx(b, idx); - for (i = 0; i < LWPP; i++) - bits += hweight_long(p_addr[i]); + bits += bitmap_weight(p_addr, BITS_PER_PAGE); __bm_unmap(p_addr); cond_resched(); } /* last (or only) page */ last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL; p_addr = __bm_map_pidx(b, idx); - for (i = 0; i < last_word; i++) - bits += hweight_long(p_addr[i]); + bits += bitmap_weight(p_addr, last_word * BITS_PER_LONG); p_addr[last_word] &= cpu_to_lel(mask); bits += hweight_long(p_addr[last_word]); /* 32bit arch, may have an unused padding long */ @@ -1419,6 +1423,9 @@ static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b, int bits; int changed = 0; unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]); + + /* I think it is more cache line friendly to hweight_long then set to ~0UL, + * than to first bitmap_weight() all words, then bitmap_fill() all words */ for (i = first_word; i < last_word; i++) { bits = hweight_long(paddr[i]); paddr[i] = ~0UL; @@ -1628,8 +1635,7 @@ int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr) int n = e-s; p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s)); bm = p_addr + MLPP(s); - while (n--) - count += hweight_long(*bm++); + count += bitmap_weight(bm, n * BITS_PER_LONG); bm_unmap(p_addr); } else { drbd_err(device, "start offset (%d) too large in drbd_bm_e_weight\n", s); diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c index 6b88a35fb048..96a0107a72ea 100644 --- a/drivers/block/drbd/drbd_debugfs.c +++ b/drivers/block/drbd/drbd_debugfs.c @@ -771,6 +771,13 @@ static int device_data_gen_id_show(struct seq_file *m, void *ignored) return 0; } +static int device_ed_gen_id_show(struct seq_file *m, void *ignored) +{ + struct drbd_device *device = m->private; + seq_printf(m, "0x%016llX\n", (unsigned long long)device->ed_uuid); + return 0; +} + #define drbd_debugfs_device_attr(name) \ static int device_ ## name ## _open(struct inode *inode, struct file *file) \ { \ @@ -796,6 +803,7 @@ drbd_debugfs_device_attr(oldest_requests) drbd_debugfs_device_attr(act_log_extents) drbd_debugfs_device_attr(resync_extents) drbd_debugfs_device_attr(data_gen_id) +drbd_debugfs_device_attr(ed_gen_id) void drbd_debugfs_device_add(struct drbd_device *device) { @@ -839,6 +847,7 @@ void drbd_debugfs_device_add(struct drbd_device *device) DCF(act_log_extents); DCF(resync_extents); DCF(data_gen_id); + DCF(ed_gen_id); #undef DCF return; @@ -854,6 +863,7 @@ void drbd_debugfs_device_cleanup(struct drbd_device *device) drbd_debugfs_remove(&device->debugfs_vol_act_log_extents); drbd_debugfs_remove(&device->debugfs_vol_resync_extents); drbd_debugfs_remove(&device->debugfs_vol_data_gen_id); + drbd_debugfs_remove(&device->debugfs_vol_ed_gen_id); drbd_debugfs_remove(&device->debugfs_vol); } diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index e66d453a5f2b..b6844feb9f9b 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -77,13 +77,6 @@ extern int fault_devs; extern char usermode_helper[]; -/* I don't remember why XCPU ... - * This is used to wake the asender, - * and to interrupt sending the sending task - * on disconnect. - */ -#define DRBD_SIG SIGXCPU - /* This is used to stop/restart our threads. * Cannot use SIGTERM nor SIGKILL, since these * are sent out by init on runlevel changes @@ -292,6 +285,9 @@ struct drbd_device_work { extern int drbd_wait_misc(struct drbd_device *, struct drbd_interval *); +extern void lock_all_resources(void); +extern void unlock_all_resources(void); + struct drbd_request { struct drbd_work w; struct drbd_device *device; @@ -504,7 +500,6 @@ enum { MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */ - SUSPEND_IO, /* suspend application io */ BITMAP_IO, /* suspend application io; once no more io in flight, start bitmap io */ BITMAP_IO_QUEUED, /* Started bitmap IO */ @@ -632,12 +627,6 @@ struct bm_io_work { void (*done)(struct drbd_device *device, int rv); }; -enum write_ordering_e { - WO_none, - WO_drain_io, - WO_bdev_flush, -}; - struct fifo_buffer { unsigned int head_index; unsigned int size; @@ -650,8 +639,7 @@ extern struct fifo_buffer *fifo_alloc(int fifo_size); enum { NET_CONGESTED, /* The data socket is congested */ RESOLVE_CONFLICTS, /* Set on one node, cleared on the peer! */ - SEND_PING, /* whether asender should send a ping asap */ - SIGNAL_ASENDER, /* whether asender wants to be interrupted */ + SEND_PING, GOT_PING_ACK, /* set when we receive a ping_ack packet, ping_wait gets woken */ CONN_WD_ST_CHG_REQ, /* A cluster wide state change on the connection is active */ CONN_WD_ST_CHG_OKAY, @@ -670,6 +658,8 @@ enum { DEVICE_WORK_PENDING, /* tell worker that some device has pending work */ }; +enum which_state { NOW, OLD = NOW, NEW }; + struct drbd_resource { char *name; #ifdef CONFIG_DEBUG_FS @@ -755,7 +745,8 @@ struct drbd_connection { unsigned long last_reconnect_jif; struct drbd_thread receiver; struct drbd_thread worker; - struct drbd_thread asender; + struct drbd_thread ack_receiver; + struct workqueue_struct *ack_sender; /* cached pointers, * so we can look up the oldest pending requests more quickly. @@ -774,6 +765,8 @@ struct drbd_connection { struct drbd_thread_timing_details r_timing_details[DRBD_THREAD_DETAILS_HIST]; struct { + unsigned long last_sent_barrier_jif; + /* whether this sender thread * has processed a single write yet. */ bool seen_any_write_yet; @@ -788,6 +781,17 @@ struct drbd_connection { } send; }; +static inline bool has_net_conf(struct drbd_connection *connection) +{ + bool has_net_conf; + + rcu_read_lock(); + has_net_conf = rcu_dereference(connection->net_conf); + rcu_read_unlock(); + + return has_net_conf; +} + void __update_timing_details( struct drbd_thread_timing_details *tdp, unsigned int *cb_nr, @@ -811,6 +815,7 @@ struct drbd_peer_device { struct list_head peer_devices; struct drbd_device *device; struct drbd_connection *connection; + struct work_struct send_acks_work; #ifdef CONFIG_DEBUG_FS struct dentry *debugfs_peer_dev; #endif @@ -829,6 +834,7 @@ struct drbd_device { struct dentry *debugfs_vol_act_log_extents; struct dentry *debugfs_vol_resync_extents; struct dentry *debugfs_vol_data_gen_id; + struct dentry *debugfs_vol_ed_gen_id; #endif unsigned int vnr; /* volume number within the connection */ @@ -873,6 +879,7 @@ struct drbd_device { atomic_t rs_pending_cnt; /* RS request/data packets on the wire */ atomic_t unacked_cnt; /* Need to send replies for */ atomic_t local_cnt; /* Waiting for local completion */ + atomic_t suspend_cnt; /* Interval tree of pending local requests */ struct rb_root read_requests; @@ -1020,6 +1027,12 @@ static inline struct drbd_peer_device *first_peer_device(struct drbd_device *dev return list_first_entry_or_null(&device->peer_devices, struct drbd_peer_device, peer_devices); } +static inline struct drbd_peer_device * +conn_peer_device(struct drbd_connection *connection, int volume_number) +{ + return idr_find(&connection->peer_devices, volume_number); +} + #define for_each_resource(resource, _resources) \ list_for_each_entry(resource, _resources, resources) @@ -1113,7 +1126,7 @@ extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int extern int drbd_send_bitmap(struct drbd_device *device); extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode); extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode); -extern void drbd_free_ldev(struct drbd_backing_dev *ldev); +extern void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev); extern void drbd_device_cleanup(struct drbd_device *device); void drbd_print_uuids(struct drbd_device *device, const char *text); @@ -1424,7 +1437,7 @@ extern struct bio_set *drbd_md_io_bio_set; /* to allocate from that set */ extern struct bio *bio_alloc_drbd(gfp_t gfp_mask); -extern rwlock_t global_state_lock; +extern struct mutex resources_mutex; extern int conn_lowest_minor(struct drbd_connection *connection); extern enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor); @@ -1454,6 +1467,9 @@ extern int is_valid_ar_handle(struct drbd_request *, sector_t); /* drbd_nl.c */ + +extern struct mutex notification_mutex; + extern void drbd_suspend_io(struct drbd_device *device); extern void drbd_resume_io(struct drbd_device *device); extern char *ppsize(char *buf, unsigned long long size); @@ -1536,7 +1552,9 @@ extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req); /* drbd_receiver.c */ extern int drbd_receiver(struct drbd_thread *thi); -extern int drbd_asender(struct drbd_thread *thi); +extern int drbd_ack_receiver(struct drbd_thread *thi); +extern void drbd_send_ping_wf(struct work_struct *ws); +extern void drbd_send_acks_wf(struct work_struct *ws); extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device); extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector, bool throttle_if_app_is_waiting); @@ -1649,7 +1667,7 @@ extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int s #define drbd_rs_failed_io(device, sector, size) \ __drbd_change_sync(device, sector, size, RECORD_RS_FAILED) extern void drbd_al_shrink(struct drbd_device *device); -extern int drbd_initialize_al(struct drbd_device *, void *); +extern int drbd_al_initialize(struct drbd_device *, void *); /* drbd_nl.c */ /* state info broadcast */ @@ -1668,6 +1686,29 @@ struct sib_info { }; void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib); +extern void notify_resource_state(struct sk_buff *, + unsigned int, + struct drbd_resource *, + struct resource_info *, + enum drbd_notification_type); +extern void notify_device_state(struct sk_buff *, + unsigned int, + struct drbd_device *, + struct device_info *, + enum drbd_notification_type); +extern void notify_connection_state(struct sk_buff *, + unsigned int, + struct drbd_connection *, + struct connection_info *, + enum drbd_notification_type); +extern void notify_peer_device_state(struct sk_buff *, + unsigned int, + struct drbd_peer_device *, + struct peer_device_info *, + enum drbd_notification_type); +extern void notify_helper(enum drbd_notification_type, struct drbd_device *, + struct drbd_connection *, const char *, int); + /* * inline helper functions *************************/ @@ -1694,19 +1735,6 @@ static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_r return 0; } -static inline enum drbd_state_rv -_drbd_set_state(struct drbd_device *device, union drbd_state ns, - enum chg_state_flags flags, struct completion *done) -{ - enum drbd_state_rv rv; - - read_lock(&global_state_lock); - rv = __drbd_set_state(device, ns, flags, done); - read_unlock(&global_state_lock); - - return rv; -} - static inline union drbd_state drbd_read_state(struct drbd_device *device) { struct drbd_resource *resource = device->resource; @@ -1937,16 +1965,21 @@ drbd_device_post_work(struct drbd_device *device, int work_bit) extern void drbd_flush_workqueue(struct drbd_work_queue *work_queue); -static inline void wake_asender(struct drbd_connection *connection) +/* To get the ack_receiver out of the blocking network stack, + * so it can change its sk_rcvtimeo from idle- to ping-timeout, + * and send a ping, we need to send a signal. + * Which signal we send is irrelevant. */ +static inline void wake_ack_receiver(struct drbd_connection *connection) { - if (test_bit(SIGNAL_ASENDER, &connection->flags)) - force_sig(DRBD_SIG, connection->asender.task); + struct task_struct *task = connection->ack_receiver.task; + if (task && get_t_state(&connection->ack_receiver) == RUNNING) + force_sig(SIGXCPU, task); } static inline void request_ping(struct drbd_connection *connection) { set_bit(SEND_PING, &connection->flags); - wake_asender(connection); + wake_ack_receiver(connection); } extern void *conn_prepare_command(struct drbd_connection *, struct drbd_socket *); @@ -2230,7 +2263,7 @@ static inline bool may_inc_ap_bio(struct drbd_device *device) if (drbd_suspended(device)) return false; - if (test_bit(SUSPEND_IO, &device->flags)) + if (atomic_read(&device->suspend_cnt)) return false; /* to avoid potential deadlock or bitmap corruption, diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 74d97f4bac34..5b43dfb79819 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -117,6 +117,7 @@ module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0 */ struct idr drbd_devices; struct list_head drbd_resources; +struct mutex resources_mutex; struct kmem_cache *drbd_request_cache; struct kmem_cache *drbd_ee_cache; /* peer requests */ @@ -1435,8 +1436,8 @@ static int we_should_drop_the_connection(struct drbd_connection *connection, str /* long elapsed = (long)(jiffies - device->last_received); */ drop_it = connection->meta.socket == sock - || !connection->asender.task - || get_t_state(&connection->asender) != RUNNING + || !connection->ack_receiver.task + || get_t_state(&connection->ack_receiver) != RUNNING || connection->cstate < C_WF_REPORT_PARAMS; if (drop_it) @@ -1793,15 +1794,6 @@ int drbd_send(struct drbd_connection *connection, struct socket *sock, drbd_update_congested(connection); } do { - /* STRANGE - * tcp_sendmsg does _not_ use its size parameter at all ? - * - * -EAGAIN on timeout, -EINTR on signal. - */ -/* THINK - * do we need to block DRBD_SIG if sock == &meta.socket ?? - * otherwise wake_asender() might interrupt some send_*Ack ! - */ rv = kernel_sendmsg(sock, &msg, &iov, 1, size); if (rv == -EAGAIN) { if (we_should_drop_the_connection(connection, sock)) @@ -2000,7 +1992,7 @@ void drbd_device_cleanup(struct drbd_device *device) drbd_bm_cleanup(device); } - drbd_free_ldev(device->ldev); + drbd_backing_dev_free(device, device->ldev); device->ldev = NULL; clear_bit(AL_SUSPENDED, &device->flags); @@ -2179,7 +2171,7 @@ void drbd_destroy_device(struct kref *kref) if (device->this_bdev) bdput(device->this_bdev); - drbd_free_ldev(device->ldev); + drbd_backing_dev_free(device, device->ldev); device->ldev = NULL; drbd_release_all_peer_reqs(device); @@ -2563,7 +2555,7 @@ int set_resource_options(struct drbd_resource *resource, struct res_opts *res_op cpumask_copy(resource->cpu_mask, new_cpu_mask); for_each_connection_rcu(connection, resource) { connection->receiver.reset_cpu_mask = 1; - connection->asender.reset_cpu_mask = 1; + connection->ack_receiver.reset_cpu_mask = 1; connection->worker.reset_cpu_mask = 1; } } @@ -2590,7 +2582,7 @@ struct drbd_resource *drbd_create_resource(const char *name) kref_init(&resource->kref); idr_init(&resource->devices); INIT_LIST_HEAD(&resource->connections); - resource->write_ordering = WO_bdev_flush; + resource->write_ordering = WO_BDEV_FLUSH; list_add_tail_rcu(&resource->resources, &drbd_resources); mutex_init(&resource->conf_update); mutex_init(&resource->adm_mutex); @@ -2652,8 +2644,8 @@ struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts) connection->receiver.connection = connection; drbd_thread_init(resource, &connection->worker, drbd_worker, "worker"); connection->worker.connection = connection; - drbd_thread_init(resource, &connection->asender, drbd_asender, "asender"); - connection->asender.connection = connection; + drbd_thread_init(resource, &connection->ack_receiver, drbd_ack_receiver, "ack_recv"); + connection->ack_receiver.connection = connection; kref_init(&connection->kref); @@ -2702,8 +2694,8 @@ static int init_submitter(struct drbd_device *device) { /* opencoded create_singlethread_workqueue(), * to be able to say "drbd%d", ..., minor */ - device->submit.wq = alloc_workqueue("drbd%u_submit", - WQ_UNBOUND | WQ_MEM_RECLAIM, 1, device->minor); + device->submit.wq = + alloc_ordered_workqueue("drbd%u_submit", WQ_MEM_RECLAIM, device->minor); if (!device->submit.wq) return -ENOMEM; @@ -2820,6 +2812,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig goto out_idr_remove_from_resource; } kref_get(&connection->kref); + INIT_WORK(&peer_device->send_acks_work, drbd_send_acks_wf); } if (init_submitter(device)) { @@ -2923,7 +2916,7 @@ static int __init drbd_init(void) drbd_proc = NULL; /* play safe for drbd_cleanup */ idr_init(&drbd_devices); - rwlock_init(&global_state_lock); + mutex_init(&resources_mutex); INIT_LIST_HEAD(&drbd_resources); err = drbd_genl_register(); @@ -2971,18 +2964,6 @@ fail: return err; } -void drbd_free_ldev(struct drbd_backing_dev *ldev) -{ - if (ldev == NULL) - return; - - blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); - blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); - - kfree(ldev->disk_conf); - kfree(ldev); -} - static void drbd_free_one_sock(struct drbd_socket *ds) { struct socket *s; @@ -3277,6 +3258,10 @@ int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev) * and read it. */ bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx; bdev->md.md_offset = drbd_md_ss(bdev); + /* Even for (flexible or indexed) external meta data, + * initially restrict us to the 4k superblock for now. + * Affects the paranoia out-of-range access check in drbd_md_sync_page_io(). */ + bdev->md.md_size_sect = 8; if (drbd_md_sync_page_io(device, bdev, bdev->md.md_offset, READ)) { /* NOTE: can't do normal error processing here as this is @@ -3578,7 +3563,9 @@ void drbd_queue_bitmap_io(struct drbd_device *device, spin_lock_irq(&device->resource->req_lock); set_bit(BITMAP_IO, &device->flags); - if (atomic_read(&device->ap_bio_cnt) == 0) { + /* don't wait for pending application IO if the caller indicates that + * application IO does not conflict anyways. */ + if (flags == BM_LOCKED_CHANGE_ALLOWED || atomic_read(&device->ap_bio_cnt) == 0) { if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags)) drbd_queue_work(&first_peer_device(device)->connection->sender_work, &device->bm_io_work.w); @@ -3746,6 +3733,27 @@ int drbd_wait_misc(struct drbd_device *device, struct drbd_interval *i) return 0; } +void lock_all_resources(void) +{ + struct drbd_resource *resource; + int __maybe_unused i = 0; + + mutex_lock(&resources_mutex); + local_irq_disable(); + for_each_resource(resource, &drbd_resources) + spin_lock_nested(&resource->req_lock, i++); +} + +void unlock_all_resources(void) +{ + struct drbd_resource *resource; + + for_each_resource(resource, &drbd_resources) + spin_unlock(&resource->req_lock); + local_irq_enable(); + mutex_unlock(&resources_mutex); +} + #ifdef CONFIG_DRBD_FAULT_INJECTION /* Fault insertion support including random number generator shamelessly * stolen from kernel/rcutorture.c */ diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index e80cbefbc2b5..c055c5e12f24 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -36,6 +36,7 @@ #include "drbd_int.h" #include "drbd_protocol.h" #include "drbd_req.h" +#include "drbd_state_change.h" #include <asm/unaligned.h> #include <linux/drbd_limits.h> #include <linux/kthread.h> @@ -75,11 +76,24 @@ int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info); int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info); /* .dumpit */ int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb); +int drbd_adm_dump_resources(struct sk_buff *skb, struct netlink_callback *cb); +int drbd_adm_dump_devices(struct sk_buff *skb, struct netlink_callback *cb); +int drbd_adm_dump_devices_done(struct netlink_callback *cb); +int drbd_adm_dump_connections(struct sk_buff *skb, struct netlink_callback *cb); +int drbd_adm_dump_connections_done(struct netlink_callback *cb); +int drbd_adm_dump_peer_devices(struct sk_buff *skb, struct netlink_callback *cb); +int drbd_adm_dump_peer_devices_done(struct netlink_callback *cb); +int drbd_adm_get_initial_state(struct sk_buff *skb, struct netlink_callback *cb); #include <linux/drbd_genl_api.h> #include "drbd_nla.h" #include <linux/genl_magic_func.h> +static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */ +static atomic_t notify_genl_seq = ATOMIC_INIT(2); /* two. */ + +DEFINE_MUTEX(notification_mutex); + /* used blkdev_get_by_path, to claim our meta data device(s) */ static char *drbd_m_holder = "Hands off! this is DRBD's meta data device."; @@ -349,6 +363,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd) sib.sib_reason = SIB_HELPER_PRE; sib.helper_name = cmd; drbd_bcast_event(device, &sib); + notify_helper(NOTIFY_CALL, device, connection, cmd, 0); ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); if (ret) drbd_warn(device, "helper command: %s %s %s exit code %u (0x%x)\n", @@ -361,6 +376,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd) sib.sib_reason = SIB_HELPER_POST; sib.helper_exit_code = ret; drbd_bcast_event(device, &sib); + notify_helper(NOTIFY_RESPONSE, device, connection, cmd, ret); if (current == connection->worker.task) clear_bit(CALLBACK_PENDING, &connection->flags); @@ -388,6 +404,7 @@ static int conn_khelper(struct drbd_connection *connection, char *cmd) drbd_info(connection, "helper command: %s %s %s\n", usermode_helper, cmd, resource_name); /* TODO: conn_bcast_event() ?? */ + notify_helper(NOTIFY_CALL, NULL, connection, cmd, 0); ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); if (ret) @@ -399,6 +416,7 @@ static int conn_khelper(struct drbd_connection *connection, char *cmd) usermode_helper, cmd, resource_name, (ret >> 8) & 0xff, ret); /* TODO: conn_bcast_event() ?? */ + notify_helper(NOTIFY_RESPONSE, NULL, connection, cmd, ret); if (ret < 0) /* Ignore any ERRNOs we got. */ ret = 0; @@ -847,9 +865,11 @@ char *ppsize(char *buf, unsigned long long size) * and can be long lived. * This changes an device->flag, is triggered by drbd internals, * and should be short-lived. */ +/* It needs to be a counter, since multiple threads might + independently suspend and resume IO. */ void drbd_suspend_io(struct drbd_device *device) { - set_bit(SUSPEND_IO, &device->flags); + atomic_inc(&device->suspend_cnt); if (drbd_suspended(device)) return; wait_event(device->misc_wait, !atomic_read(&device->ap_bio_cnt)); @@ -857,8 +877,8 @@ void drbd_suspend_io(struct drbd_device *device) void drbd_resume_io(struct drbd_device *device) { - clear_bit(SUSPEND_IO, &device->flags); - wake_up(&device->misc_wait); + if (atomic_dec_and_test(&device->suspend_cnt)) + wake_up(&device->misc_wait); } /** @@ -871,27 +891,32 @@ void drbd_resume_io(struct drbd_device *device) enum determine_dev_size drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct resize_parms *rs) __must_hold(local) { - sector_t prev_first_sect, prev_size; /* previous meta location */ - sector_t la_size_sect, u_size; + struct md_offsets_and_sizes { + u64 last_agreed_sect; + u64 md_offset; + s32 al_offset; + s32 bm_offset; + u32 md_size_sect; + + u32 al_stripes; + u32 al_stripe_size_4k; + } prev; + sector_t u_size, size; struct drbd_md *md = &device->ldev->md; - u32 prev_al_stripe_size_4k; - u32 prev_al_stripes; - sector_t size; char ppb[10]; void *buffer; int md_moved, la_size_changed; enum determine_dev_size rv = DS_UNCHANGED; - /* race: - * application request passes inc_ap_bio, - * but then cannot get an AL-reference. - * this function later may wait on ap_bio_cnt == 0. -> deadlock. + /* We may change the on-disk offsets of our meta data below. Lock out + * anything that may cause meta data IO, to avoid acting on incomplete + * layout changes or scribbling over meta data that is in the process + * of being moved. * - * to avoid that: - * Suspend IO right here. - * still lock the act_log to not trigger ASSERTs there. - */ + * Move is not exactly correct, btw, currently we have all our meta + * data in core memory, to "move" it we just write it all out, there + * are no reads. */ drbd_suspend_io(device); buffer = drbd_md_get_buffer(device, __func__); /* Lock meta-data IO */ if (!buffer) { @@ -899,19 +924,17 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct return DS_ERROR; } - /* no wait necessary anymore, actually we could assert that */ - wait_event(device->al_wait, lc_try_lock(device->act_log)); - - prev_first_sect = drbd_md_first_sector(device->ldev); - prev_size = device->ldev->md.md_size_sect; - la_size_sect = device->ldev->md.la_size_sect; + /* remember current offset and sizes */ + prev.last_agreed_sect = md->la_size_sect; + prev.md_offset = md->md_offset; + prev.al_offset = md->al_offset; + prev.bm_offset = md->bm_offset; + prev.md_size_sect = md->md_size_sect; + prev.al_stripes = md->al_stripes; + prev.al_stripe_size_4k = md->al_stripe_size_4k; if (rs) { /* rs is non NULL if we should change the AL layout only */ - - prev_al_stripes = md->al_stripes; - prev_al_stripe_size_4k = md->al_stripe_size_4k; - md->al_stripes = rs->al_stripes; md->al_stripe_size_4k = rs->al_stripe_size / 4; md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4; @@ -924,7 +947,7 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct rcu_read_unlock(); size = drbd_new_dev_size(device, device->ldev, u_size, flags & DDSF_FORCED); - if (size < la_size_sect) { + if (size < prev.last_agreed_sect) { if (rs && u_size == 0) { /* Remove "rs &&" later. This check should always be active, but right now the receiver expects the permissive behavior */ @@ -945,30 +968,29 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct err = drbd_bm_resize(device, size, !(flags & DDSF_NO_RESYNC)); if (unlikely(err)) { /* currently there is only one error: ENOMEM! */ - size = drbd_bm_capacity(device)>>1; + size = drbd_bm_capacity(device); if (size == 0) { drbd_err(device, "OUT OF MEMORY! " "Could not allocate bitmap!\n"); } else { drbd_err(device, "BM resizing failed. " - "Leaving size unchanged at size = %lu KB\n", - (unsigned long)size); + "Leaving size unchanged\n"); } rv = DS_ERROR; } /* racy, see comments above. */ drbd_set_my_capacity(device, size); - device->ldev->md.la_size_sect = size; + md->la_size_sect = size; drbd_info(device, "size = %s (%llu KB)\n", ppsize(ppb, size>>1), (unsigned long long)size>>1); } if (rv <= DS_ERROR) goto err_out; - la_size_changed = (la_size_sect != device->ldev->md.la_size_sect); + la_size_changed = (prev.last_agreed_sect != md->la_size_sect); - md_moved = prev_first_sect != drbd_md_first_sector(device->ldev) - || prev_size != device->ldev->md.md_size_sect; + md_moved = prev.md_offset != md->md_offset + || prev.md_size_sect != md->md_size_sect; if (la_size_changed || md_moved || rs) { u32 prev_flags; @@ -977,20 +999,29 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct * Clear the timer, to avoid scary "timer expired!" messages, * "Superblock" is written out at least twice below, anyways. */ del_timer(&device->md_sync_timer); - drbd_al_shrink(device); /* All extents inactive. */ + /* We won't change the "al-extents" setting, we just may need + * to move the on-disk location of the activity log ringbuffer. + * Lock for transaction is good enough, it may well be "dirty" + * or even "starving". */ + wait_event(device->al_wait, lc_try_lock_for_transaction(device->act_log)); + + /* mark current on-disk bitmap and activity log as unreliable */ prev_flags = md->flags; - md->flags &= ~MDF_PRIMARY_IND; + md->flags |= MDF_FULL_SYNC | MDF_AL_DISABLED; drbd_md_write(device, buffer); + drbd_al_initialize(device, buffer); + drbd_info(device, "Writing the whole bitmap, %s\n", la_size_changed && md_moved ? "size changed and md moved" : la_size_changed ? "size changed" : "md moved"); /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */ drbd_bitmap_io(device, md_moved ? &drbd_bm_write_all : &drbd_bm_write, "size changed", BM_LOCKED_MASK); - drbd_initialize_al(device, buffer); + /* on-disk bitmap and activity log is authoritative again + * (unless there was an IO error meanwhile...) */ md->flags = prev_flags; drbd_md_write(device, buffer); @@ -999,20 +1030,22 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct md->al_stripes, md->al_stripe_size_4k * 4); } - if (size > la_size_sect) - rv = la_size_sect ? DS_GREW : DS_GREW_FROM_ZERO; - if (size < la_size_sect) + if (size > prev.last_agreed_sect) + rv = prev.last_agreed_sect ? DS_GREW : DS_GREW_FROM_ZERO; + if (size < prev.last_agreed_sect) rv = DS_SHRUNK; if (0) { err_out: - if (rs) { - md->al_stripes = prev_al_stripes; - md->al_stripe_size_4k = prev_al_stripe_size_4k; - md->al_size_4k = (u64)prev_al_stripes * prev_al_stripe_size_4k; - - drbd_md_set_sector_offsets(device, device->ldev); - } + /* restore previous offset and sizes */ + md->la_size_sect = prev.last_agreed_sect; + md->md_offset = prev.md_offset; + md->al_offset = prev.al_offset; + md->bm_offset = prev.bm_offset; + md->md_size_sect = prev.md_size_sect; + md->al_stripes = prev.al_stripes; + md->al_stripe_size_4k = prev.al_stripe_size_4k; + md->al_size_4k = (u64)prev.al_stripes * prev.al_stripe_size_4k; } lc_unlock(device->act_log); wake_up(&device->al_wait); @@ -1115,8 +1148,7 @@ static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc) lc_destroy(n); return -EBUSY; } else { - if (t) - lc_destroy(t); + lc_destroy(t); } drbd_md_mark_dirty(device); /* we changed device->act_log->nr_elemens */ return 0; @@ -1151,21 +1183,20 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi if (b) { struct drbd_connection *connection = first_peer_device(device)->connection; + blk_queue_max_discard_sectors(q, DRBD_MAX_DISCARD_SECTORS); + if (blk_queue_discard(b) && (connection->cstate < C_CONNECTED || connection->agreed_features & FF_TRIM)) { - /* For now, don't allow more than one activity log extent worth of data - * to be discarded in one go. We may need to rework drbd_al_begin_io() - * to allow for even larger discard ranges */ - blk_queue_max_discard_sectors(q, DRBD_MAX_DISCARD_SECTORS); - + /* We don't care, stacking below should fix it for the local device. + * Whether or not it is a suitable granularity on the remote device + * is not our problem, really. If you care, you need to + * use devices with similar topology on all peers. */ + q->limits.discard_granularity = 512; queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); - /* REALLY? Is stacking secdiscard "legal"? */ - if (blk_queue_secdiscard(b)) - queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, q); } else { blk_queue_max_discard_sectors(q, 0); queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); - queue_flag_clear_unlocked(QUEUE_FLAG_SECDISCARD, q); + q->limits.discard_granularity = 0; } blk_queue_stack_limits(q, b); @@ -1177,6 +1208,12 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; } } + /* To avoid confusion, if this queue does not support discard, clear + * max_discard_sectors, which is what lsblk -D reports to the user. */ + if (!blk_queue_discard(q)) { + blk_queue_max_discard_sectors(q, 0); + q->limits.discard_granularity = 0; + } } void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev) @@ -1241,8 +1278,8 @@ static void conn_reconfig_done(struct drbd_connection *connection) connection->cstate == C_STANDALONE; spin_unlock_irq(&connection->resource->req_lock); if (stop_threads) { - /* asender is implicitly stopped by receiver - * in conn_disconnect() */ + /* ack_receiver thread and ack_sender workqueue are implicitly + * stopped by receiver in conn_disconnect() */ drbd_thread_stop(&connection->receiver); drbd_thread_stop(&connection->worker); } @@ -1389,13 +1426,13 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) goto fail_unlock; } - write_lock_irq(&global_state_lock); + lock_all_resources(); retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after); if (retcode == NO_ERROR) { rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); drbd_resync_after_changed(device); } - write_unlock_irq(&global_state_lock); + unlock_all_resources(); if (retcode != NO_ERROR) goto fail_unlock; @@ -1418,7 +1455,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) set_bit(MD_NO_FUA, &device->flags); if (write_ordering_changed(old_disk_conf, new_disk_conf)) - drbd_bump_write_ordering(device->resource, NULL, WO_bdev_flush); + drbd_bump_write_ordering(device->resource, NULL, WO_BDEV_FLUSH); drbd_md_sync(device); @@ -1449,6 +1486,88 @@ success: return 0; } +static struct block_device *open_backing_dev(struct drbd_device *device, + const char *bdev_path, void *claim_ptr, bool do_bd_link) +{ + struct block_device *bdev; + int err = 0; + + bdev = blkdev_get_by_path(bdev_path, + FMODE_READ | FMODE_WRITE | FMODE_EXCL, claim_ptr); + if (IS_ERR(bdev)) { + drbd_err(device, "open(\"%s\") failed with %ld\n", + bdev_path, PTR_ERR(bdev)); + return bdev; + } + + if (!do_bd_link) + return bdev; + + err = bd_link_disk_holder(bdev, device->vdisk); + if (err) { + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + drbd_err(device, "bd_link_disk_holder(\"%s\", ...) failed with %d\n", + bdev_path, err); + bdev = ERR_PTR(err); + } + return bdev; +} + +static int open_backing_devices(struct drbd_device *device, + struct disk_conf *new_disk_conf, + struct drbd_backing_dev *nbc) +{ + struct block_device *bdev; + + bdev = open_backing_dev(device, new_disk_conf->backing_dev, device, true); + if (IS_ERR(bdev)) + return ERR_OPEN_DISK; + nbc->backing_bdev = bdev; + + /* + * meta_dev_idx >= 0: external fixed size, possibly multiple + * drbd sharing one meta device. TODO in that case, paranoia + * check that [md_bdev, meta_dev_idx] is not yet used by some + * other drbd minor! (if you use drbd.conf + drbdadm, that + * should check it for you already; but if you don't, or + * someone fooled it, we need to double check here) + */ + bdev = open_backing_dev(device, new_disk_conf->meta_dev, + /* claim ptr: device, if claimed exclusively; shared drbd_m_holder, + * if potentially shared with other drbd minors */ + (new_disk_conf->meta_dev_idx < 0) ? (void*)device : (void*)drbd_m_holder, + /* avoid double bd_claim_by_disk() for the same (source,target) tuple, + * as would happen with internal metadata. */ + (new_disk_conf->meta_dev_idx != DRBD_MD_INDEX_FLEX_INT && + new_disk_conf->meta_dev_idx != DRBD_MD_INDEX_INTERNAL)); + if (IS_ERR(bdev)) + return ERR_OPEN_MD_DISK; + nbc->md_bdev = bdev; + return NO_ERROR; +} + +static void close_backing_dev(struct drbd_device *device, struct block_device *bdev, + bool do_bd_unlink) +{ + if (!bdev) + return; + if (do_bd_unlink) + bd_unlink_disk_holder(bdev, device->vdisk); + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); +} + +void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev) +{ + if (ldev == NULL) + return; + + close_backing_dev(device, ldev->md_bdev, ldev->md_bdev != ldev->backing_bdev); + close_backing_dev(device, ldev->backing_bdev, true); + + kfree(ldev->disk_conf); + kfree(ldev); +} + int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) { struct drbd_config_context adm_ctx; @@ -1462,7 +1581,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) sector_t min_md_device_sectors; struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */ struct disk_conf *new_disk_conf = NULL; - struct block_device *bdev; struct lru_cache *resync_lru = NULL; struct fifo_buffer *new_plan = NULL; union drbd_state ns, os; @@ -1478,7 +1596,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) device = adm_ctx.device; mutex_lock(&adm_ctx.resource->adm_mutex); peer_device = first_peer_device(device); - connection = peer_device ? peer_device->connection : NULL; + connection = peer_device->connection; conn_reconfig_start(connection); /* if you want to reconfigure, please tear down first */ @@ -1539,12 +1657,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) goto fail; } - write_lock_irq(&global_state_lock); - retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after); - write_unlock_irq(&global_state_lock); - if (retcode != NO_ERROR) - goto fail; - rcu_read_lock(); nc = rcu_dereference(connection->net_conf); if (nc) { @@ -1556,35 +1668,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) } rcu_read_unlock(); - bdev = blkdev_get_by_path(new_disk_conf->backing_dev, - FMODE_READ | FMODE_WRITE | FMODE_EXCL, device); - if (IS_ERR(bdev)) { - drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev, - PTR_ERR(bdev)); - retcode = ERR_OPEN_DISK; - goto fail; - } - nbc->backing_bdev = bdev; - - /* - * meta_dev_idx >= 0: external fixed size, possibly multiple - * drbd sharing one meta device. TODO in that case, paranoia - * check that [md_bdev, meta_dev_idx] is not yet used by some - * other drbd minor! (if you use drbd.conf + drbdadm, that - * should check it for you already; but if you don't, or - * someone fooled it, we need to double check here) - */ - bdev = blkdev_get_by_path(new_disk_conf->meta_dev, - FMODE_READ | FMODE_WRITE | FMODE_EXCL, - (new_disk_conf->meta_dev_idx < 0) ? - (void *)device : (void *)drbd_m_holder); - if (IS_ERR(bdev)) { - drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev, - PTR_ERR(bdev)); - retcode = ERR_OPEN_MD_DISK; + retcode = open_backing_devices(device, new_disk_conf, nbc); + if (retcode != NO_ERROR) goto fail; - } - nbc->md_bdev = bdev; if ((nbc->backing_bdev == nbc->md_bdev) != (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL || @@ -1707,6 +1793,13 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) goto force_diskless_dec; } + lock_all_resources(); + retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after); + if (retcode != NO_ERROR) { + unlock_all_resources(); + goto force_diskless_dec; + } + /* Reset the "barriers don't work" bits here, then force meta data to * be written, to ensure we determine if barriers are supported. */ if (new_disk_conf->md_flushes) @@ -1727,7 +1820,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) new_disk_conf = NULL; new_plan = NULL; - drbd_bump_write_ordering(device->resource, device->ldev, WO_bdev_flush); + drbd_resync_after_changed(device); + drbd_bump_write_ordering(device->resource, device->ldev, WO_BDEV_FLUSH); + unlock_all_resources(); if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY)) set_bit(CRASHED_PRIMARY, &device->flags); @@ -1875,12 +1970,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) fail: conn_reconfig_done(connection); if (nbc) { - if (nbc->backing_bdev) - blkdev_put(nbc->backing_bdev, - FMODE_READ | FMODE_WRITE | FMODE_EXCL); - if (nbc->md_bdev) - blkdev_put(nbc->md_bdev, - FMODE_READ | FMODE_WRITE | FMODE_EXCL); + close_backing_dev(device, nbc->md_bdev, nbc->md_bdev != nbc->backing_bdev); + close_backing_dev(device, nbc->backing_bdev, true); kfree(nbc); } kfree(new_disk_conf); @@ -1895,6 +1986,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) static int adm_detach(struct drbd_device *device, int force) { enum drbd_state_rv retcode; + void *buffer; int ret; if (force) { @@ -1905,13 +1997,16 @@ static int adm_detach(struct drbd_device *device, int force) } drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */ - drbd_md_get_buffer(device, __func__); /* make sure there is no in-flight meta-data IO */ - retcode = drbd_request_state(device, NS(disk, D_FAILED)); - drbd_md_put_buffer(device); + buffer = drbd_md_get_buffer(device, __func__); /* make sure there is no in-flight meta-data IO */ + if (buffer) { + retcode = drbd_request_state(device, NS(disk, D_FAILED)); + drbd_md_put_buffer(device); + } else /* already <= D_FAILED */ + retcode = SS_NOTHING_TO_DO; /* D_FAILED will transition to DISKLESS. */ + drbd_resume_io(device); ret = wait_event_interruptible(device->misc_wait, device->state.disk != D_FAILED); - drbd_resume_io(device); if ((int)retcode == (int)SS_IS_DISKLESS) retcode = SS_NOTHING_TO_DO; if (ret) @@ -2245,8 +2340,31 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) return 0; } +static void connection_to_info(struct connection_info *info, + struct drbd_connection *connection) +{ + info->conn_connection_state = connection->cstate; + info->conn_role = conn_highest_peer(connection); +} + +static void peer_device_to_info(struct peer_device_info *info, + struct drbd_peer_device *peer_device) +{ + struct drbd_device *device = peer_device->device; + + info->peer_repl_state = + max_t(enum drbd_conns, C_WF_REPORT_PARAMS, device->state.conn); + info->peer_disk_state = device->state.pdsk; + info->peer_resync_susp_user = device->state.user_isp; + info->peer_resync_susp_peer = device->state.peer_isp; + info->peer_resync_susp_dependency = device->state.aftr_isp; +} + int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) { + struct connection_info connection_info; + enum drbd_notification_type flags; + unsigned int peer_devices = 0; struct drbd_config_context adm_ctx; struct drbd_peer_device *peer_device; struct net_conf *old_net_conf, *new_net_conf = NULL; @@ -2347,6 +2465,22 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) connection->peer_addr_len = nla_len(adm_ctx.peer_addr); memcpy(&connection->peer_addr, nla_data(adm_ctx.peer_addr), connection->peer_addr_len); + idr_for_each_entry(&connection->peer_devices, peer_device, i) { + peer_devices++; + } + + connection_to_info(&connection_info, connection); + flags = (peer_devices--) ? NOTIFY_CONTINUES : 0; + mutex_lock(¬ification_mutex); + notify_connection_state(NULL, 0, connection, &connection_info, NOTIFY_CREATE | flags); + idr_for_each_entry(&connection->peer_devices, peer_device, i) { + struct peer_device_info peer_device_info; + + peer_device_to_info(&peer_device_info, peer_device); + flags = (peer_devices--) ? NOTIFY_CONTINUES : 0; + notify_peer_device_state(NULL, 0, peer_device, &peer_device_info, NOTIFY_CREATE | flags); + } + mutex_unlock(¬ification_mutex); mutex_unlock(&adm_ctx.resource->conf_update); rcu_read_lock(); @@ -2428,6 +2562,8 @@ static enum drbd_state_rv conn_try_disconnect(struct drbd_connection *connection drbd_err(connection, "unexpected rv2=%d in conn_try_disconnect()\n", rv2); + /* Unlike in DRBD 9, the state engine has generated + * NOTIFY_DESTROY events before clearing connection->net_conf. */ } return rv; } @@ -2585,6 +2721,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) mutex_unlock(&device->resource->conf_update); synchronize_rcu(); kfree(old_disk_conf); + new_disk_conf = NULL; } ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0); @@ -2618,6 +2755,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) fail_ldev: put_ldev(device); + kfree(new_disk_conf); goto fail; } @@ -2855,7 +2993,30 @@ int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info) mutex_lock(&adm_ctx.resource->adm_mutex); device = adm_ctx.device; if (test_bit(NEW_CUR_UUID, &device->flags)) { - drbd_uuid_new_current(device); + if (get_ldev_if_state(device, D_ATTACHING)) { + drbd_uuid_new_current(device); + put_ldev(device); + } else { + /* This is effectively a multi-stage "forced down". + * The NEW_CUR_UUID bit is supposedly only set, if we + * lost the replication connection, and are configured + * to freeze IO and wait for some fence-peer handler. + * So we still don't have a replication connection. + * And now we don't have a local disk either. After + * resume, we will fail all pending and new IO, because + * we don't have any data anymore. Which means we will + * eventually be able to terminate all users of this + * device, and then take it down. By bumping the + * "effective" data uuid, we make sure that you really + * need to tear down before you reconfigure, we will + * the refuse to re-connect or re-attach (because no + * matching real data uuid exists). + */ + u64 val; + get_random_bytes(&val, sizeof(u64)); + drbd_set_ed_uuid(device, val); + drbd_warn(device, "Resumed without access to data; please tear down before attempting to re-configure.\n"); + } clear_bit(NEW_CUR_UUID, &device->flags); } drbd_suspend_io(device); @@ -2910,6 +3071,486 @@ nla_put_failure: } /* + * The generic netlink dump callbacks are called outside the genl_lock(), so + * they cannot use the simple attribute parsing code which uses global + * attribute tables. + */ +static struct nlattr *find_cfg_context_attr(const struct nlmsghdr *nlh, int attr) +{ + const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ; + const int maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1; + struct nlattr *nla; + + nla = nla_find(nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen), + DRBD_NLA_CFG_CONTEXT); + if (!nla) + return NULL; + return drbd_nla_find_nested(maxtype, nla, __nla_type(attr)); +} + +static void resource_to_info(struct resource_info *, struct drbd_resource *); + +int drbd_adm_dump_resources(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct drbd_genlmsghdr *dh; + struct drbd_resource *resource; + struct resource_info resource_info; + struct resource_statistics resource_statistics; + int err; + + rcu_read_lock(); + if (cb->args[0]) { + for_each_resource_rcu(resource, &drbd_resources) + if (resource == (struct drbd_resource *)cb->args[0]) + goto found_resource; + err = 0; /* resource was probably deleted */ + goto out; + } + resource = list_entry(&drbd_resources, + struct drbd_resource, resources); + +found_resource: + list_for_each_entry_continue_rcu(resource, &drbd_resources, resources) { + goto put_result; + } + err = 0; + goto out; + +put_result: + dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &drbd_genl_family, + NLM_F_MULTI, DRBD_ADM_GET_RESOURCES); + err = -ENOMEM; + if (!dh) + goto out; + dh->minor = -1U; + dh->ret_code = NO_ERROR; + err = nla_put_drbd_cfg_context(skb, resource, NULL, NULL); + if (err) + goto out; + err = res_opts_to_skb(skb, &resource->res_opts, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + resource_to_info(&resource_info, resource); + err = resource_info_to_skb(skb, &resource_info, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + resource_statistics.res_stat_write_ordering = resource->write_ordering; + err = resource_statistics_to_skb(skb, &resource_statistics, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + cb->args[0] = (long)resource; + genlmsg_end(skb, dh); + err = 0; + +out: + rcu_read_unlock(); + if (err) + return err; + return skb->len; +} + +static void device_to_statistics(struct device_statistics *s, + struct drbd_device *device) +{ + memset(s, 0, sizeof(*s)); + s->dev_upper_blocked = !may_inc_ap_bio(device); + if (get_ldev(device)) { + struct drbd_md *md = &device->ldev->md; + u64 *history_uuids = (u64 *)s->history_uuids; + struct request_queue *q; + int n; + + spin_lock_irq(&md->uuid_lock); + s->dev_current_uuid = md->uuid[UI_CURRENT]; + BUILD_BUG_ON(sizeof(s->history_uuids) < UI_HISTORY_END - UI_HISTORY_START + 1); + for (n = 0; n < UI_HISTORY_END - UI_HISTORY_START + 1; n++) + history_uuids[n] = md->uuid[UI_HISTORY_START + n]; + for (; n < HISTORY_UUIDS; n++) + history_uuids[n] = 0; + s->history_uuids_len = HISTORY_UUIDS; + spin_unlock_irq(&md->uuid_lock); + + s->dev_disk_flags = md->flags; + q = bdev_get_queue(device->ldev->backing_bdev); + s->dev_lower_blocked = + bdi_congested(&q->backing_dev_info, + (1 << WB_async_congested) | + (1 << WB_sync_congested)); + put_ldev(device); + } + s->dev_size = drbd_get_capacity(device->this_bdev); + s->dev_read = device->read_cnt; + s->dev_write = device->writ_cnt; + s->dev_al_writes = device->al_writ_cnt; + s->dev_bm_writes = device->bm_writ_cnt; + s->dev_upper_pending = atomic_read(&device->ap_bio_cnt); + s->dev_lower_pending = atomic_read(&device->local_cnt); + s->dev_al_suspended = test_bit(AL_SUSPENDED, &device->flags); + s->dev_exposed_data_uuid = device->ed_uuid; +} + +static int put_resource_in_arg0(struct netlink_callback *cb, int holder_nr) +{ + if (cb->args[0]) { + struct drbd_resource *resource = + (struct drbd_resource *)cb->args[0]; + kref_put(&resource->kref, drbd_destroy_resource); + } + + return 0; +} + +int drbd_adm_dump_devices_done(struct netlink_callback *cb) { + return put_resource_in_arg0(cb, 7); +} + +static void device_to_info(struct device_info *, struct drbd_device *); + +int drbd_adm_dump_devices(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr *resource_filter; + struct drbd_resource *resource; + struct drbd_device *uninitialized_var(device); + int minor, err, retcode; + struct drbd_genlmsghdr *dh; + struct device_info device_info; + struct device_statistics device_statistics; + struct idr *idr_to_search; + + resource = (struct drbd_resource *)cb->args[0]; + if (!cb->args[0] && !cb->args[1]) { + resource_filter = find_cfg_context_attr(cb->nlh, T_ctx_resource_name); + if (resource_filter) { + retcode = ERR_RES_NOT_KNOWN; + resource = drbd_find_resource(nla_data(resource_filter)); + if (!resource) + goto put_result; + cb->args[0] = (long)resource; + } + } + + rcu_read_lock(); + minor = cb->args[1]; + idr_to_search = resource ? &resource->devices : &drbd_devices; + device = idr_get_next(idr_to_search, &minor); + if (!device) { + err = 0; + goto out; + } + idr_for_each_entry_continue(idr_to_search, device, minor) { + retcode = NO_ERROR; + goto put_result; /* only one iteration */ + } + err = 0; + goto out; /* no more devices */ + +put_result: + dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &drbd_genl_family, + NLM_F_MULTI, DRBD_ADM_GET_DEVICES); + err = -ENOMEM; + if (!dh) + goto out; + dh->ret_code = retcode; + dh->minor = -1U; + if (retcode == NO_ERROR) { + dh->minor = device->minor; + err = nla_put_drbd_cfg_context(skb, device->resource, NULL, device); + if (err) + goto out; + if (get_ldev(device)) { + struct disk_conf *disk_conf = + rcu_dereference(device->ldev->disk_conf); + + err = disk_conf_to_skb(skb, disk_conf, !capable(CAP_SYS_ADMIN)); + put_ldev(device); + if (err) + goto out; + } + device_to_info(&device_info, device); + err = device_info_to_skb(skb, &device_info, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + + device_to_statistics(&device_statistics, device); + err = device_statistics_to_skb(skb, &device_statistics, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + cb->args[1] = minor + 1; + } + genlmsg_end(skb, dh); + err = 0; + +out: + rcu_read_unlock(); + if (err) + return err; + return skb->len; +} + +int drbd_adm_dump_connections_done(struct netlink_callback *cb) +{ + return put_resource_in_arg0(cb, 6); +} + +enum { SINGLE_RESOURCE, ITERATE_RESOURCES }; + +int drbd_adm_dump_connections(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr *resource_filter; + struct drbd_resource *resource = NULL, *next_resource; + struct drbd_connection *uninitialized_var(connection); + int err = 0, retcode; + struct drbd_genlmsghdr *dh; + struct connection_info connection_info; + struct connection_statistics connection_statistics; + + rcu_read_lock(); + resource = (struct drbd_resource *)cb->args[0]; + if (!cb->args[0]) { + resource_filter = find_cfg_context_attr(cb->nlh, T_ctx_resource_name); + if (resource_filter) { + retcode = ERR_RES_NOT_KNOWN; + resource = drbd_find_resource(nla_data(resource_filter)); + if (!resource) + goto put_result; + cb->args[0] = (long)resource; + cb->args[1] = SINGLE_RESOURCE; + } + } + if (!resource) { + if (list_empty(&drbd_resources)) + goto out; + resource = list_first_entry(&drbd_resources, struct drbd_resource, resources); + kref_get(&resource->kref); + cb->args[0] = (long)resource; + cb->args[1] = ITERATE_RESOURCES; + } + + next_resource: + rcu_read_unlock(); + mutex_lock(&resource->conf_update); + rcu_read_lock(); + if (cb->args[2]) { + for_each_connection_rcu(connection, resource) + if (connection == (struct drbd_connection *)cb->args[2]) + goto found_connection; + /* connection was probably deleted */ + goto no_more_connections; + } + connection = list_entry(&resource->connections, struct drbd_connection, connections); + +found_connection: + list_for_each_entry_continue_rcu(connection, &resource->connections, connections) { + if (!has_net_conf(connection)) + continue; + retcode = NO_ERROR; + goto put_result; /* only one iteration */ + } + +no_more_connections: + if (cb->args[1] == ITERATE_RESOURCES) { + for_each_resource_rcu(next_resource, &drbd_resources) { + if (next_resource == resource) + goto found_resource; + } + /* resource was probably deleted */ + } + goto out; + +found_resource: + list_for_each_entry_continue_rcu(next_resource, &drbd_resources, resources) { + mutex_unlock(&resource->conf_update); + kref_put(&resource->kref, drbd_destroy_resource); + resource = next_resource; + kref_get(&resource->kref); + cb->args[0] = (long)resource; + cb->args[2] = 0; + goto next_resource; + } + goto out; /* no more resources */ + +put_result: + dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &drbd_genl_family, + NLM_F_MULTI, DRBD_ADM_GET_CONNECTIONS); + err = -ENOMEM; + if (!dh) + goto out; + dh->ret_code = retcode; + dh->minor = -1U; + if (retcode == NO_ERROR) { + struct net_conf *net_conf; + + err = nla_put_drbd_cfg_context(skb, resource, connection, NULL); + if (err) + goto out; + net_conf = rcu_dereference(connection->net_conf); + if (net_conf) { + err = net_conf_to_skb(skb, net_conf, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + } + connection_to_info(&connection_info, connection); + err = connection_info_to_skb(skb, &connection_info, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + connection_statistics.conn_congested = test_bit(NET_CONGESTED, &connection->flags); + err = connection_statistics_to_skb(skb, &connection_statistics, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + cb->args[2] = (long)connection; + } + genlmsg_end(skb, dh); + err = 0; + +out: + rcu_read_unlock(); + if (resource) + mutex_unlock(&resource->conf_update); + if (err) + return err; + return skb->len; +} + +enum mdf_peer_flag { + MDF_PEER_CONNECTED = 1 << 0, + MDF_PEER_OUTDATED = 1 << 1, + MDF_PEER_FENCING = 1 << 2, + MDF_PEER_FULL_SYNC = 1 << 3, +}; + +static void peer_device_to_statistics(struct peer_device_statistics *s, + struct drbd_peer_device *peer_device) +{ + struct drbd_device *device = peer_device->device; + + memset(s, 0, sizeof(*s)); + s->peer_dev_received = device->recv_cnt; + s->peer_dev_sent = device->send_cnt; + s->peer_dev_pending = atomic_read(&device->ap_pending_cnt) + + atomic_read(&device->rs_pending_cnt); + s->peer_dev_unacked = atomic_read(&device->unacked_cnt); + s->peer_dev_out_of_sync = drbd_bm_total_weight(device) << (BM_BLOCK_SHIFT - 9); + s->peer_dev_resync_failed = device->rs_failed << (BM_BLOCK_SHIFT - 9); + if (get_ldev(device)) { + struct drbd_md *md = &device->ldev->md; + + spin_lock_irq(&md->uuid_lock); + s->peer_dev_bitmap_uuid = md->uuid[UI_BITMAP]; + spin_unlock_irq(&md->uuid_lock); + s->peer_dev_flags = + (drbd_md_test_flag(device->ldev, MDF_CONNECTED_IND) ? + MDF_PEER_CONNECTED : 0) + + (drbd_md_test_flag(device->ldev, MDF_CONSISTENT) && + !drbd_md_test_flag(device->ldev, MDF_WAS_UP_TO_DATE) ? + MDF_PEER_OUTDATED : 0) + + /* FIXME: MDF_PEER_FENCING? */ + (drbd_md_test_flag(device->ldev, MDF_FULL_SYNC) ? + MDF_PEER_FULL_SYNC : 0); + put_ldev(device); + } +} + +int drbd_adm_dump_peer_devices_done(struct netlink_callback *cb) +{ + return put_resource_in_arg0(cb, 9); +} + +int drbd_adm_dump_peer_devices(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr *resource_filter; + struct drbd_resource *resource; + struct drbd_device *uninitialized_var(device); + struct drbd_peer_device *peer_device = NULL; + int minor, err, retcode; + struct drbd_genlmsghdr *dh; + struct idr *idr_to_search; + + resource = (struct drbd_resource *)cb->args[0]; + if (!cb->args[0] && !cb->args[1]) { + resource_filter = find_cfg_context_attr(cb->nlh, T_ctx_resource_name); + if (resource_filter) { + retcode = ERR_RES_NOT_KNOWN; + resource = drbd_find_resource(nla_data(resource_filter)); + if (!resource) + goto put_result; + } + cb->args[0] = (long)resource; + } + + rcu_read_lock(); + minor = cb->args[1]; + idr_to_search = resource ? &resource->devices : &drbd_devices; + device = idr_find(idr_to_search, minor); + if (!device) { +next_device: + minor++; + cb->args[2] = 0; + device = idr_get_next(idr_to_search, &minor); + if (!device) { + err = 0; + goto out; + } + } + if (cb->args[2]) { + for_each_peer_device(peer_device, device) + if (peer_device == (struct drbd_peer_device *)cb->args[2]) + goto found_peer_device; + /* peer device was probably deleted */ + goto next_device; + } + /* Make peer_device point to the list head (not the first entry). */ + peer_device = list_entry(&device->peer_devices, struct drbd_peer_device, peer_devices); + +found_peer_device: + list_for_each_entry_continue_rcu(peer_device, &device->peer_devices, peer_devices) { + if (!has_net_conf(peer_device->connection)) + continue; + retcode = NO_ERROR; + goto put_result; /* only one iteration */ + } + goto next_device; + +put_result: + dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &drbd_genl_family, + NLM_F_MULTI, DRBD_ADM_GET_PEER_DEVICES); + err = -ENOMEM; + if (!dh) + goto out; + dh->ret_code = retcode; + dh->minor = -1U; + if (retcode == NO_ERROR) { + struct peer_device_info peer_device_info; + struct peer_device_statistics peer_device_statistics; + + dh->minor = minor; + err = nla_put_drbd_cfg_context(skb, device->resource, peer_device->connection, device); + if (err) + goto out; + peer_device_to_info(&peer_device_info, peer_device); + err = peer_device_info_to_skb(skb, &peer_device_info, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + peer_device_to_statistics(&peer_device_statistics, peer_device); + err = peer_device_statistics_to_skb(skb, &peer_device_statistics, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + cb->args[1] = minor; + cb->args[2] = (long)peer_device; + } + genlmsg_end(skb, dh); + err = 0; + +out: + rcu_read_unlock(); + if (err) + return err; + return skb->len; +} +/* * Return the connection of @resource if @resource has exactly one connection. */ static struct drbd_connection *the_only_connection(struct drbd_resource *resource) @@ -3414,8 +4055,18 @@ drbd_check_resource_name(struct drbd_config_context *adm_ctx) return NO_ERROR; } +static void resource_to_info(struct resource_info *info, + struct drbd_resource *resource) +{ + info->res_role = conn_highest_role(first_connection(resource)); + info->res_susp = resource->susp; + info->res_susp_nod = resource->susp_nod; + info->res_susp_fen = resource->susp_fen; +} + int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) { + struct drbd_connection *connection; struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; struct res_opts res_opts; @@ -3449,13 +4100,33 @@ int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) } /* not yet safe for genl_family.parallel_ops */ - if (!conn_create(adm_ctx.resource_name, &res_opts)) + mutex_lock(&resources_mutex); + connection = conn_create(adm_ctx.resource_name, &res_opts); + mutex_unlock(&resources_mutex); + + if (connection) { + struct resource_info resource_info; + + mutex_lock(¬ification_mutex); + resource_to_info(&resource_info, connection->resource); + notify_resource_state(NULL, 0, connection->resource, + &resource_info, NOTIFY_CREATE); + mutex_unlock(¬ification_mutex); + } else retcode = ERR_NOMEM; + out: drbd_adm_finish(&adm_ctx, info, retcode); return 0; } +static void device_to_info(struct device_info *info, + struct drbd_device *device) +{ + info->dev_disk_state = device->state.disk; +} + + int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) { struct drbd_config_context adm_ctx; @@ -3490,6 +4161,36 @@ int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) mutex_lock(&adm_ctx.resource->adm_mutex); retcode = drbd_create_device(&adm_ctx, dh->minor); + if (retcode == NO_ERROR) { + struct drbd_device *device; + struct drbd_peer_device *peer_device; + struct device_info info; + unsigned int peer_devices = 0; + enum drbd_notification_type flags; + + device = minor_to_device(dh->minor); + for_each_peer_device(peer_device, device) { + if (!has_net_conf(peer_device->connection)) + continue; + peer_devices++; + } + + device_to_info(&info, device); + mutex_lock(¬ification_mutex); + flags = (peer_devices--) ? NOTIFY_CONTINUES : 0; + notify_device_state(NULL, 0, device, &info, NOTIFY_CREATE | flags); + for_each_peer_device(peer_device, device) { + struct peer_device_info peer_device_info; + + if (!has_net_conf(peer_device->connection)) + continue; + peer_device_to_info(&peer_device_info, peer_device); + flags = (peer_devices--) ? NOTIFY_CONTINUES : 0; + notify_peer_device_state(NULL, 0, peer_device, &peer_device_info, + NOTIFY_CREATE | flags); + } + mutex_unlock(¬ification_mutex); + } mutex_unlock(&adm_ctx.resource->adm_mutex); out: drbd_adm_finish(&adm_ctx, info, retcode); @@ -3498,13 +4199,35 @@ out: static enum drbd_ret_code adm_del_minor(struct drbd_device *device) { + struct drbd_peer_device *peer_device; + if (device->state.disk == D_DISKLESS && /* no need to be device->state.conn == C_STANDALONE && * we may want to delete a minor from a live replication group. */ device->state.role == R_SECONDARY) { + struct drbd_connection *connection = + first_connection(device->resource); + _drbd_request_state(device, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE + CS_WAIT_COMPLETE); + + /* If the state engine hasn't stopped the sender thread yet, we + * need to flush the sender work queue before generating the + * DESTROY events here. */ + if (get_t_state(&connection->worker) == RUNNING) + drbd_flush_workqueue(&connection->sender_work); + + mutex_lock(¬ification_mutex); + for_each_peer_device(peer_device, device) { + if (!has_net_conf(peer_device->connection)) + continue; + notify_peer_device_state(NULL, 0, peer_device, NULL, + NOTIFY_DESTROY | NOTIFY_CONTINUES); + } + notify_device_state(NULL, 0, device, NULL, NOTIFY_DESTROY); + mutex_unlock(¬ification_mutex); + drbd_delete_device(device); return NO_ERROR; } else @@ -3541,7 +4264,16 @@ static int adm_del_resource(struct drbd_resource *resource) if (!idr_is_empty(&resource->devices)) return ERR_RES_IN_USE; + /* The state engine has stopped the sender thread, so we don't + * need to flush the sender work queue before generating the + * DESTROY event here. */ + mutex_lock(¬ification_mutex); + notify_resource_state(NULL, 0, resource, NULL, NOTIFY_DESTROY); + mutex_unlock(¬ification_mutex); + + mutex_lock(&resources_mutex); list_del_rcu(&resource->resources); + mutex_unlock(&resources_mutex); /* Make sure all threads have actually stopped: state handling only * does drbd_thread_stop_nowait(). */ list_for_each_entry(connection, &resource->connections, connections) @@ -3637,7 +4369,6 @@ finish: void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib) { - static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */ struct sk_buff *msg; struct drbd_genlmsghdr *d_out; unsigned seq; @@ -3658,7 +4389,7 @@ void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib) if (nla_put_status_info(msg, device, sib)) goto nla_put_failure; genlmsg_end(msg, d_out); - err = drbd_genl_multicast_events(msg, 0); + err = drbd_genl_multicast_events(msg, GFP_NOWAIT); /* msg has been consumed or freed in netlink_broadcast() */ if (err && err != -ESRCH) goto failed; @@ -3672,3 +4403,405 @@ failed: "Event seq:%u sib_reason:%u\n", err, seq, sib->sib_reason); } + +static int nla_put_notification_header(struct sk_buff *msg, + enum drbd_notification_type type) +{ + struct drbd_notification_header nh = { + .nh_type = type, + }; + + return drbd_notification_header_to_skb(msg, &nh, true); +} + +void notify_resource_state(struct sk_buff *skb, + unsigned int seq, + struct drbd_resource *resource, + struct resource_info *resource_info, + enum drbd_notification_type type) +{ + struct resource_statistics resource_statistics; + struct drbd_genlmsghdr *dh; + bool multicast = false; + int err; + + if (!skb) { + seq = atomic_inc_return(¬ify_genl_seq); + skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO); + err = -ENOMEM; + if (!skb) + goto failed; + multicast = true; + } + + err = -EMSGSIZE; + dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_RESOURCE_STATE); + if (!dh) + goto nla_put_failure; + dh->minor = -1U; + dh->ret_code = NO_ERROR; + if (nla_put_drbd_cfg_context(skb, resource, NULL, NULL) || + nla_put_notification_header(skb, type) || + ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY && + resource_info_to_skb(skb, resource_info, true))) + goto nla_put_failure; + resource_statistics.res_stat_write_ordering = resource->write_ordering; + err = resource_statistics_to_skb(skb, &resource_statistics, !capable(CAP_SYS_ADMIN)); + if (err) + goto nla_put_failure; + genlmsg_end(skb, dh); + if (multicast) { + err = drbd_genl_multicast_events(skb, GFP_NOWAIT); + /* skb has been consumed or freed in netlink_broadcast() */ + if (err && err != -ESRCH) + goto failed; + } + return; + +nla_put_failure: + nlmsg_free(skb); +failed: + drbd_err(resource, "Error %d while broadcasting event. Event seq:%u\n", + err, seq); +} + +void notify_device_state(struct sk_buff *skb, + unsigned int seq, + struct drbd_device *device, + struct device_info *device_info, + enum drbd_notification_type type) +{ + struct device_statistics device_statistics; + struct drbd_genlmsghdr *dh; + bool multicast = false; + int err; + + if (!skb) { + seq = atomic_inc_return(¬ify_genl_seq); + skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO); + err = -ENOMEM; + if (!skb) + goto failed; + multicast = true; + } + + err = -EMSGSIZE; + dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_DEVICE_STATE); + if (!dh) + goto nla_put_failure; + dh->minor = device->minor; + dh->ret_code = NO_ERROR; + if (nla_put_drbd_cfg_context(skb, device->resource, NULL, device) || + nla_put_notification_header(skb, type) || + ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY && + device_info_to_skb(skb, device_info, true))) + goto nla_put_failure; + device_to_statistics(&device_statistics, device); + device_statistics_to_skb(skb, &device_statistics, !capable(CAP_SYS_ADMIN)); + genlmsg_end(skb, dh); + if (multicast) { + err = drbd_genl_multicast_events(skb, GFP_NOWAIT); + /* skb has been consumed or freed in netlink_broadcast() */ + if (err && err != -ESRCH) + goto failed; + } + return; + +nla_put_failure: + nlmsg_free(skb); +failed: + drbd_err(device, "Error %d while broadcasting event. Event seq:%u\n", + err, seq); +} + +void notify_connection_state(struct sk_buff *skb, + unsigned int seq, + struct drbd_connection *connection, + struct connection_info *connection_info, + enum drbd_notification_type type) +{ + struct connection_statistics connection_statistics; + struct drbd_genlmsghdr *dh; + bool multicast = false; + int err; + + if (!skb) { + seq = atomic_inc_return(¬ify_genl_seq); + skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO); + err = -ENOMEM; + if (!skb) + goto failed; + multicast = true; + } + + err = -EMSGSIZE; + dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_CONNECTION_STATE); + if (!dh) + goto nla_put_failure; + dh->minor = -1U; + dh->ret_code = NO_ERROR; + if (nla_put_drbd_cfg_context(skb, connection->resource, connection, NULL) || + nla_put_notification_header(skb, type) || + ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY && + connection_info_to_skb(skb, connection_info, true))) + goto nla_put_failure; + connection_statistics.conn_congested = test_bit(NET_CONGESTED, &connection->flags); + connection_statistics_to_skb(skb, &connection_statistics, !capable(CAP_SYS_ADMIN)); + genlmsg_end(skb, dh); + if (multicast) { + err = drbd_genl_multicast_events(skb, GFP_NOWAIT); + /* skb has been consumed or freed in netlink_broadcast() */ + if (err && err != -ESRCH) + goto failed; + } + return; + +nla_put_failure: + nlmsg_free(skb); +failed: + drbd_err(connection, "Error %d while broadcasting event. Event seq:%u\n", + err, seq); +} + +void notify_peer_device_state(struct sk_buff *skb, + unsigned int seq, + struct drbd_peer_device *peer_device, + struct peer_device_info *peer_device_info, + enum drbd_notification_type type) +{ + struct peer_device_statistics peer_device_statistics; + struct drbd_resource *resource = peer_device->device->resource; + struct drbd_genlmsghdr *dh; + bool multicast = false; + int err; + + if (!skb) { + seq = atomic_inc_return(¬ify_genl_seq); + skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO); + err = -ENOMEM; + if (!skb) + goto failed; + multicast = true; + } + + err = -EMSGSIZE; + dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_PEER_DEVICE_STATE); + if (!dh) + goto nla_put_failure; + dh->minor = -1U; + dh->ret_code = NO_ERROR; + if (nla_put_drbd_cfg_context(skb, resource, peer_device->connection, peer_device->device) || + nla_put_notification_header(skb, type) || + ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY && + peer_device_info_to_skb(skb, peer_device_info, true))) + goto nla_put_failure; + peer_device_to_statistics(&peer_device_statistics, peer_device); + peer_device_statistics_to_skb(skb, &peer_device_statistics, !capable(CAP_SYS_ADMIN)); + genlmsg_end(skb, dh); + if (multicast) { + err = drbd_genl_multicast_events(skb, GFP_NOWAIT); + /* skb has been consumed or freed in netlink_broadcast() */ + if (err && err != -ESRCH) + goto failed; + } + return; + +nla_put_failure: + nlmsg_free(skb); +failed: + drbd_err(peer_device, "Error %d while broadcasting event. Event seq:%u\n", + err, seq); +} + +void notify_helper(enum drbd_notification_type type, + struct drbd_device *device, struct drbd_connection *connection, + const char *name, int status) +{ + struct drbd_resource *resource = device ? device->resource : connection->resource; + struct drbd_helper_info helper_info; + unsigned int seq = atomic_inc_return(¬ify_genl_seq); + struct sk_buff *skb = NULL; + struct drbd_genlmsghdr *dh; + int err; + + strlcpy(helper_info.helper_name, name, sizeof(helper_info.helper_name)); + helper_info.helper_name_len = min(strlen(name), sizeof(helper_info.helper_name)); + helper_info.helper_status = status; + + skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO); + err = -ENOMEM; + if (!skb) + goto fail; + + err = -EMSGSIZE; + dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_HELPER); + if (!dh) + goto fail; + dh->minor = device ? device->minor : -1; + dh->ret_code = NO_ERROR; + mutex_lock(¬ification_mutex); + if (nla_put_drbd_cfg_context(skb, resource, connection, device) || + nla_put_notification_header(skb, type) || + drbd_helper_info_to_skb(skb, &helper_info, true)) + goto unlock_fail; + genlmsg_end(skb, dh); + err = drbd_genl_multicast_events(skb, GFP_NOWAIT); + skb = NULL; + /* skb has been consumed or freed in netlink_broadcast() */ + if (err && err != -ESRCH) + goto unlock_fail; + mutex_unlock(¬ification_mutex); + return; + +unlock_fail: + mutex_unlock(¬ification_mutex); +fail: + nlmsg_free(skb); + drbd_err(resource, "Error %d while broadcasting event. Event seq:%u\n", + err, seq); +} + +static void notify_initial_state_done(struct sk_buff *skb, unsigned int seq) +{ + struct drbd_genlmsghdr *dh; + int err; + + err = -EMSGSIZE; + dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_INITIAL_STATE_DONE); + if (!dh) + goto nla_put_failure; + dh->minor = -1U; + dh->ret_code = NO_ERROR; + if (nla_put_notification_header(skb, NOTIFY_EXISTS)) + goto nla_put_failure; + genlmsg_end(skb, dh); + return; + +nla_put_failure: + nlmsg_free(skb); + pr_err("Error %d sending event. Event seq:%u\n", err, seq); +} + +static void free_state_changes(struct list_head *list) +{ + while (!list_empty(list)) { + struct drbd_state_change *state_change = + list_first_entry(list, struct drbd_state_change, list); + list_del(&state_change->list); + forget_state_change(state_change); + } +} + +static unsigned int notifications_for_state_change(struct drbd_state_change *state_change) +{ + return 1 + + state_change->n_connections + + state_change->n_devices + + state_change->n_devices * state_change->n_connections; +} + +static int get_initial_state(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct drbd_state_change *state_change = (struct drbd_state_change *)cb->args[0]; + unsigned int seq = cb->args[2]; + unsigned int n; + enum drbd_notification_type flags = 0; + + /* There is no need for taking notification_mutex here: it doesn't + matter if the initial state events mix with later state chage + events; we can always tell the events apart by the NOTIFY_EXISTS + flag. */ + + cb->args[5]--; + if (cb->args[5] == 1) { + notify_initial_state_done(skb, seq); + goto out; + } + n = cb->args[4]++; + if (cb->args[4] < cb->args[3]) + flags |= NOTIFY_CONTINUES; + if (n < 1) { + notify_resource_state_change(skb, seq, state_change->resource, + NOTIFY_EXISTS | flags); + goto next; + } + n--; + if (n < state_change->n_connections) { + notify_connection_state_change(skb, seq, &state_change->connections[n], + NOTIFY_EXISTS | flags); + goto next; + } + n -= state_change->n_connections; + if (n < state_change->n_devices) { + notify_device_state_change(skb, seq, &state_change->devices[n], + NOTIFY_EXISTS | flags); + goto next; + } + n -= state_change->n_devices; + if (n < state_change->n_devices * state_change->n_connections) { + notify_peer_device_state_change(skb, seq, &state_change->peer_devices[n], + NOTIFY_EXISTS | flags); + goto next; + } + +next: + if (cb->args[4] == cb->args[3]) { + struct drbd_state_change *next_state_change = + list_entry(state_change->list.next, + struct drbd_state_change, list); + cb->args[0] = (long)next_state_change; + cb->args[3] = notifications_for_state_change(next_state_change); + cb->args[4] = 0; + } +out: + return skb->len; +} + +int drbd_adm_get_initial_state(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct drbd_resource *resource; + LIST_HEAD(head); + + if (cb->args[5] >= 1) { + if (cb->args[5] > 1) + return get_initial_state(skb, cb); + if (cb->args[0]) { + struct drbd_state_change *state_change = + (struct drbd_state_change *)cb->args[0]; + + /* connect list to head */ + list_add(&head, &state_change->list); + free_state_changes(&head); + } + return 0; + } + + cb->args[5] = 2; /* number of iterations */ + mutex_lock(&resources_mutex); + for_each_resource(resource, &drbd_resources) { + struct drbd_state_change *state_change; + + state_change = remember_old_state(resource, GFP_KERNEL); + if (!state_change) { + if (!list_empty(&head)) + free_state_changes(&head); + mutex_unlock(&resources_mutex); + return -ENOMEM; + } + copy_old_to_new_state_change(state_change); + list_add_tail(&state_change->list, &head); + cb->args[5] += notifications_for_state_change(state_change); + } + mutex_unlock(&resources_mutex); + + if (!list_empty(&head)) { + struct drbd_state_change *state_change = + list_entry(head.next, struct drbd_state_change, list); + cb->args[0] = (long)state_change; + cb->args[3] = notifications_for_state_change(state_change); + list_del(&head); /* detach list from head */ + } + + cb->args[2] = cb->nlh->nlmsg_seq; + return get_initial_state(skb, cb); +} diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index 3b10fa6cb039..6537b25db9c1 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -245,9 +245,9 @@ static int drbd_seq_show(struct seq_file *seq, void *v) char wp; static char write_ordering_chars[] = { - [WO_none] = 'n', - [WO_drain_io] = 'd', - [WO_bdev_flush] = 'f', + [WO_NONE] = 'n', + [WO_DRAIN_IO] = 'd', + [WO_BDEV_FLUSH] = 'f', }; seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n", diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h index 2da9104a3851..ef9245363dcc 100644 --- a/drivers/block/drbd/drbd_protocol.h +++ b/drivers/block/drbd/drbd_protocol.h @@ -23,7 +23,7 @@ enum drbd_packet { P_AUTH_RESPONSE = 0x11, P_STATE_CHG_REQ = 0x12, - /* asender (meta socket */ + /* (meta socket) */ P_PING = 0x13, P_PING_ACK = 0x14, P_RECV_ACK = 0x15, /* Used in protocol B */ diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index b4b5680ac6ad..1957fe8601dc 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -215,7 +215,7 @@ static void reclaim_finished_net_peer_reqs(struct drbd_device *device, } } -static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device) +static void drbd_reclaim_net_peer_reqs(struct drbd_device *device) { LIST_HEAD(reclaimed); struct drbd_peer_request *peer_req, *t; @@ -223,11 +223,30 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device) spin_lock_irq(&device->resource->req_lock); reclaim_finished_net_peer_reqs(device, &reclaimed); spin_unlock_irq(&device->resource->req_lock); - list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) drbd_free_net_peer_req(device, peer_req); } +static void conn_reclaim_net_peer_reqs(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + if (!atomic_read(&device->pp_in_use_by_net)) + continue; + + kref_get(&device->kref); + rcu_read_unlock(); + drbd_reclaim_net_peer_reqs(device); + kref_put(&device->kref, drbd_destroy_device); + rcu_read_lock(); + } + rcu_read_unlock(); +} + /** * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled) * @device: DRBD device. @@ -265,10 +284,15 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int if (atomic_read(&device->pp_in_use) < mxb) page = __drbd_alloc_pages(device, number); + /* Try to keep the fast path fast, but occasionally we need + * to reclaim the pages we lended to the network stack. */ + if (page && atomic_read(&device->pp_in_use_by_net) > 512) + drbd_reclaim_net_peer_reqs(device); + while (page == NULL) { prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); - drbd_kick_lo_and_reclaim_net(device); + drbd_reclaim_net_peer_reqs(device); if (atomic_read(&device->pp_in_use) < mxb) { page = __drbd_alloc_pages(device, number); @@ -1099,7 +1123,15 @@ randomize: return 0; } - drbd_thread_start(&connection->asender); + drbd_thread_start(&connection->ack_receiver); + /* opencoded create_singlethread_workqueue(), + * to be able to use format string arguments */ + connection->ack_sender = + alloc_ordered_workqueue("drbd_as_%s", WQ_MEM_RECLAIM, connection->resource->name); + if (!connection->ack_sender) { + drbd_err(connection, "Failed to create workqueue ack_sender\n"); + return 0; + } mutex_lock(&connection->resource->conf_update); /* The discard_my_data flag is a single-shot modifier to the next @@ -1178,7 +1210,7 @@ static void drbd_flush(struct drbd_connection *connection) struct drbd_peer_device *peer_device; int vnr; - if (connection->resource->write_ordering >= WO_bdev_flush) { + if (connection->resource->write_ordering >= WO_BDEV_FLUSH) { rcu_read_lock(); idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { struct drbd_device *device = peer_device->device; @@ -1203,7 +1235,7 @@ static void drbd_flush(struct drbd_connection *connection) /* would rather check on EOPNOTSUPP, but that is not reliable. * don't try again for ANY return value != 0 * if (rv == -EOPNOTSUPP) */ - drbd_bump_write_ordering(connection->resource, NULL, WO_drain_io); + drbd_bump_write_ordering(connection->resource, NULL, WO_DRAIN_IO); } put_ldev(device); kref_put(&device->kref, drbd_destroy_device); @@ -1299,10 +1331,10 @@ max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo) dc = rcu_dereference(bdev->disk_conf); - if (wo == WO_bdev_flush && !dc->disk_flushes) - wo = WO_drain_io; - if (wo == WO_drain_io && !dc->disk_drain) - wo = WO_none; + if (wo == WO_BDEV_FLUSH && !dc->disk_flushes) + wo = WO_DRAIN_IO; + if (wo == WO_DRAIN_IO && !dc->disk_drain) + wo = WO_NONE; return wo; } @@ -1319,13 +1351,13 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin enum write_ordering_e pwo; int vnr; static char *write_ordering_str[] = { - [WO_none] = "none", - [WO_drain_io] = "drain", - [WO_bdev_flush] = "flush", + [WO_NONE] = "none", + [WO_DRAIN_IO] = "drain", + [WO_BDEV_FLUSH] = "flush", }; pwo = resource->write_ordering; - if (wo != WO_bdev_flush) + if (wo != WO_BDEV_FLUSH) wo = min(pwo, wo); rcu_read_lock(); idr_for_each_entry(&resource->devices, device, vnr) { @@ -1343,7 +1375,7 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin rcu_read_unlock(); resource->write_ordering = wo; - if (pwo != resource->write_ordering || wo == WO_bdev_flush) + if (pwo != resource->write_ordering || wo == WO_BDEV_FLUSH) drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]); } @@ -1380,7 +1412,7 @@ int drbd_submit_peer_request(struct drbd_device *device, if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) { /* wait for all pending IO completions, before we start * zeroing things out. */ - conn_wait_active_ee_empty(first_peer_device(device)->connection); + conn_wait_active_ee_empty(peer_req->peer_device->connection); /* add it to the active list now, * so we can find it to present it in debugfs */ peer_req->submit_jif = jiffies; @@ -1508,12 +1540,6 @@ static void conn_wait_active_ee_empty(struct drbd_connection *connection) rcu_read_unlock(); } -static struct drbd_peer_device * -conn_peer_device(struct drbd_connection *connection, int volume_number) -{ - return idr_find(&connection->peer_devices, volume_number); -} - static int receive_Barrier(struct drbd_connection *connection, struct packet_info *pi) { int rv; @@ -1533,7 +1559,7 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf * Therefore we must send the barrier_ack after the barrier request was * completed. */ switch (connection->resource->write_ordering) { - case WO_none: + case WO_NONE: if (rv == FE_RECYCLED) return 0; @@ -1546,8 +1572,8 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf drbd_warn(connection, "Allocation of an epoch failed, slowing down\n"); /* Fall through */ - case WO_bdev_flush: - case WO_drain_io: + case WO_BDEV_FLUSH: + case WO_DRAIN_IO: conn_wait_active_ee_empty(connection); drbd_flush(connection); @@ -1752,7 +1778,7 @@ static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_req } /* - * e_end_resync_block() is called in asender context via + * e_end_resync_block() is called in ack_sender context via * drbd_finish_peer_reqs(). */ static int e_end_resync_block(struct drbd_work *w, int unused) @@ -1926,7 +1952,7 @@ static void restart_conflicting_writes(struct drbd_device *device, } /* - * e_end_block() is called in asender context via drbd_finish_peer_reqs(). + * e_end_block() is called in ack_sender context via drbd_finish_peer_reqs(). */ static int e_end_block(struct drbd_work *w, int cancel) { @@ -1966,7 +1992,7 @@ static int e_end_block(struct drbd_work *w, int cancel) } else D_ASSERT(device, drbd_interval_empty(&peer_req->i)); - drbd_may_finish_epoch(first_peer_device(device)->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0)); + drbd_may_finish_epoch(peer_device->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0)); return err; } @@ -2098,7 +2124,7 @@ static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, co } rcu_read_lock(); - tp = rcu_dereference(first_peer_device(device)->connection->net_conf)->two_primaries; + tp = rcu_dereference(peer_device->connection->net_conf)->two_primaries; rcu_read_unlock(); if (!tp) @@ -2217,7 +2243,7 @@ static int handle_write_conflicts(struct drbd_device *device, peer_req->w.cb = superseded ? e_send_superseded : e_send_retry_write; list_add_tail(&peer_req->w.list, &device->done_ee); - wake_asender(connection); + queue_work(connection->ack_sender, &peer_req->peer_device->send_acks_work); err = -ENOENT; goto out; @@ -2364,7 +2390,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * if (dp_flags & DP_SEND_RECEIVE_ACK) { /* I really don't like it that the receiver thread * sends on the msock, but anyways */ - drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req); + drbd_send_ack(peer_device, P_RECV_ACK, peer_req); } if (tp) { @@ -4056,7 +4082,7 @@ static int receive_state(struct drbd_connection *connection, struct packet_info os = ns = drbd_read_state(device); spin_unlock_irq(&device->resource->req_lock); - /* If some other part of the code (asender thread, timeout) + /* If some other part of the code (ack_receiver thread, timeout) * already decided to close the connection again, * we must not "re-establish" it here. */ if (os.conn <= C_TEAR_DOWN) @@ -4661,8 +4687,12 @@ static void conn_disconnect(struct drbd_connection *connection) */ conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD); - /* asender does not clean up anything. it must not interfere, either */ - drbd_thread_stop(&connection->asender); + /* ack_receiver does not clean up anything. it must not interfere, either */ + drbd_thread_stop(&connection->ack_receiver); + if (connection->ack_sender) { + destroy_workqueue(connection->ack_sender); + connection->ack_sender = NULL; + } drbd_free_sock(connection); rcu_read_lock(); @@ -5431,49 +5461,39 @@ static int got_skip(struct drbd_connection *connection, struct packet_info *pi) return 0; } -static int connection_finish_peer_reqs(struct drbd_connection *connection) +struct meta_sock_cmd { + size_t pkt_size; + int (*fn)(struct drbd_connection *connection, struct packet_info *); +}; + +static void set_rcvtimeo(struct drbd_connection *connection, bool ping_timeout) { - struct drbd_peer_device *peer_device; - int vnr, not_empty = 0; + long t; + struct net_conf *nc; - do { - clear_bit(SIGNAL_ASENDER, &connection->flags); - flush_signals(current); + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + t = ping_timeout ? nc->ping_timeo : nc->ping_int; + rcu_read_unlock(); - rcu_read_lock(); - idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { - struct drbd_device *device = peer_device->device; - kref_get(&device->kref); - rcu_read_unlock(); - if (drbd_finish_peer_reqs(device)) { - kref_put(&device->kref, drbd_destroy_device); - return 1; - } - kref_put(&device->kref, drbd_destroy_device); - rcu_read_lock(); - } - set_bit(SIGNAL_ASENDER, &connection->flags); + t *= HZ; + if (ping_timeout) + t /= 10; - spin_lock_irq(&connection->resource->req_lock); - idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { - struct drbd_device *device = peer_device->device; - not_empty = !list_empty(&device->done_ee); - if (not_empty) - break; - } - spin_unlock_irq(&connection->resource->req_lock); - rcu_read_unlock(); - } while (not_empty); + connection->meta.socket->sk->sk_rcvtimeo = t; +} - return 0; +static void set_ping_timeout(struct drbd_connection *connection) +{ + set_rcvtimeo(connection, 1); } -struct asender_cmd { - size_t pkt_size; - int (*fn)(struct drbd_connection *connection, struct packet_info *); -}; +static void set_idle_timeout(struct drbd_connection *connection) +{ + set_rcvtimeo(connection, 0); +} -static struct asender_cmd asender_tbl[] = { +static struct meta_sock_cmd ack_receiver_tbl[] = { [P_PING] = { 0, got_Ping }, [P_PING_ACK] = { 0, got_PingAck }, [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, @@ -5493,64 +5513,40 @@ static struct asender_cmd asender_tbl[] = { [P_RETRY_WRITE] = { sizeof(struct p_block_ack), got_BlockAck }, }; -int drbd_asender(struct drbd_thread *thi) +int drbd_ack_receiver(struct drbd_thread *thi) { struct drbd_connection *connection = thi->connection; - struct asender_cmd *cmd = NULL; + struct meta_sock_cmd *cmd = NULL; struct packet_info pi; + unsigned long pre_recv_jif; int rv; void *buf = connection->meta.rbuf; int received = 0; unsigned int header_size = drbd_header_size(connection); int expect = header_size; bool ping_timeout_active = false; - struct net_conf *nc; - int ping_timeo, tcp_cork, ping_int; struct sched_param param = { .sched_priority = 2 }; rv = sched_setscheduler(current, SCHED_RR, ¶m); if (rv < 0) - drbd_err(connection, "drbd_asender: ERROR set priority, ret=%d\n", rv); + drbd_err(connection, "drbd_ack_receiver: ERROR set priority, ret=%d\n", rv); while (get_t_state(thi) == RUNNING) { drbd_thread_current_set_cpu(thi); - rcu_read_lock(); - nc = rcu_dereference(connection->net_conf); - ping_timeo = nc->ping_timeo; - tcp_cork = nc->tcp_cork; - ping_int = nc->ping_int; - rcu_read_unlock(); + conn_reclaim_net_peer_reqs(connection); if (test_and_clear_bit(SEND_PING, &connection->flags)) { if (drbd_send_ping(connection)) { drbd_err(connection, "drbd_send_ping has failed\n"); goto reconnect; } - connection->meta.socket->sk->sk_rcvtimeo = ping_timeo * HZ / 10; + set_ping_timeout(connection); ping_timeout_active = true; } - /* TODO: conditionally cork; it may hurt latency if we cork without - much to send */ - if (tcp_cork) - drbd_tcp_cork(connection->meta.socket); - if (connection_finish_peer_reqs(connection)) { - drbd_err(connection, "connection_finish_peer_reqs() failed\n"); - goto reconnect; - } - /* but unconditionally uncork unless disabled */ - if (tcp_cork) - drbd_tcp_uncork(connection->meta.socket); - - /* short circuit, recv_msg would return EINTR anyways. */ - if (signal_pending(current)) - continue; - + pre_recv_jif = jiffies; rv = drbd_recv_short(connection->meta.socket, buf, expect-received, 0); - clear_bit(SIGNAL_ASENDER, &connection->flags); - - flush_signals(current); /* Note: * -EINTR (on meta) we got a signal @@ -5562,7 +5558,6 @@ int drbd_asender(struct drbd_thread *thi) * rv < expected: "woken" by signal during receive * rv == 0 : "connection shut down by peer" */ -received_more: if (likely(rv > 0)) { received += rv; buf += rv; @@ -5584,8 +5579,7 @@ received_more: } else if (rv == -EAGAIN) { /* If the data socket received something meanwhile, * that is good enough: peer is still alive. */ - if (time_after(connection->last_received, - jiffies - connection->meta.socket->sk->sk_rcvtimeo)) + if (time_after(connection->last_received, pre_recv_jif)) continue; if (ping_timeout_active) { drbd_err(connection, "PingAck did not arrive in time.\n"); @@ -5594,6 +5588,10 @@ received_more: set_bit(SEND_PING, &connection->flags); continue; } else if (rv == -EINTR) { + /* maybe drbd_thread_stop(): the while condition will notice. + * maybe woken for send_ping: we'll send a ping above, + * and change the rcvtimeo */ + flush_signals(current); continue; } else { drbd_err(connection, "sock_recvmsg returned %d\n", rv); @@ -5603,8 +5601,8 @@ received_more: if (received == expect && cmd == NULL) { if (decode_header(connection, connection->meta.rbuf, &pi)) goto reconnect; - cmd = &asender_tbl[pi.cmd]; - if (pi.cmd >= ARRAY_SIZE(asender_tbl) || !cmd->fn) { + cmd = &ack_receiver_tbl[pi.cmd]; + if (pi.cmd >= ARRAY_SIZE(ack_receiver_tbl) || !cmd->fn) { drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n", cmdname(pi.cmd), pi.cmd); goto disconnect; @@ -5627,9 +5625,8 @@ received_more: connection->last_received = jiffies; - if (cmd == &asender_tbl[P_PING_ACK]) { - /* restore idle timeout */ - connection->meta.socket->sk->sk_rcvtimeo = ping_int * HZ; + if (cmd == &ack_receiver_tbl[P_PING_ACK]) { + set_idle_timeout(connection); ping_timeout_active = false; } @@ -5638,11 +5635,6 @@ received_more: expect = header_size; cmd = NULL; } - if (test_bit(SEND_PING, &connection->flags)) - continue; - rv = drbd_recv_short(connection->meta.socket, buf, expect-received, MSG_DONTWAIT); - if (rv > 0) - goto received_more; } if (0) { @@ -5654,9 +5646,41 @@ reconnect: disconnect: conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); } - clear_bit(SIGNAL_ASENDER, &connection->flags); - drbd_info(connection, "asender terminated\n"); + drbd_info(connection, "ack_receiver terminated\n"); return 0; } + +void drbd_send_acks_wf(struct work_struct *ws) +{ + struct drbd_peer_device *peer_device = + container_of(ws, struct drbd_peer_device, send_acks_work); + struct drbd_connection *connection = peer_device->connection; + struct drbd_device *device = peer_device->device; + struct net_conf *nc; + int tcp_cork, err; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + tcp_cork = nc->tcp_cork; + rcu_read_unlock(); + + if (tcp_cork) + drbd_tcp_cork(connection->meta.socket); + + err = drbd_finish_peer_reqs(device); + kref_put(&device->kref, drbd_destroy_device); + /* get is in drbd_endio_write_sec_final(). That is necessary to keep the + struct work_struct send_acks_work alive, which is in the peer_device object */ + + if (err) { + conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD); + return; + } + + if (tcp_cork) + drbd_tcp_uncork(connection->meta.socket); + + return; +} diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 3ae2c0086563..2255dcfebd2b 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -453,12 +453,12 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m, kref_get(&req->kref); /* wait for the DONE */ if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) { - /* potentially already completed in the asender thread */ + /* potentially already completed in the ack_receiver thread */ if (!(s & RQ_NET_DONE)) { atomic_add(req->i.size >> 9, &device->ap_in_flight); set_if_null_req_not_net_done(peer_device, req); } - if (s & RQ_NET_PENDING) + if (req->rq_state & RQ_NET_PENDING) set_if_null_req_ack_pending(peer_device, req); } @@ -1095,6 +1095,24 @@ static bool do_remote_read(struct drbd_request *req) return false; } +bool drbd_should_do_remote(union drbd_dev_state s) +{ + return s.pdsk == D_UP_TO_DATE || + (s.pdsk >= D_INCONSISTENT && + s.conn >= C_WF_BITMAP_T && + s.conn < C_AHEAD); + /* Before proto 96 that was >= CONNECTED instead of >= C_WF_BITMAP_T. + That is equivalent since before 96 IO was frozen in the C_WF_BITMAP* + states. */ +} + +static bool drbd_should_send_out_of_sync(union drbd_dev_state s) +{ + return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S; + /* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary + since we enter state C_AHEAD only if proto >= 96 */ +} + /* returns number of connections (== 1, for drbd 8.4) * expected to actually write this data, * which does NOT include those that we are L_AHEAD for. */ @@ -1149,7 +1167,6 @@ drbd_submit_req_private_bio(struct drbd_request *req) * stable storage, and this is a WRITE, we may not even submit * this bio. */ if (get_ldev(device)) { - req->pre_submit_jif = jiffies; if (drbd_insert_fault(device, rw == WRITE ? DRBD_FAULT_DT_WR : rw == READ ? DRBD_FAULT_DT_RD @@ -1293,6 +1310,7 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request &device->pending_master_completion[rw == WRITE]); if (req->private_bio) { /* needs to be marked within the same spinlock */ + req->pre_submit_jif = jiffies; list_add_tail(&req->req_pending_local, &device->pending_completion[rw == WRITE]); _req_mod(req, TO_BE_SUBMITTED); @@ -1513,6 +1531,78 @@ blk_qc_t drbd_make_request(struct request_queue *q, struct bio *bio) return BLK_QC_T_NONE; } +static bool net_timeout_reached(struct drbd_request *net_req, + struct drbd_connection *connection, + unsigned long now, unsigned long ent, + unsigned int ko_count, unsigned int timeout) +{ + struct drbd_device *device = net_req->device; + + if (!time_after(now, net_req->pre_send_jif + ent)) + return false; + + if (time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) + return false; + + if (net_req->rq_state & RQ_NET_PENDING) { + drbd_warn(device, "Remote failed to finish a request within %ums > ko-count (%u) * timeout (%u * 0.1s)\n", + jiffies_to_msecs(now - net_req->pre_send_jif), ko_count, timeout); + return true; + } + + /* We received an ACK already (or are using protocol A), + * but are waiting for the epoch closing barrier ack. + * Check if we sent the barrier already. We should not blame the peer + * for being unresponsive, if we did not even ask it yet. */ + if (net_req->epoch == connection->send.current_epoch_nr) { + drbd_warn(device, + "We did not send a P_BARRIER for %ums > ko-count (%u) * timeout (%u * 0.1s); drbd kernel thread blocked?\n", + jiffies_to_msecs(now - net_req->pre_send_jif), ko_count, timeout); + return false; + } + + /* Worst case: we may have been blocked for whatever reason, then + * suddenly are able to send a lot of requests (and epoch separating + * barriers) in quick succession. + * The timestamp of the net_req may be much too old and not correspond + * to the sending time of the relevant unack'ed barrier packet, so + * would trigger a spurious timeout. The latest barrier packet may + * have a too recent timestamp to trigger the timeout, potentially miss + * a timeout. Right now we don't have a place to conveniently store + * these timestamps. + * But in this particular situation, the application requests are still + * completed to upper layers, DRBD should still "feel" responsive. + * No need yet to kill this connection, it may still recover. + * If not, eventually we will have queued enough into the network for + * us to block. From that point of view, the timestamp of the last sent + * barrier packet is relevant enough. + */ + if (time_after(now, connection->send.last_sent_barrier_jif + ent)) { + drbd_warn(device, "Remote failed to answer a P_BARRIER (sent at %lu jif; now=%lu jif) within %ums > ko-count (%u) * timeout (%u * 0.1s)\n", + connection->send.last_sent_barrier_jif, now, + jiffies_to_msecs(now - connection->send.last_sent_barrier_jif), ko_count, timeout); + return true; + } + return false; +} + +/* A request is considered timed out, if + * - we have some effective timeout from the configuration, + * with some state restrictions applied, + * - the oldest request is waiting for a response from the network + * resp. the local disk, + * - the oldest request is in fact older than the effective timeout, + * - the connection was established (resp. disk was attached) + * for longer than the timeout already. + * Note that for 32bit jiffies and very stable connections/disks, + * we may have a wrap around, which is catched by + * !time_in_range(now, last_..._jif, last_..._jif + timeout). + * + * Side effect: once per 32bit wrap-around interval, which means every + * ~198 days with 250 HZ, we have a window where the timeout would need + * to expire twice (worst case) to become effective. Good enough. + */ + void request_timer_fn(unsigned long data) { struct drbd_device *device = (struct drbd_device *) data; @@ -1522,11 +1612,14 @@ void request_timer_fn(unsigned long data) unsigned long oldest_submit_jif; unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */ unsigned long now; + unsigned int ko_count = 0, timeout = 0; rcu_read_lock(); nc = rcu_dereference(connection->net_conf); - if (nc && device->state.conn >= C_WF_REPORT_PARAMS) - ent = nc->timeout * HZ/10 * nc->ko_count; + if (nc && device->state.conn >= C_WF_REPORT_PARAMS) { + ko_count = nc->ko_count; + timeout = nc->timeout; + } if (get_ldev(device)) { /* implicit state.disk >= D_INCONSISTENT */ dt = rcu_dereference(device->ldev->disk_conf)->disk_timeout * HZ / 10; @@ -1534,6 +1627,8 @@ void request_timer_fn(unsigned long data) } rcu_read_unlock(); + + ent = timeout * HZ/10 * ko_count; et = min_not_zero(dt, ent); if (!et) @@ -1545,11 +1640,22 @@ void request_timer_fn(unsigned long data) spin_lock_irq(&device->resource->req_lock); req_read = list_first_entry_or_null(&device->pending_completion[0], struct drbd_request, req_pending_local); req_write = list_first_entry_or_null(&device->pending_completion[1], struct drbd_request, req_pending_local); - req_peer = connection->req_not_net_done; + /* maybe the oldest request waiting for the peer is in fact still - * blocking in tcp sendmsg */ - if (!req_peer && connection->req_next && connection->req_next->pre_send_jif) - req_peer = connection->req_next; + * blocking in tcp sendmsg. That's ok, though, that's handled via the + * socket send timeout, requesting a ping, and bumping ko-count in + * we_should_drop_the_connection(). + */ + + /* check the oldest request we did successfully sent, + * but which is still waiting for an ACK. */ + req_peer = connection->req_ack_pending; + + /* if we don't have such request (e.g. protocoll A) + * check the oldest requests which is still waiting on its epoch + * closing barrier ack. */ + if (!req_peer) + req_peer = connection->req_not_net_done; /* evaluate the oldest peer request only in one timer! */ if (req_peer && req_peer->device != device) @@ -1566,28 +1672,9 @@ void request_timer_fn(unsigned long data) : req_write ? req_write->pre_submit_jif : req_read ? req_read->pre_submit_jif : now; - /* The request is considered timed out, if - * - we have some effective timeout from the configuration, - * with above state restrictions applied, - * - the oldest request is waiting for a response from the network - * resp. the local disk, - * - the oldest request is in fact older than the effective timeout, - * - the connection was established (resp. disk was attached) - * for longer than the timeout already. - * Note that for 32bit jiffies and very stable connections/disks, - * we may have a wrap around, which is catched by - * !time_in_range(now, last_..._jif, last_..._jif + timeout). - * - * Side effect: once per 32bit wrap-around interval, which means every - * ~198 days with 250 HZ, we have a window where the timeout would need - * to expire twice (worst case) to become effective. Good enough. - */ - if (ent && req_peer && - time_after(now, req_peer->pre_send_jif + ent) && - !time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) { - drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n"); + if (ent && req_peer && net_timeout_reached(req_peer, connection, now, ent, ko_count, timeout)) _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_VERBOSE | CS_HARD); - } + if (dt && oldest_submit_jif != now && time_after(now, oldest_submit_jif + dt) && !time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) { diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 9f6a04080e9f..bb2ef78165e5 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -331,21 +331,6 @@ static inline int req_mod(struct drbd_request *req, return rv; } -static inline bool drbd_should_do_remote(union drbd_dev_state s) -{ - return s.pdsk == D_UP_TO_DATE || - (s.pdsk >= D_INCONSISTENT && - s.conn >= C_WF_BITMAP_T && - s.conn < C_AHEAD); - /* Before proto 96 that was >= CONNECTED instead of >= C_WF_BITMAP_T. - That is equivalent since before 96 IO was frozen in the C_WF_BITMAP* - states. */ -} -static inline bool drbd_should_send_out_of_sync(union drbd_dev_state s) -{ - return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S; - /* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary - since we enter state C_AHEAD only if proto >= 96 */ -} +extern bool drbd_should_do_remote(union drbd_dev_state); #endif diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 2d7dd269b6a8..5a7ef7873b67 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -29,6 +29,7 @@ #include "drbd_int.h" #include "drbd_protocol.h" #include "drbd_req.h" +#include "drbd_state_change.h" struct after_state_chg_work { struct drbd_work w; @@ -37,6 +38,7 @@ struct after_state_chg_work { union drbd_state ns; enum chg_state_flags flags; struct completion *done; + struct drbd_state_change *state_change; }; enum sanitize_state_warnings { @@ -48,9 +50,248 @@ enum sanitize_state_warnings { IMPLICITLY_UPGRADED_PDSK, }; +static void count_objects(struct drbd_resource *resource, + unsigned int *n_devices, + unsigned int *n_connections) +{ + struct drbd_device *device; + struct drbd_connection *connection; + int vnr; + + *n_devices = 0; + *n_connections = 0; + + idr_for_each_entry(&resource->devices, device, vnr) + (*n_devices)++; + for_each_connection(connection, resource) + (*n_connections)++; +} + +static struct drbd_state_change *alloc_state_change(unsigned int n_devices, unsigned int n_connections, gfp_t gfp) +{ + struct drbd_state_change *state_change; + unsigned int size, n; + + size = sizeof(struct drbd_state_change) + + n_devices * sizeof(struct drbd_device_state_change) + + n_connections * sizeof(struct drbd_connection_state_change) + + n_devices * n_connections * sizeof(struct drbd_peer_device_state_change); + state_change = kmalloc(size, gfp); + if (!state_change) + return NULL; + state_change->n_devices = n_devices; + state_change->n_connections = n_connections; + state_change->devices = (void *)(state_change + 1); + state_change->connections = (void *)&state_change->devices[n_devices]; + state_change->peer_devices = (void *)&state_change->connections[n_connections]; + state_change->resource->resource = NULL; + for (n = 0; n < n_devices; n++) + state_change->devices[n].device = NULL; + for (n = 0; n < n_connections; n++) + state_change->connections[n].connection = NULL; + return state_change; +} + +struct drbd_state_change *remember_old_state(struct drbd_resource *resource, gfp_t gfp) +{ + struct drbd_state_change *state_change; + struct drbd_device *device; + unsigned int n_devices; + struct drbd_connection *connection; + unsigned int n_connections; + int vnr; + + struct drbd_device_state_change *device_state_change; + struct drbd_peer_device_state_change *peer_device_state_change; + struct drbd_connection_state_change *connection_state_change; + + /* Caller holds req_lock spinlock. + * No state, no device IDR, no connections lists can change. */ + count_objects(resource, &n_devices, &n_connections); + state_change = alloc_state_change(n_devices, n_connections, gfp); + if (!state_change) + return NULL; + + kref_get(&resource->kref); + state_change->resource->resource = resource; + state_change->resource->role[OLD] = + conn_highest_role(first_connection(resource)); + state_change->resource->susp[OLD] = resource->susp; + state_change->resource->susp_nod[OLD] = resource->susp_nod; + state_change->resource->susp_fen[OLD] = resource->susp_fen; + + connection_state_change = state_change->connections; + for_each_connection(connection, resource) { + kref_get(&connection->kref); + connection_state_change->connection = connection; + connection_state_change->cstate[OLD] = + connection->cstate; + connection_state_change->peer_role[OLD] = + conn_highest_peer(connection); + connection_state_change++; + } + + device_state_change = state_change->devices; + peer_device_state_change = state_change->peer_devices; + idr_for_each_entry(&resource->devices, device, vnr) { + kref_get(&device->kref); + device_state_change->device = device; + device_state_change->disk_state[OLD] = device->state.disk; + + /* The peer_devices for each device have to be enumerated in + the order of the connections. We may not use for_each_peer_device() here. */ + for_each_connection(connection, resource) { + struct drbd_peer_device *peer_device; + + peer_device = conn_peer_device(connection, device->vnr); + peer_device_state_change->peer_device = peer_device; + peer_device_state_change->disk_state[OLD] = + device->state.pdsk; + peer_device_state_change->repl_state[OLD] = + max_t(enum drbd_conns, + C_WF_REPORT_PARAMS, device->state.conn); + peer_device_state_change->resync_susp_user[OLD] = + device->state.user_isp; + peer_device_state_change->resync_susp_peer[OLD] = + device->state.peer_isp; + peer_device_state_change->resync_susp_dependency[OLD] = + device->state.aftr_isp; + peer_device_state_change++; + } + device_state_change++; + } + + return state_change; +} + +static void remember_new_state(struct drbd_state_change *state_change) +{ + struct drbd_resource_state_change *resource_state_change; + struct drbd_resource *resource; + unsigned int n; + + if (!state_change) + return; + + resource_state_change = &state_change->resource[0]; + resource = resource_state_change->resource; + + resource_state_change->role[NEW] = + conn_highest_role(first_connection(resource)); + resource_state_change->susp[NEW] = resource->susp; + resource_state_change->susp_nod[NEW] = resource->susp_nod; + resource_state_change->susp_fen[NEW] = resource->susp_fen; + + for (n = 0; n < state_change->n_devices; n++) { + struct drbd_device_state_change *device_state_change = + &state_change->devices[n]; + struct drbd_device *device = device_state_change->device; + + device_state_change->disk_state[NEW] = device->state.disk; + } + + for (n = 0; n < state_change->n_connections; n++) { + struct drbd_connection_state_change *connection_state_change = + &state_change->connections[n]; + struct drbd_connection *connection = + connection_state_change->connection; + + connection_state_change->cstate[NEW] = connection->cstate; + connection_state_change->peer_role[NEW] = + conn_highest_peer(connection); + } + + for (n = 0; n < state_change->n_devices * state_change->n_connections; n++) { + struct drbd_peer_device_state_change *peer_device_state_change = + &state_change->peer_devices[n]; + struct drbd_device *device = + peer_device_state_change->peer_device->device; + union drbd_dev_state state = device->state; + + peer_device_state_change->disk_state[NEW] = state.pdsk; + peer_device_state_change->repl_state[NEW] = + max_t(enum drbd_conns, C_WF_REPORT_PARAMS, state.conn); + peer_device_state_change->resync_susp_user[NEW] = + state.user_isp; + peer_device_state_change->resync_susp_peer[NEW] = + state.peer_isp; + peer_device_state_change->resync_susp_dependency[NEW] = + state.aftr_isp; + } +} + +void copy_old_to_new_state_change(struct drbd_state_change *state_change) +{ + struct drbd_resource_state_change *resource_state_change = &state_change->resource[0]; + unsigned int n_device, n_connection, n_peer_device, n_peer_devices; + +#define OLD_TO_NEW(x) \ + (x[NEW] = x[OLD]) + + OLD_TO_NEW(resource_state_change->role); + OLD_TO_NEW(resource_state_change->susp); + OLD_TO_NEW(resource_state_change->susp_nod); + OLD_TO_NEW(resource_state_change->susp_fen); + + for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) { + struct drbd_connection_state_change *connection_state_change = + &state_change->connections[n_connection]; + + OLD_TO_NEW(connection_state_change->peer_role); + OLD_TO_NEW(connection_state_change->cstate); + } + + for (n_device = 0; n_device < state_change->n_devices; n_device++) { + struct drbd_device_state_change *device_state_change = + &state_change->devices[n_device]; + + OLD_TO_NEW(device_state_change->disk_state); + } + + n_peer_devices = state_change->n_devices * state_change->n_connections; + for (n_peer_device = 0; n_peer_device < n_peer_devices; n_peer_device++) { + struct drbd_peer_device_state_change *p = + &state_change->peer_devices[n_peer_device]; + + OLD_TO_NEW(p->disk_state); + OLD_TO_NEW(p->repl_state); + OLD_TO_NEW(p->resync_susp_user); + OLD_TO_NEW(p->resync_susp_peer); + OLD_TO_NEW(p->resync_susp_dependency); + } + +#undef OLD_TO_NEW +} + +void forget_state_change(struct drbd_state_change *state_change) +{ + unsigned int n; + + if (!state_change) + return; + + if (state_change->resource->resource) + kref_put(&state_change->resource->resource->kref, drbd_destroy_resource); + for (n = 0; n < state_change->n_devices; n++) { + struct drbd_device *device = state_change->devices[n].device; + + if (device) + kref_put(&device->kref, drbd_destroy_device); + } + for (n = 0; n < state_change->n_connections; n++) { + struct drbd_connection *connection = + state_change->connections[n].connection; + + if (connection) + kref_put(&connection->kref, drbd_destroy_connection); + } + kfree(state_change); +} + static int w_after_state_ch(struct drbd_work *w, int unused); static void after_state_ch(struct drbd_device *device, union drbd_state os, - union drbd_state ns, enum chg_state_flags flags); + union drbd_state ns, enum chg_state_flags flags, + struct drbd_state_change *); static enum drbd_state_rv is_valid_state(struct drbd_device *, union drbd_state); static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_connection *); static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns); @@ -93,6 +334,7 @@ static enum drbd_role max_role(enum drbd_role role1, enum drbd_role role2) return R_SECONDARY; return R_UNKNOWN; } + static enum drbd_role min_role(enum drbd_role role1, enum drbd_role role2) { if (role1 == R_UNKNOWN || role2 == R_UNKNOWN) @@ -937,7 +1179,7 @@ void drbd_resume_al(struct drbd_device *device) drbd_info(device, "Resumed AL updates\n"); } -/* helper for __drbd_set_state */ +/* helper for _drbd_set_state */ static void set_ov_position(struct drbd_device *device, enum drbd_conns cs) { if (first_peer_device(device)->connection->agreed_pro_version < 90) @@ -965,17 +1207,17 @@ static void set_ov_position(struct drbd_device *device, enum drbd_conns cs) } /** - * __drbd_set_state() - Set a new DRBD state + * _drbd_set_state() - Set a new DRBD state * @device: DRBD device. * @ns: new state. * @flags: Flags * @done: Optional completion, that will get completed after the after_state_ch() finished * - * Caller needs to hold req_lock, and global_state_lock. Do not call directly. + * Caller needs to hold req_lock. Do not call directly. */ enum drbd_state_rv -__drbd_set_state(struct drbd_device *device, union drbd_state ns, - enum chg_state_flags flags, struct completion *done) +_drbd_set_state(struct drbd_device *device, union drbd_state ns, + enum chg_state_flags flags, struct completion *done) { struct drbd_peer_device *peer_device = first_peer_device(device); struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; @@ -983,6 +1225,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, enum drbd_state_rv rv = SS_SUCCESS; enum sanitize_state_warnings ssw; struct after_state_chg_work *ascw; + struct drbd_state_change *state_change; os = drbd_read_state(device); @@ -1037,6 +1280,9 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, if (!is_sync_state(os.conn) && is_sync_state(ns.conn)) clear_bit(RS_DONE, &device->flags); + /* FIXME: Have any flags been set earlier in this function already? */ + state_change = remember_old_state(device->resource, GFP_ATOMIC); + /* changes to local_cnt and device flags should be visible before * changes to state, which again should be visible before anything else * depending on that change happens. */ @@ -1047,6 +1293,8 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, device->resource->susp_fen = ns.susp_fen; smp_wmb(); + remember_new_state(state_change); + /* put replicated vs not-replicated requests in seperate epochs */ if (drbd_should_do_remote((union drbd_dev_state)os.i) != drbd_should_do_remote((union drbd_dev_state)ns.i)) @@ -1184,6 +1432,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, ascw->w.cb = w_after_state_ch; ascw->device = device; ascw->done = done; + ascw->state_change = state_change; drbd_queue_work(&connection->sender_work, &ascw->w); } else { @@ -1199,7 +1448,8 @@ static int w_after_state_ch(struct drbd_work *w, int unused) container_of(w, struct after_state_chg_work, w); struct drbd_device *device = ascw->device; - after_state_ch(device, ascw->os, ascw->ns, ascw->flags); + after_state_ch(device, ascw->os, ascw->ns, ascw->flags, ascw->state_change); + forget_state_change(ascw->state_change); if (ascw->flags & CS_WAIT_COMPLETE) complete(ascw->done); kfree(ascw); @@ -1234,7 +1484,7 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device, D_ASSERT(device, current == first_peer_device(device)->connection->worker.task); /* open coded non-blocking drbd_suspend_io(device); */ - set_bit(SUSPEND_IO, &device->flags); + atomic_inc(&device->suspend_cnt); drbd_bm_lock(device, why, flags); rv = io_fn(device); @@ -1245,6 +1495,139 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device, return rv; } +void notify_resource_state_change(struct sk_buff *skb, + unsigned int seq, + struct drbd_resource_state_change *resource_state_change, + enum drbd_notification_type type) +{ + struct drbd_resource *resource = resource_state_change->resource; + struct resource_info resource_info = { + .res_role = resource_state_change->role[NEW], + .res_susp = resource_state_change->susp[NEW], + .res_susp_nod = resource_state_change->susp_nod[NEW], + .res_susp_fen = resource_state_change->susp_fen[NEW], + }; + + notify_resource_state(skb, seq, resource, &resource_info, type); +} + +void notify_connection_state_change(struct sk_buff *skb, + unsigned int seq, + struct drbd_connection_state_change *connection_state_change, + enum drbd_notification_type type) +{ + struct drbd_connection *connection = connection_state_change->connection; + struct connection_info connection_info = { + .conn_connection_state = connection_state_change->cstate[NEW], + .conn_role = connection_state_change->peer_role[NEW], + }; + + notify_connection_state(skb, seq, connection, &connection_info, type); +} + +void notify_device_state_change(struct sk_buff *skb, + unsigned int seq, + struct drbd_device_state_change *device_state_change, + enum drbd_notification_type type) +{ + struct drbd_device *device = device_state_change->device; + struct device_info device_info = { + .dev_disk_state = device_state_change->disk_state[NEW], + }; + + notify_device_state(skb, seq, device, &device_info, type); +} + +void notify_peer_device_state_change(struct sk_buff *skb, + unsigned int seq, + struct drbd_peer_device_state_change *p, + enum drbd_notification_type type) +{ + struct drbd_peer_device *peer_device = p->peer_device; + struct peer_device_info peer_device_info = { + .peer_repl_state = p->repl_state[NEW], + .peer_disk_state = p->disk_state[NEW], + .peer_resync_susp_user = p->resync_susp_user[NEW], + .peer_resync_susp_peer = p->resync_susp_peer[NEW], + .peer_resync_susp_dependency = p->resync_susp_dependency[NEW], + }; + + notify_peer_device_state(skb, seq, peer_device, &peer_device_info, type); +} + +static void broadcast_state_change(struct drbd_state_change *state_change) +{ + struct drbd_resource_state_change *resource_state_change = &state_change->resource[0]; + bool resource_state_has_changed; + unsigned int n_device, n_connection, n_peer_device, n_peer_devices; + void (*last_func)(struct sk_buff *, unsigned int, void *, + enum drbd_notification_type) = NULL; + void *uninitialized_var(last_arg); + +#define HAS_CHANGED(state) ((state)[OLD] != (state)[NEW]) +#define FINAL_STATE_CHANGE(type) \ + ({ if (last_func) \ + last_func(NULL, 0, last_arg, type); \ + }) +#define REMEMBER_STATE_CHANGE(func, arg, type) \ + ({ FINAL_STATE_CHANGE(type | NOTIFY_CONTINUES); \ + last_func = (typeof(last_func))func; \ + last_arg = arg; \ + }) + + mutex_lock(¬ification_mutex); + + resource_state_has_changed = + HAS_CHANGED(resource_state_change->role) || + HAS_CHANGED(resource_state_change->susp) || + HAS_CHANGED(resource_state_change->susp_nod) || + HAS_CHANGED(resource_state_change->susp_fen); + + if (resource_state_has_changed) + REMEMBER_STATE_CHANGE(notify_resource_state_change, + resource_state_change, NOTIFY_CHANGE); + + for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) { + struct drbd_connection_state_change *connection_state_change = + &state_change->connections[n_connection]; + + if (HAS_CHANGED(connection_state_change->peer_role) || + HAS_CHANGED(connection_state_change->cstate)) + REMEMBER_STATE_CHANGE(notify_connection_state_change, + connection_state_change, NOTIFY_CHANGE); + } + + for (n_device = 0; n_device < state_change->n_devices; n_device++) { + struct drbd_device_state_change *device_state_change = + &state_change->devices[n_device]; + + if (HAS_CHANGED(device_state_change->disk_state)) + REMEMBER_STATE_CHANGE(notify_device_state_change, + device_state_change, NOTIFY_CHANGE); + } + + n_peer_devices = state_change->n_devices * state_change->n_connections; + for (n_peer_device = 0; n_peer_device < n_peer_devices; n_peer_device++) { + struct drbd_peer_device_state_change *p = + &state_change->peer_devices[n_peer_device]; + + if (HAS_CHANGED(p->disk_state) || + HAS_CHANGED(p->repl_state) || + HAS_CHANGED(p->resync_susp_user) || + HAS_CHANGED(p->resync_susp_peer) || + HAS_CHANGED(p->resync_susp_dependency)) + REMEMBER_STATE_CHANGE(notify_peer_device_state_change, + p, NOTIFY_CHANGE); + } + + FINAL_STATE_CHANGE(NOTIFY_CHANGE); + mutex_unlock(¬ification_mutex); + +#undef HAS_CHANGED +#undef FINAL_STATE_CHANGE +#undef REMEMBER_STATE_CHANGE +} + /** * after_state_ch() - Perform after state change actions that may sleep * @device: DRBD device. @@ -1253,13 +1636,16 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device, * @flags: Flags */ static void after_state_ch(struct drbd_device *device, union drbd_state os, - union drbd_state ns, enum chg_state_flags flags) + union drbd_state ns, enum chg_state_flags flags, + struct drbd_state_change *state_change) { struct drbd_resource *resource = device->resource; struct drbd_peer_device *peer_device = first_peer_device(device); struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; struct sib_info sib; + broadcast_state_change(state_change); + sib.sib_reason = SIB_STATE_CHANGE; sib.os = os; sib.ns = ns; @@ -1377,7 +1763,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, } if (ns.pdsk < D_INCONSISTENT && get_ldev(device)) { - if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY && + if (os.peer != R_PRIMARY && ns.peer == R_PRIMARY && device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { drbd_uuid_new_current(device); drbd_send_uuids(peer_device); @@ -1444,7 +1830,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, if (os.disk != D_FAILED && ns.disk == D_FAILED) { enum drbd_io_error_p eh = EP_PASS_ON; int was_io_error = 0; - /* corresponding get_ldev was in __drbd_set_state, to serialize + /* corresponding get_ldev was in _drbd_set_state, to serialize * our cleanup here with the transition to D_DISKLESS. * But is is still not save to dreference ldev here, since * we might come from an failed Attach before ldev was set. */ @@ -1455,6 +1841,10 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, was_io_error = test_and_clear_bit(WAS_IO_ERROR, &device->flags); + /* Intentionally call this handler first, before drbd_send_state(). + * See: 2932204 drbd: call local-io-error handler early + * People may chose to hard-reset the box from this handler. + * It is useful if this looks like a "regular node crash". */ if (was_io_error && eh == EP_CALL_HELPER) drbd_khelper(device, "local-io-error"); @@ -1572,6 +1962,7 @@ struct after_conn_state_chg_work { union drbd_state ns_max; /* new, max state, over all devices */ enum chg_state_flags flags; struct drbd_connection *connection; + struct drbd_state_change *state_change; }; static int w_after_conn_state_ch(struct drbd_work *w, int unused) @@ -1584,6 +1975,8 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused) struct drbd_peer_device *peer_device; int vnr; + broadcast_state_change(acscw->state_change); + forget_state_change(acscw->state_change); kfree(acscw); /* Upon network configuration, we need to start the receiver */ @@ -1593,6 +1986,13 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused) if (oc == C_DISCONNECTING && ns_max.conn == C_STANDALONE) { struct net_conf *old_conf; + mutex_lock(¬ification_mutex); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + notify_peer_device_state(NULL, 0, peer_device, NULL, + NOTIFY_DESTROY | NOTIFY_CONTINUES); + notify_connection_state(NULL, 0, connection, NULL, NOTIFY_DESTROY); + mutex_unlock(¬ification_mutex); + mutex_lock(&connection->resource->conf_update); old_conf = connection->net_conf; connection->my_addr_len = 0; @@ -1759,7 +2159,7 @@ conn_set_state(struct drbd_connection *connection, union drbd_state mask, union if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) ns.disk = os.disk; - rv = __drbd_set_state(device, ns, flags, NULL); + rv = _drbd_set_state(device, ns, flags, NULL); if (rv < SS_SUCCESS) BUG(); @@ -1823,6 +2223,7 @@ _conn_request_state(struct drbd_connection *connection, union drbd_state mask, u enum drbd_conns oc = connection->cstate; union drbd_state ns_max, ns_min, os; bool have_mutex = false; + struct drbd_state_change *state_change; if (mask.conn) { rv = is_valid_conn_transition(oc, val.conn); @@ -1868,10 +2269,12 @@ _conn_request_state(struct drbd_connection *connection, union drbd_state mask, u goto abort; } + state_change = remember_old_state(connection->resource, GFP_ATOMIC); conn_old_common_state(connection, &os, &flags); flags |= CS_DC_SUSP; conn_set_state(connection, mask, val, &ns_min, &ns_max, flags); conn_pr_state_change(connection, os, ns_max, flags); + remember_new_state(state_change); acscw = kmalloc(sizeof(*acscw), GFP_ATOMIC); if (acscw) { @@ -1882,6 +2285,7 @@ _conn_request_state(struct drbd_connection *connection, union drbd_state mask, u acscw->w.cb = w_after_conn_state_ch; kref_get(&connection->kref); acscw->connection = connection; + acscw->state_change = state_change; drbd_queue_work(&connection->sender_work, &acscw->w); } else { drbd_err(connection, "Could not kmalloc an acscw\n"); diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h index 7f53c40823cd..bd989536f888 100644 --- a/drivers/block/drbd/drbd_state.h +++ b/drivers/block/drbd/drbd_state.h @@ -122,9 +122,9 @@ extern enum drbd_state_rv _drbd_request_state_holding_state_mutex(struct drbd_device *, union drbd_state, union drbd_state, enum chg_state_flags); -extern enum drbd_state_rv __drbd_set_state(struct drbd_device *, union drbd_state, - enum chg_state_flags, - struct completion *done); +extern enum drbd_state_rv _drbd_set_state(struct drbd_device *, union drbd_state, + enum chg_state_flags, + struct completion *done); extern void print_st_err(struct drbd_device *, union drbd_state, union drbd_state, int); diff --git a/drivers/block/drbd/drbd_state_change.h b/drivers/block/drbd/drbd_state_change.h new file mode 100644 index 000000000000..9e503a1a0bfb --- /dev/null +++ b/drivers/block/drbd/drbd_state_change.h @@ -0,0 +1,63 @@ +#ifndef DRBD_STATE_CHANGE_H +#define DRBD_STATE_CHANGE_H + +struct drbd_resource_state_change { + struct drbd_resource *resource; + enum drbd_role role[2]; + bool susp[2]; + bool susp_nod[2]; + bool susp_fen[2]; +}; + +struct drbd_device_state_change { + struct drbd_device *device; + enum drbd_disk_state disk_state[2]; +}; + +struct drbd_connection_state_change { + struct drbd_connection *connection; + enum drbd_conns cstate[2]; /* drbd9: enum drbd_conn_state */ + enum drbd_role peer_role[2]; +}; + +struct drbd_peer_device_state_change { + struct drbd_peer_device *peer_device; + enum drbd_disk_state disk_state[2]; + enum drbd_conns repl_state[2]; /* drbd9: enum drbd_repl_state */ + bool resync_susp_user[2]; + bool resync_susp_peer[2]; + bool resync_susp_dependency[2]; +}; + +struct drbd_state_change { + struct list_head list; + unsigned int n_devices; + unsigned int n_connections; + struct drbd_resource_state_change resource[1]; + struct drbd_device_state_change *devices; + struct drbd_connection_state_change *connections; + struct drbd_peer_device_state_change *peer_devices; +}; + +extern struct drbd_state_change *remember_old_state(struct drbd_resource *, gfp_t); +extern void copy_old_to_new_state_change(struct drbd_state_change *); +extern void forget_state_change(struct drbd_state_change *); + +extern void notify_resource_state_change(struct sk_buff *, + unsigned int, + struct drbd_resource_state_change *, + enum drbd_notification_type type); +extern void notify_connection_state_change(struct sk_buff *, + unsigned int, + struct drbd_connection_state_change *, + enum drbd_notification_type type); +extern void notify_device_state_change(struct sk_buff *, + unsigned int, + struct drbd_device_state_change *, + enum drbd_notification_type type); +extern void notify_peer_device_state_change(struct sk_buff *, + unsigned int, + struct drbd_peer_device_state_change *, + enum drbd_notification_type type); + +#endif /* DRBD_STATE_CHANGE_H */ diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 5578c1477ba6..eff716c27b1f 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -55,13 +55,6 @@ static int make_resync_request(struct drbd_device *, int); * */ - -/* About the global_state_lock - Each state transition on an device holds a read lock. In case we have - to evaluate the resync after dependencies, we grab a write lock, because - we need stable states on all devices for that. */ -rwlock_t global_state_lock; - /* used for synchronous meta data and bitmap IO * submitted by drbd_md_sync_page_io() */ @@ -120,6 +113,7 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l unsigned long flags = 0; struct drbd_peer_device *peer_device = peer_req->peer_device; struct drbd_device *device = peer_device->device; + struct drbd_connection *connection = peer_device->connection; struct drbd_interval i; int do_wake; u64 block_id; @@ -152,6 +146,12 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */ if (peer_req->flags & EE_WAS_ERROR) __drbd_chk_io_error(device, DRBD_WRITE_ERROR); + + if (connection->cstate >= C_WF_REPORT_PARAMS) { + kref_get(&device->kref); /* put is in drbd_send_acks_wf() */ + if (!queue_work(connection->ack_sender, &peer_device->send_acks_work)) + kref_put(&device->kref, drbd_destroy_device); + } spin_unlock_irqrestore(&device->resource->req_lock, flags); if (block_id == ID_SYNCER) @@ -163,7 +163,6 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l if (do_al_complete_io) drbd_al_complete_io(device, &i); - wake_asender(peer_device->connection); put_ldev(device); } @@ -195,6 +194,12 @@ void drbd_peer_request_endio(struct bio *bio) } } +void drbd_panic_after_delayed_completion_of_aborted_request(struct drbd_device *device) +{ + panic("drbd%u %s/%u potential random memory corruption caused by delayed completion of aborted local request\n", + device->minor, device->resource->name, device->vnr); +} + /* read, readA or write requests on R_PRIMARY coming from drbd_make_request */ void drbd_request_endio(struct bio *bio) @@ -238,7 +243,7 @@ void drbd_request_endio(struct bio *bio) drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n"); if (!bio->bi_error) - panic("possible random memory corruption caused by delayed completion of aborted local request\n"); + drbd_panic_after_delayed_completion_of_aborted_request(device); } /* to avoid recursion in __req_mod */ @@ -1291,6 +1296,7 @@ static int drbd_send_barrier(struct drbd_connection *connection) p->barrier = connection->send.current_epoch_nr; p->pad = 0; connection->send.current_epoch_writes = 0; + connection->send.last_sent_barrier_jif = jiffies; return conn_send_command(connection, sock, P_BARRIER, sizeof(*p), NULL, 0); } @@ -1315,6 +1321,7 @@ static void re_init_if_first_write(struct drbd_connection *connection, unsigned connection->send.seen_any_write_yet = true; connection->send.current_epoch_nr = epoch; connection->send.current_epoch_writes = 0; + connection->send.last_sent_barrier_jif = jiffies; } } @@ -1456,70 +1463,73 @@ static int _drbd_may_sync_now(struct drbd_device *device) } /** - * _drbd_pause_after() - Pause resync on all devices that may not resync now + * drbd_pause_after() - Pause resync on all devices that may not resync now * @device: DRBD device. * * Called from process context only (admin command and after_state_ch). */ -static int _drbd_pause_after(struct drbd_device *device) +static bool drbd_pause_after(struct drbd_device *device) { + bool changed = false; struct drbd_device *odev; - int i, rv = 0; + int i; rcu_read_lock(); idr_for_each_entry(&drbd_devices, odev, i) { if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) continue; - if (!_drbd_may_sync_now(odev)) - rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL) - != SS_NOTHING_TO_DO); + if (!_drbd_may_sync_now(odev) && + _drbd_set_state(_NS(odev, aftr_isp, 1), + CS_HARD, NULL) != SS_NOTHING_TO_DO) + changed = true; } rcu_read_unlock(); - return rv; + return changed; } /** - * _drbd_resume_next() - Resume resync on all devices that may resync now + * drbd_resume_next() - Resume resync on all devices that may resync now * @device: DRBD device. * * Called from process context only (admin command and worker). */ -static int _drbd_resume_next(struct drbd_device *device) +static bool drbd_resume_next(struct drbd_device *device) { + bool changed = false; struct drbd_device *odev; - int i, rv = 0; + int i; rcu_read_lock(); idr_for_each_entry(&drbd_devices, odev, i) { if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) continue; if (odev->state.aftr_isp) { - if (_drbd_may_sync_now(odev)) - rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0), - CS_HARD, NULL) - != SS_NOTHING_TO_DO) ; + if (_drbd_may_sync_now(odev) && + _drbd_set_state(_NS(odev, aftr_isp, 0), + CS_HARD, NULL) != SS_NOTHING_TO_DO) + changed = true; } } rcu_read_unlock(); - return rv; + return changed; } void resume_next_sg(struct drbd_device *device) { - write_lock_irq(&global_state_lock); - _drbd_resume_next(device); - write_unlock_irq(&global_state_lock); + lock_all_resources(); + drbd_resume_next(device); + unlock_all_resources(); } void suspend_other_sg(struct drbd_device *device) { - write_lock_irq(&global_state_lock); - _drbd_pause_after(device); - write_unlock_irq(&global_state_lock); + lock_all_resources(); + drbd_pause_after(device); + unlock_all_resources(); } -/* caller must hold global_state_lock */ +/* caller must lock_all_resources() */ enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor) { struct drbd_device *odev; @@ -1557,15 +1567,15 @@ enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_min } } -/* caller must hold global_state_lock */ +/* caller must lock_all_resources() */ void drbd_resync_after_changed(struct drbd_device *device) { - int changes; + int changed; do { - changes = _drbd_pause_after(device); - changes |= _drbd_resume_next(device); - } while (changes); + changed = drbd_pause_after(device); + changed |= drbd_resume_next(device); + } while (changed); } void drbd_rs_controller_reset(struct drbd_device *device) @@ -1685,19 +1695,14 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) } else { mutex_lock(device->state_mutex); } - clear_bit(B_RS_H_DONE, &device->flags); - /* req_lock: serialize with drbd_send_and_submit() and others - * global_state_lock: for stable sync-after dependencies */ - spin_lock_irq(&device->resource->req_lock); - write_lock(&global_state_lock); + lock_all_resources(); + clear_bit(B_RS_H_DONE, &device->flags); /* Did some connection breakage or IO error race with us? */ if (device->state.conn < C_CONNECTED || !get_ldev_if_state(device, D_NEGOTIATING)) { - write_unlock(&global_state_lock); - spin_unlock_irq(&device->resource->req_lock); - mutex_unlock(device->state_mutex); - return; + unlock_all_resources(); + goto out; } ns = drbd_read_state(device); @@ -1711,7 +1716,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) else /* side == C_SYNC_SOURCE */ ns.pdsk = D_INCONSISTENT; - r = __drbd_set_state(device, ns, CS_VERBOSE, NULL); + r = _drbd_set_state(device, ns, CS_VERBOSE, NULL); ns = drbd_read_state(device); if (ns.conn < C_CONNECTED) @@ -1732,7 +1737,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) device->rs_mark_left[i] = tw; device->rs_mark_time[i] = now; } - _drbd_pause_after(device); + drbd_pause_after(device); /* Forget potentially stale cached per resync extent bit-counts. * Open coded drbd_rs_cancel_all(device), we already have IRQs * disabled, and know the disk state is ok. */ @@ -1742,8 +1747,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) device->resync_wenr = LC_FREE; spin_unlock(&device->al_lock); } - write_unlock(&global_state_lock); - spin_unlock_irq(&device->resource->req_lock); + unlock_all_resources(); if (r == SS_SUCCESS) { wake_up(&device->al_wait); /* for lc_reset() above */ @@ -1807,6 +1811,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) drbd_md_sync(device); } put_ldev(device); +out: mutex_unlock(device->state_mutex); } @@ -1836,7 +1841,7 @@ static void drbd_ldev_destroy(struct drbd_device *device) device->act_log = NULL; __acquire(local); - drbd_free_ldev(device->ldev); + drbd_backing_dev_free(device, device->ldev); device->ldev = NULL; __release(local); diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 15bec407ac37..9b180dbbd03c 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -104,9 +104,9 @@ /* Device instance number, incremented each time a device is probed. */ static int instance; -struct list_head online_list; -struct list_head removing_list; -spinlock_t dev_lock; +static struct list_head online_list; +static struct list_head removing_list; +static spinlock_t dev_lock; /* * Global variable used to hold the major block device number diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 95dff91135ad..8ba1e97d573c 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -436,9 +436,8 @@ static void null_del_dev(struct nullb *nullb) static void null_lnvm_end_io(struct request *rq, int error) { struct nvm_rq *rqd = rq->end_io_data; - struct nvm_dev *dev = rqd->dev; - dev->mt->end_io(rqd, error); + nvm_end_io(rqd, error); blk_put_request(rq); } @@ -495,17 +494,17 @@ static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id) id->ppaf.ch_offset = 56; id->ppaf.ch_len = 8; - do_div(size, bs); /* convert size to pages */ - do_div(size, 256); /* concert size to pgs pr blk */ + sector_div(size, bs); /* convert size to pages */ + size >>= 8; /* concert size to pgs pr blk */ grp = &id->groups[0]; grp->mtype = 0; grp->fmtype = 0; grp->num_ch = 1; grp->num_pg = 256; blksize = size; - do_div(size, (1 << 16)); + size >>= 16; grp->num_lun = size + 1; - do_div(blksize, grp->num_lun); + sector_div(blksize, grp->num_lun); grp->num_blk = blksize; grp->num_pln = 1; diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index 59c91d49b14b..ba4bfe933276 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -23,7 +23,7 @@ #include <linux/workqueue.h> #include <linux/bitops.h> #include <linux/delay.h> -#include <linux/time.h> +#include <linux/ktime.h> #include <linux/hdreg.h> #include <linux/dma-mapping.h> #include <linux/completion.h> @@ -671,16 +671,15 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func) static unsigned int carm_fill_sync_time(struct carm_host *host, unsigned int idx, void *mem) { - struct timeval tv; struct carm_msg_sync_time *st = mem; - do_gettimeofday(&tv); + time64_t tv = ktime_get_real_seconds(); memset(st, 0, sizeof(*st)); st->type = CARM_MSG_MISC; st->subtype = MISC_SET_TIME; st->handle = cpu_to_le32(TAG_ENCODE(idx)); - st->timestamp = cpu_to_le32(tv.tv_sec); + st->timestamp = cpu_to_le32(tv); return sizeof(struct carm_msg_sync_time); } diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 41fb1a917b17..4809c1501d7e 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -84,6 +84,16 @@ MODULE_PARM_DESC(max_persistent_grants, "Maximum number of grants to map persistently"); /* + * Maximum number of rings/queues blkback supports, allow as many queues as there + * are CPUs if user has not specified a value. + */ +unsigned int xenblk_max_queues; +module_param_named(max_queues, xenblk_max_queues, uint, 0644); +MODULE_PARM_DESC(max_queues, + "Maximum number of hardware queues per virtual disk." \ + "By default it is the number of online CPUs."); + +/* * Maximum order of pages to be used for the shared ring between front and * backend, 4KB page granularity is used. */ @@ -113,71 +123,71 @@ module_param(log_stats, int, 0644); /* Number of free pages to remove on each call to gnttab_free_pages */ #define NUM_BATCH_FREE_PAGES 10 -static inline int get_free_page(struct xen_blkif *blkif, struct page **page) +static inline int get_free_page(struct xen_blkif_ring *ring, struct page **page) { unsigned long flags; - spin_lock_irqsave(&blkif->free_pages_lock, flags); - if (list_empty(&blkif->free_pages)) { - BUG_ON(blkif->free_pages_num != 0); - spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + spin_lock_irqsave(&ring->free_pages_lock, flags); + if (list_empty(&ring->free_pages)) { + BUG_ON(ring->free_pages_num != 0); + spin_unlock_irqrestore(&ring->free_pages_lock, flags); return gnttab_alloc_pages(1, page); } - BUG_ON(blkif->free_pages_num == 0); - page[0] = list_first_entry(&blkif->free_pages, struct page, lru); + BUG_ON(ring->free_pages_num == 0); + page[0] = list_first_entry(&ring->free_pages, struct page, lru); list_del(&page[0]->lru); - blkif->free_pages_num--; - spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + ring->free_pages_num--; + spin_unlock_irqrestore(&ring->free_pages_lock, flags); return 0; } -static inline void put_free_pages(struct xen_blkif *blkif, struct page **page, +static inline void put_free_pages(struct xen_blkif_ring *ring, struct page **page, int num) { unsigned long flags; int i; - spin_lock_irqsave(&blkif->free_pages_lock, flags); + spin_lock_irqsave(&ring->free_pages_lock, flags); for (i = 0; i < num; i++) - list_add(&page[i]->lru, &blkif->free_pages); - blkif->free_pages_num += num; - spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + list_add(&page[i]->lru, &ring->free_pages); + ring->free_pages_num += num; + spin_unlock_irqrestore(&ring->free_pages_lock, flags); } -static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num) +static inline void shrink_free_pagepool(struct xen_blkif_ring *ring, int num) { /* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */ struct page *page[NUM_BATCH_FREE_PAGES]; unsigned int num_pages = 0; unsigned long flags; - spin_lock_irqsave(&blkif->free_pages_lock, flags); - while (blkif->free_pages_num > num) { - BUG_ON(list_empty(&blkif->free_pages)); - page[num_pages] = list_first_entry(&blkif->free_pages, + spin_lock_irqsave(&ring->free_pages_lock, flags); + while (ring->free_pages_num > num) { + BUG_ON(list_empty(&ring->free_pages)); + page[num_pages] = list_first_entry(&ring->free_pages, struct page, lru); list_del(&page[num_pages]->lru); - blkif->free_pages_num--; + ring->free_pages_num--; if (++num_pages == NUM_BATCH_FREE_PAGES) { - spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + spin_unlock_irqrestore(&ring->free_pages_lock, flags); gnttab_free_pages(num_pages, page); - spin_lock_irqsave(&blkif->free_pages_lock, flags); + spin_lock_irqsave(&ring->free_pages_lock, flags); num_pages = 0; } } - spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + spin_unlock_irqrestore(&ring->free_pages_lock, flags); if (num_pages != 0) gnttab_free_pages(num_pages, page); } #define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page))) -static int do_block_io_op(struct xen_blkif *blkif); -static int dispatch_rw_block_io(struct xen_blkif *blkif, +static int do_block_io_op(struct xen_blkif_ring *ring); +static int dispatch_rw_block_io(struct xen_blkif_ring *ring, struct blkif_request *req, struct pending_req *pending_req); -static void make_response(struct xen_blkif *blkif, u64 id, +static void make_response(struct xen_blkif_ring *ring, u64 id, unsigned short op, int st); #define foreach_grant_safe(pos, n, rbtree, node) \ @@ -190,7 +200,7 @@ static void make_response(struct xen_blkif *blkif, u64 id, /* * We don't need locking around the persistent grant helpers - * because blkback uses a single-thread for each backed, so we + * because blkback uses a single-thread for each backend, so we * can be sure that this functions will never be called recursively. * * The only exception to that is put_persistent_grant, that can be called @@ -198,19 +208,20 @@ static void make_response(struct xen_blkif *blkif, u64 id, * bit operations to modify the flags of a persistent grant and to count * the number of used grants. */ -static int add_persistent_gnt(struct xen_blkif *blkif, +static int add_persistent_gnt(struct xen_blkif_ring *ring, struct persistent_gnt *persistent_gnt) { struct rb_node **new = NULL, *parent = NULL; struct persistent_gnt *this; + struct xen_blkif *blkif = ring->blkif; - if (blkif->persistent_gnt_c >= xen_blkif_max_pgrants) { + if (ring->persistent_gnt_c >= xen_blkif_max_pgrants) { if (!blkif->vbd.overflow_max_grants) blkif->vbd.overflow_max_grants = 1; return -EBUSY; } /* Figure out where to put new node */ - new = &blkif->persistent_gnts.rb_node; + new = &ring->persistent_gnts.rb_node; while (*new) { this = container_of(*new, struct persistent_gnt, node); @@ -229,19 +240,19 @@ static int add_persistent_gnt(struct xen_blkif *blkif, set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags); /* Add new node and rebalance tree. */ rb_link_node(&(persistent_gnt->node), parent, new); - rb_insert_color(&(persistent_gnt->node), &blkif->persistent_gnts); - blkif->persistent_gnt_c++; - atomic_inc(&blkif->persistent_gnt_in_use); + rb_insert_color(&(persistent_gnt->node), &ring->persistent_gnts); + ring->persistent_gnt_c++; + atomic_inc(&ring->persistent_gnt_in_use); return 0; } -static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif, +static struct persistent_gnt *get_persistent_gnt(struct xen_blkif_ring *ring, grant_ref_t gref) { struct persistent_gnt *data; struct rb_node *node = NULL; - node = blkif->persistent_gnts.rb_node; + node = ring->persistent_gnts.rb_node; while (node) { data = container_of(node, struct persistent_gnt, node); @@ -255,24 +266,24 @@ static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif, return NULL; } set_bit(PERSISTENT_GNT_ACTIVE, data->flags); - atomic_inc(&blkif->persistent_gnt_in_use); + atomic_inc(&ring->persistent_gnt_in_use); return data; } } return NULL; } -static void put_persistent_gnt(struct xen_blkif *blkif, +static void put_persistent_gnt(struct xen_blkif_ring *ring, struct persistent_gnt *persistent_gnt) { if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags)) pr_alert_ratelimited("freeing a grant already unused\n"); set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags); clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags); - atomic_dec(&blkif->persistent_gnt_in_use); + atomic_dec(&ring->persistent_gnt_in_use); } -static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root, +static void free_persistent_gnts(struct xen_blkif_ring *ring, struct rb_root *root, unsigned int num) { struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; @@ -303,7 +314,7 @@ static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root, unmap_data.count = segs_to_unmap; BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); - put_free_pages(blkif, pages, segs_to_unmap); + put_free_pages(ring, pages, segs_to_unmap); segs_to_unmap = 0; } @@ -320,15 +331,15 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work) struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; struct persistent_gnt *persistent_gnt; int segs_to_unmap = 0; - struct xen_blkif *blkif = container_of(work, typeof(*blkif), persistent_purge_work); + struct xen_blkif_ring *ring = container_of(work, typeof(*ring), persistent_purge_work); struct gntab_unmap_queue_data unmap_data; unmap_data.pages = pages; unmap_data.unmap_ops = unmap; unmap_data.kunmap_ops = NULL; - while(!list_empty(&blkif->persistent_purge_list)) { - persistent_gnt = list_first_entry(&blkif->persistent_purge_list, + while(!list_empty(&ring->persistent_purge_list)) { + persistent_gnt = list_first_entry(&ring->persistent_purge_list, struct persistent_gnt, remove_node); list_del(&persistent_gnt->remove_node); @@ -343,7 +354,7 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work) if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) { unmap_data.count = segs_to_unmap; BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); - put_free_pages(blkif, pages, segs_to_unmap); + put_free_pages(ring, pages, segs_to_unmap); segs_to_unmap = 0; } kfree(persistent_gnt); @@ -351,11 +362,11 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work) if (segs_to_unmap > 0) { unmap_data.count = segs_to_unmap; BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); - put_free_pages(blkif, pages, segs_to_unmap); + put_free_pages(ring, pages, segs_to_unmap); } } -static void purge_persistent_gnt(struct xen_blkif *blkif) +static void purge_persistent_gnt(struct xen_blkif_ring *ring) { struct persistent_gnt *persistent_gnt; struct rb_node *n; @@ -363,23 +374,23 @@ static void purge_persistent_gnt(struct xen_blkif *blkif) bool scan_used = false, clean_used = false; struct rb_root *root; - if (blkif->persistent_gnt_c < xen_blkif_max_pgrants || - (blkif->persistent_gnt_c == xen_blkif_max_pgrants && - !blkif->vbd.overflow_max_grants)) { - return; + if (ring->persistent_gnt_c < xen_blkif_max_pgrants || + (ring->persistent_gnt_c == xen_blkif_max_pgrants && + !ring->blkif->vbd.overflow_max_grants)) { + goto out; } - if (work_busy(&blkif->persistent_purge_work)) { + if (work_busy(&ring->persistent_purge_work)) { pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n"); - return; + goto out; } num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN; - num_clean = blkif->persistent_gnt_c - xen_blkif_max_pgrants + num_clean; - num_clean = min(blkif->persistent_gnt_c, num_clean); + num_clean = ring->persistent_gnt_c - xen_blkif_max_pgrants + num_clean; + num_clean = min(ring->persistent_gnt_c, num_clean); if ((num_clean == 0) || - (num_clean > (blkif->persistent_gnt_c - atomic_read(&blkif->persistent_gnt_in_use)))) - return; + (num_clean > (ring->persistent_gnt_c - atomic_read(&ring->persistent_gnt_in_use)))) + goto out; /* * At this point, we can assure that there will be no calls @@ -394,8 +405,8 @@ static void purge_persistent_gnt(struct xen_blkif *blkif) pr_debug("Going to purge %u persistent grants\n", num_clean); - BUG_ON(!list_empty(&blkif->persistent_purge_list)); - root = &blkif->persistent_gnts; + BUG_ON(!list_empty(&ring->persistent_purge_list)); + root = &ring->persistent_gnts; purge_list: foreach_grant_safe(persistent_gnt, n, root, node) { BUG_ON(persistent_gnt->handle == @@ -414,7 +425,7 @@ purge_list: rb_erase(&persistent_gnt->node, root); list_add(&persistent_gnt->remove_node, - &blkif->persistent_purge_list); + &ring->persistent_purge_list); if (--num_clean == 0) goto finished; } @@ -435,30 +446,32 @@ finished: goto purge_list; } - blkif->persistent_gnt_c -= (total - num_clean); - blkif->vbd.overflow_max_grants = 0; + ring->persistent_gnt_c -= (total - num_clean); + ring->blkif->vbd.overflow_max_grants = 0; /* We can defer this work */ - schedule_work(&blkif->persistent_purge_work); + schedule_work(&ring->persistent_purge_work); pr_debug("Purged %u/%u\n", (total - num_clean), total); + +out: return; } /* * Retrieve from the 'pending_reqs' a free pending_req structure to be used. */ -static struct pending_req *alloc_req(struct xen_blkif *blkif) +static struct pending_req *alloc_req(struct xen_blkif_ring *ring) { struct pending_req *req = NULL; unsigned long flags; - spin_lock_irqsave(&blkif->pending_free_lock, flags); - if (!list_empty(&blkif->pending_free)) { - req = list_entry(blkif->pending_free.next, struct pending_req, + spin_lock_irqsave(&ring->pending_free_lock, flags); + if (!list_empty(&ring->pending_free)) { + req = list_entry(ring->pending_free.next, struct pending_req, free_list); list_del(&req->free_list); } - spin_unlock_irqrestore(&blkif->pending_free_lock, flags); + spin_unlock_irqrestore(&ring->pending_free_lock, flags); return req; } @@ -466,17 +479,17 @@ static struct pending_req *alloc_req(struct xen_blkif *blkif) * Return the 'pending_req' structure back to the freepool. We also * wake up the thread if it was waiting for a free page. */ -static void free_req(struct xen_blkif *blkif, struct pending_req *req) +static void free_req(struct xen_blkif_ring *ring, struct pending_req *req) { unsigned long flags; int was_empty; - spin_lock_irqsave(&blkif->pending_free_lock, flags); - was_empty = list_empty(&blkif->pending_free); - list_add(&req->free_list, &blkif->pending_free); - spin_unlock_irqrestore(&blkif->pending_free_lock, flags); + spin_lock_irqsave(&ring->pending_free_lock, flags); + was_empty = list_empty(&ring->pending_free); + list_add(&req->free_list, &ring->pending_free); + spin_unlock_irqrestore(&ring->pending_free_lock, flags); if (was_empty) - wake_up(&blkif->pending_free_wq); + wake_up(&ring->pending_free_wq); } /* @@ -556,10 +569,10 @@ abort: /* * Notification from the guest OS. */ -static void blkif_notify_work(struct xen_blkif *blkif) +static void blkif_notify_work(struct xen_blkif_ring *ring) { - blkif->waiting_reqs = 1; - wake_up(&blkif->wq); + ring->waiting_reqs = 1; + wake_up(&ring->wq); } irqreturn_t xen_blkif_be_int(int irq, void *dev_id) @@ -572,31 +585,33 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id) * SCHEDULER FUNCTIONS */ -static void print_stats(struct xen_blkif *blkif) +static void print_stats(struct xen_blkif_ring *ring) { pr_info("(%s): oo %3llu | rd %4llu | wr %4llu | f %4llu" " | ds %4llu | pg: %4u/%4d\n", - current->comm, blkif->st_oo_req, - blkif->st_rd_req, blkif->st_wr_req, - blkif->st_f_req, blkif->st_ds_req, - blkif->persistent_gnt_c, + current->comm, ring->st_oo_req, + ring->st_rd_req, ring->st_wr_req, + ring->st_f_req, ring->st_ds_req, + ring->persistent_gnt_c, xen_blkif_max_pgrants); - blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); - blkif->st_rd_req = 0; - blkif->st_wr_req = 0; - blkif->st_oo_req = 0; - blkif->st_ds_req = 0; + ring->st_print = jiffies + msecs_to_jiffies(10 * 1000); + ring->st_rd_req = 0; + ring->st_wr_req = 0; + ring->st_oo_req = 0; + ring->st_ds_req = 0; } int xen_blkif_schedule(void *arg) { - struct xen_blkif *blkif = arg; + struct xen_blkif_ring *ring = arg; + struct xen_blkif *blkif = ring->blkif; struct xen_vbd *vbd = &blkif->vbd; unsigned long timeout; int ret; xen_blkif_get(blkif); + set_freezable(); while (!kthread_should_stop()) { if (try_to_freeze()) continue; @@ -606,50 +621,50 @@ int xen_blkif_schedule(void *arg) timeout = msecs_to_jiffies(LRU_INTERVAL); timeout = wait_event_interruptible_timeout( - blkif->wq, - blkif->waiting_reqs || kthread_should_stop(), + ring->wq, + ring->waiting_reqs || kthread_should_stop(), timeout); if (timeout == 0) goto purge_gnt_list; timeout = wait_event_interruptible_timeout( - blkif->pending_free_wq, - !list_empty(&blkif->pending_free) || + ring->pending_free_wq, + !list_empty(&ring->pending_free) || kthread_should_stop(), timeout); if (timeout == 0) goto purge_gnt_list; - blkif->waiting_reqs = 0; + ring->waiting_reqs = 0; smp_mb(); /* clear flag *before* checking for work */ - ret = do_block_io_op(blkif); + ret = do_block_io_op(ring); if (ret > 0) - blkif->waiting_reqs = 1; + ring->waiting_reqs = 1; if (ret == -EACCES) - wait_event_interruptible(blkif->shutdown_wq, + wait_event_interruptible(ring->shutdown_wq, kthread_should_stop()); purge_gnt_list: if (blkif->vbd.feature_gnt_persistent && - time_after(jiffies, blkif->next_lru)) { - purge_persistent_gnt(blkif); - blkif->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL); + time_after(jiffies, ring->next_lru)) { + purge_persistent_gnt(ring); + ring->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL); } /* Shrink if we have more than xen_blkif_max_buffer_pages */ - shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages); + shrink_free_pagepool(ring, xen_blkif_max_buffer_pages); - if (log_stats && time_after(jiffies, blkif->st_print)) - print_stats(blkif); + if (log_stats && time_after(jiffies, ring->st_print)) + print_stats(ring); } /* Drain pending purge work */ - flush_work(&blkif->persistent_purge_work); + flush_work(&ring->persistent_purge_work); if (log_stats) - print_stats(blkif); + print_stats(ring); - blkif->xenblkd = NULL; + ring->xenblkd = NULL; xen_blkif_put(blkif); return 0; @@ -658,22 +673,22 @@ purge_gnt_list: /* * Remove persistent grants and empty the pool of free pages */ -void xen_blkbk_free_caches(struct xen_blkif *blkif) +void xen_blkbk_free_caches(struct xen_blkif_ring *ring) { /* Free all persistent grant pages */ - if (!RB_EMPTY_ROOT(&blkif->persistent_gnts)) - free_persistent_gnts(blkif, &blkif->persistent_gnts, - blkif->persistent_gnt_c); + if (!RB_EMPTY_ROOT(&ring->persistent_gnts)) + free_persistent_gnts(ring, &ring->persistent_gnts, + ring->persistent_gnt_c); - BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); - blkif->persistent_gnt_c = 0; + BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts)); + ring->persistent_gnt_c = 0; /* Since we are shutting down remove all pages from the buffer */ - shrink_free_pagepool(blkif, 0 /* All */); + shrink_free_pagepool(ring, 0 /* All */); } static unsigned int xen_blkbk_unmap_prepare( - struct xen_blkif *blkif, + struct xen_blkif_ring *ring, struct grant_page **pages, unsigned int num, struct gnttab_unmap_grant_ref *unmap_ops, @@ -683,7 +698,7 @@ static unsigned int xen_blkbk_unmap_prepare( for (i = 0; i < num; i++) { if (pages[i]->persistent_gnt != NULL) { - put_persistent_gnt(blkif, pages[i]->persistent_gnt); + put_persistent_gnt(ring, pages[i]->persistent_gnt); continue; } if (pages[i]->handle == BLKBACK_INVALID_HANDLE) @@ -700,17 +715,18 @@ static unsigned int xen_blkbk_unmap_prepare( static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_queue_data *data) { - struct pending_req* pending_req = (struct pending_req*) (data->data); - struct xen_blkif *blkif = pending_req->blkif; + struct pending_req *pending_req = (struct pending_req *)(data->data); + struct xen_blkif_ring *ring = pending_req->ring; + struct xen_blkif *blkif = ring->blkif; /* BUG_ON used to reproduce existing behaviour, but is this the best way to deal with this? */ BUG_ON(result); - put_free_pages(blkif, data->pages, data->count); - make_response(blkif, pending_req->id, + put_free_pages(ring, data->pages, data->count); + make_response(ring, pending_req->id, pending_req->operation, pending_req->status); - free_req(blkif, pending_req); + free_req(ring, pending_req); /* * Make sure the request is freed before releasing blkif, * or there could be a race between free_req and the @@ -723,7 +739,7 @@ static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_ * pending_free_wq if there's a drain going on, but it has * to be taken into account if the current model is changed. */ - if (atomic_dec_and_test(&blkif->inflight) && atomic_read(&blkif->drain)) { + if (atomic_dec_and_test(&ring->inflight) && atomic_read(&blkif->drain)) { complete(&blkif->drain_complete); } xen_blkif_put(blkif); @@ -732,11 +748,11 @@ static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_ static void xen_blkbk_unmap_and_respond(struct pending_req *req) { struct gntab_unmap_queue_data* work = &req->gnttab_unmap_data; - struct xen_blkif *blkif = req->blkif; + struct xen_blkif_ring *ring = req->ring; struct grant_page **pages = req->segments; unsigned int invcount; - invcount = xen_blkbk_unmap_prepare(blkif, pages, req->nr_segs, + invcount = xen_blkbk_unmap_prepare(ring, pages, req->nr_segs, req->unmap, req->unmap_pages); work->data = req; @@ -757,7 +773,7 @@ static void xen_blkbk_unmap_and_respond(struct pending_req *req) * of hypercalls, but since this is only used in error paths there's * no real need. */ -static void xen_blkbk_unmap(struct xen_blkif *blkif, +static void xen_blkbk_unmap(struct xen_blkif_ring *ring, struct grant_page *pages[], int num) { @@ -768,20 +784,20 @@ static void xen_blkbk_unmap(struct xen_blkif *blkif, while (num) { unsigned int batch = min(num, BLKIF_MAX_SEGMENTS_PER_REQUEST); - - invcount = xen_blkbk_unmap_prepare(blkif, pages, batch, + + invcount = xen_blkbk_unmap_prepare(ring, pages, batch, unmap, unmap_pages); if (invcount) { ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount); BUG_ON(ret); - put_free_pages(blkif, unmap_pages, invcount); + put_free_pages(ring, unmap_pages, invcount); } pages += batch; num -= batch; } } -static int xen_blkbk_map(struct xen_blkif *blkif, +static int xen_blkbk_map(struct xen_blkif_ring *ring, struct grant_page *pages[], int num, bool ro) { @@ -794,6 +810,7 @@ static int xen_blkbk_map(struct xen_blkif *blkif, int ret = 0; int last_map = 0, map_until = 0; int use_persistent_gnts; + struct xen_blkif *blkif = ring->blkif; use_persistent_gnts = (blkif->vbd.feature_gnt_persistent); @@ -806,10 +823,11 @@ again: for (i = map_until; i < num; i++) { uint32_t flags; - if (use_persistent_gnts) + if (use_persistent_gnts) { persistent_gnt = get_persistent_gnt( - blkif, + ring, pages[i]->gref); + } if (persistent_gnt) { /* @@ -819,7 +837,7 @@ again: pages[i]->page = persistent_gnt->page; pages[i]->persistent_gnt = persistent_gnt; } else { - if (get_free_page(blkif, &pages[i]->page)) + if (get_free_page(ring, &pages[i]->page)) goto out_of_memory; addr = vaddr(pages[i]->page); pages_to_gnt[segs_to_map] = pages[i]->page; @@ -852,7 +870,7 @@ again: BUG_ON(new_map_idx >= segs_to_map); if (unlikely(map[new_map_idx].status != 0)) { pr_debug("invalid buffer -- could not remap it\n"); - put_free_pages(blkif, &pages[seg_idx]->page, 1); + put_free_pages(ring, &pages[seg_idx]->page, 1); pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE; ret |= 1; goto next; @@ -862,7 +880,7 @@ again: continue; } if (use_persistent_gnts && - blkif->persistent_gnt_c < xen_blkif_max_pgrants) { + ring->persistent_gnt_c < xen_blkif_max_pgrants) { /* * We are using persistent grants, the grant is * not mapped but we might have room for it. @@ -880,7 +898,7 @@ again: persistent_gnt->gnt = map[new_map_idx].ref; persistent_gnt->handle = map[new_map_idx].handle; persistent_gnt->page = pages[seg_idx]->page; - if (add_persistent_gnt(blkif, + if (add_persistent_gnt(ring, persistent_gnt)) { kfree(persistent_gnt); persistent_gnt = NULL; @@ -888,7 +906,7 @@ again: } pages[seg_idx]->persistent_gnt = persistent_gnt; pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n", - persistent_gnt->gnt, blkif->persistent_gnt_c, + persistent_gnt->gnt, ring->persistent_gnt_c, xen_blkif_max_pgrants); goto next; } @@ -913,7 +931,7 @@ next: out_of_memory: pr_alert("%s: out of memory\n", __func__); - put_free_pages(blkif, pages_to_gnt, segs_to_map); + put_free_pages(ring, pages_to_gnt, segs_to_map); return -ENOMEM; } @@ -921,7 +939,7 @@ static int xen_blkbk_map_seg(struct pending_req *pending_req) { int rc; - rc = xen_blkbk_map(pending_req->blkif, pending_req->segments, + rc = xen_blkbk_map(pending_req->ring, pending_req->segments, pending_req->nr_segs, (pending_req->operation != BLKIF_OP_READ)); @@ -934,7 +952,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req, struct phys_req *preq) { struct grant_page **pages = pending_req->indirect_pages; - struct xen_blkif *blkif = pending_req->blkif; + struct xen_blkif_ring *ring = pending_req->ring; int indirect_grefs, rc, n, nseg, i; struct blkif_request_segment *segments = NULL; @@ -945,7 +963,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req, for (i = 0; i < indirect_grefs; i++) pages[i]->gref = req->u.indirect.indirect_grefs[i]; - rc = xen_blkbk_map(blkif, pages, indirect_grefs, true); + rc = xen_blkbk_map(ring, pages, indirect_grefs, true); if (rc) goto unmap; @@ -977,15 +995,16 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req, unmap: if (segments) kunmap_atomic(segments); - xen_blkbk_unmap(blkif, pages, indirect_grefs); + xen_blkbk_unmap(ring, pages, indirect_grefs); return rc; } -static int dispatch_discard_io(struct xen_blkif *blkif, +static int dispatch_discard_io(struct xen_blkif_ring *ring, struct blkif_request *req) { int err = 0; int status = BLKIF_RSP_OKAY; + struct xen_blkif *blkif = ring->blkif; struct block_device *bdev = blkif->vbd.bdev; unsigned long secure; struct phys_req preq; @@ -1002,7 +1021,7 @@ static int dispatch_discard_io(struct xen_blkif *blkif, preq.sector_number + preq.nr_sects, blkif->vbd.pdevice); goto fail_response; } - blkif->st_ds_req++; + ring->st_ds_req++; secure = (blkif->vbd.discard_secure && (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ? @@ -1018,26 +1037,28 @@ fail_response: } else if (err) status = BLKIF_RSP_ERROR; - make_response(blkif, req->u.discard.id, req->operation, status); + make_response(ring, req->u.discard.id, req->operation, status); xen_blkif_put(blkif); return err; } -static int dispatch_other_io(struct xen_blkif *blkif, +static int dispatch_other_io(struct xen_blkif_ring *ring, struct blkif_request *req, struct pending_req *pending_req) { - free_req(blkif, pending_req); - make_response(blkif, req->u.other.id, req->operation, + free_req(ring, pending_req); + make_response(ring, req->u.other.id, req->operation, BLKIF_RSP_EOPNOTSUPP); return -EIO; } -static void xen_blk_drain_io(struct xen_blkif *blkif) +static void xen_blk_drain_io(struct xen_blkif_ring *ring) { + struct xen_blkif *blkif = ring->blkif; + atomic_set(&blkif->drain, 1); do { - if (atomic_read(&blkif->inflight) == 0) + if (atomic_read(&ring->inflight) == 0) break; wait_for_completion_interruptible_timeout( &blkif->drain_complete, HZ); @@ -1058,12 +1079,12 @@ static void __end_block_io_op(struct pending_req *pending_req, int error) if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) && (error == -EOPNOTSUPP)) { pr_debug("flush diskcache op failed, not supported\n"); - xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0); + xen_blkbk_flush_diskcache(XBT_NIL, pending_req->ring->blkif->be, 0); pending_req->status = BLKIF_RSP_EOPNOTSUPP; } else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) && (error == -EOPNOTSUPP)) { pr_debug("write barrier op failed, not supported\n"); - xen_blkbk_barrier(XBT_NIL, pending_req->blkif->be, 0); + xen_blkbk_barrier(XBT_NIL, pending_req->ring->blkif->be, 0); pending_req->status = BLKIF_RSP_EOPNOTSUPP; } else if (error) { pr_debug("Buffer not up-to-date at end of operation," @@ -1097,9 +1118,9 @@ static void end_block_io_op(struct bio *bio) * and transmute it to the block API to hand it over to the proper block disk. */ static int -__do_block_io_op(struct xen_blkif *blkif) +__do_block_io_op(struct xen_blkif_ring *ring) { - union blkif_back_rings *blk_rings = &blkif->blk_rings; + union blkif_back_rings *blk_rings = &ring->blk_rings; struct blkif_request req; struct pending_req *pending_req; RING_IDX rc, rp; @@ -1112,7 +1133,7 @@ __do_block_io_op(struct xen_blkif *blkif) if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) { rc = blk_rings->common.rsp_prod_pvt; pr_warn("Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n", - rp, rc, rp - rc, blkif->vbd.pdevice); + rp, rc, rp - rc, ring->blkif->vbd.pdevice); return -EACCES; } while (rc != rp) { @@ -1125,14 +1146,14 @@ __do_block_io_op(struct xen_blkif *blkif) break; } - pending_req = alloc_req(blkif); + pending_req = alloc_req(ring); if (NULL == pending_req) { - blkif->st_oo_req++; + ring->st_oo_req++; more_to_do = 1; break; } - switch (blkif->blk_protocol) { + switch (ring->blkif->blk_protocol) { case BLKIF_PROTOCOL_NATIVE: memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req)); break; @@ -1156,16 +1177,16 @@ __do_block_io_op(struct xen_blkif *blkif) case BLKIF_OP_WRITE_BARRIER: case BLKIF_OP_FLUSH_DISKCACHE: case BLKIF_OP_INDIRECT: - if (dispatch_rw_block_io(blkif, &req, pending_req)) + if (dispatch_rw_block_io(ring, &req, pending_req)) goto done; break; case BLKIF_OP_DISCARD: - free_req(blkif, pending_req); - if (dispatch_discard_io(blkif, &req)) + free_req(ring, pending_req); + if (dispatch_discard_io(ring, &req)) goto done; break; default: - if (dispatch_other_io(blkif, &req, pending_req)) + if (dispatch_other_io(ring, &req, pending_req)) goto done; break; } @@ -1178,13 +1199,13 @@ done: } static int -do_block_io_op(struct xen_blkif *blkif) +do_block_io_op(struct xen_blkif_ring *ring) { - union blkif_back_rings *blk_rings = &blkif->blk_rings; + union blkif_back_rings *blk_rings = &ring->blk_rings; int more_to_do; do { - more_to_do = __do_block_io_op(blkif); + more_to_do = __do_block_io_op(ring); if (more_to_do) break; @@ -1197,7 +1218,7 @@ do_block_io_op(struct xen_blkif *blkif) * Transmutation of the 'struct blkif_request' to a proper 'struct bio' * and call the 'submit_bio' to pass it to the underlying storage. */ -static int dispatch_rw_block_io(struct xen_blkif *blkif, +static int dispatch_rw_block_io(struct xen_blkif_ring *ring, struct blkif_request *req, struct pending_req *pending_req) { @@ -1225,17 +1246,17 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, switch (req_operation) { case BLKIF_OP_READ: - blkif->st_rd_req++; + ring->st_rd_req++; operation = READ; break; case BLKIF_OP_WRITE: - blkif->st_wr_req++; + ring->st_wr_req++; operation = WRITE_ODIRECT; break; case BLKIF_OP_WRITE_BARRIER: drain = true; case BLKIF_OP_FLUSH_DISKCACHE: - blkif->st_f_req++; + ring->st_f_req++; operation = WRITE_FLUSH; break; default: @@ -1260,7 +1281,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, preq.nr_sects = 0; - pending_req->blkif = blkif; + pending_req->ring = ring; pending_req->id = req->u.rw.id; pending_req->operation = req_operation; pending_req->status = BLKIF_RSP_OKAY; @@ -1287,12 +1308,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, goto fail_response; } - if (xen_vbd_translate(&preq, blkif, operation) != 0) { + if (xen_vbd_translate(&preq, ring->blkif, operation) != 0) { pr_debug("access denied: %s of [%llu,%llu] on dev=%04x\n", operation == READ ? "read" : "write", preq.sector_number, preq.sector_number + preq.nr_sects, - blkif->vbd.pdevice); + ring->blkif->vbd.pdevice); goto fail_response; } @@ -1304,7 +1325,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, if (((int)preq.sector_number|(int)seg[i].nsec) & ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) { pr_debug("Misaligned I/O request from domain %d\n", - blkif->domid); + ring->blkif->domid); goto fail_response; } } @@ -1313,7 +1334,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, * issue the WRITE_FLUSH. */ if (drain) - xen_blk_drain_io(pending_req->blkif); + xen_blk_drain_io(pending_req->ring); /* * If we have failed at this point, we need to undo the M2P override, @@ -1328,8 +1349,8 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, * This corresponding xen_blkif_put is done in __end_block_io_op, or * below (in "!bio") if we are handling a BLKIF_OP_DISCARD. */ - xen_blkif_get(blkif); - atomic_inc(&blkif->inflight); + xen_blkif_get(ring->blkif); + atomic_inc(&ring->inflight); for (i = 0; i < nseg; i++) { while ((bio == NULL) || @@ -1377,19 +1398,19 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, blk_finish_plug(&plug); if (operation == READ) - blkif->st_rd_sect += preq.nr_sects; + ring->st_rd_sect += preq.nr_sects; else if (operation & WRITE) - blkif->st_wr_sect += preq.nr_sects; + ring->st_wr_sect += preq.nr_sects; return 0; fail_flush: - xen_blkbk_unmap(blkif, pending_req->segments, + xen_blkbk_unmap(ring, pending_req->segments, pending_req->nr_segs); fail_response: /* Haven't submitted any bio's yet. */ - make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR); - free_req(blkif, pending_req); + make_response(ring, req->u.rw.id, req_operation, BLKIF_RSP_ERROR); + free_req(ring, pending_req); msleep(1); /* back off a bit */ return -EIO; @@ -1407,21 +1428,22 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, /* * Put a response on the ring on how the operation fared. */ -static void make_response(struct xen_blkif *blkif, u64 id, +static void make_response(struct xen_blkif_ring *ring, u64 id, unsigned short op, int st) { struct blkif_response resp; unsigned long flags; - union blkif_back_rings *blk_rings = &blkif->blk_rings; + union blkif_back_rings *blk_rings; int notify; resp.id = id; resp.operation = op; resp.status = st; - spin_lock_irqsave(&blkif->blk_ring_lock, flags); + spin_lock_irqsave(&ring->blk_ring_lock, flags); + blk_rings = &ring->blk_rings; /* Place on the response ring for the relevant domain. */ - switch (blkif->blk_protocol) { + switch (ring->blkif->blk_protocol) { case BLKIF_PROTOCOL_NATIVE: memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt), &resp, sizeof(resp)); @@ -1439,9 +1461,9 @@ static void make_response(struct xen_blkif *blkif, u64 id, } blk_rings->common.rsp_prod_pvt++; RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify); - spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); + spin_unlock_irqrestore(&ring->blk_ring_lock, flags); if (notify) - notify_remote_via_irq(blkif->irq); + notify_remote_via_irq(ring->irq); } static int __init xen_blkif_init(void) @@ -1457,6 +1479,9 @@ static int __init xen_blkif_init(void) xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER; } + if (xenblk_max_queues == 0) + xenblk_max_queues = num_online_cpus(); + rc = xen_blkif_interface_init(); if (rc) goto failed_init; diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index c929ae22764c..dea61f6ab8cb 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -46,6 +46,7 @@ #include <xen/interface/io/protocols.h> extern unsigned int xen_blkif_max_ring_order; +extern unsigned int xenblk_max_queues; /* * This is the maximum number of segments that would be allowed in indirect * requests. This value will also be passed to the frontend. @@ -269,68 +270,79 @@ struct persistent_gnt { struct list_head remove_node; }; -struct xen_blkif { - /* Unique identifier for this interface. */ - domid_t domid; - unsigned int handle; +/* Per-ring information. */ +struct xen_blkif_ring { /* Physical parameters of the comms window. */ unsigned int irq; - /* Comms information. */ - enum blkif_protocol blk_protocol; union blkif_back_rings blk_rings; void *blk_ring; - /* The VBD attached to this interface. */ - struct xen_vbd vbd; - /* Back pointer to the backend_info. */ - struct backend_info *be; /* Private fields. */ spinlock_t blk_ring_lock; - atomic_t refcnt; wait_queue_head_t wq; - /* for barrier (drain) requests */ - struct completion drain_complete; - atomic_t drain; atomic_t inflight; - /* One thread per one blkif. */ + /* One thread per blkif ring. */ struct task_struct *xenblkd; unsigned int waiting_reqs; - /* tree to store persistent grants */ + /* List of all 'pending_req' available */ + struct list_head pending_free; + /* And its spinlock. */ + spinlock_t pending_free_lock; + wait_queue_head_t pending_free_wq; + + /* Tree to store persistent grants. */ + spinlock_t pers_gnts_lock; struct rb_root persistent_gnts; unsigned int persistent_gnt_c; atomic_t persistent_gnt_in_use; unsigned long next_lru; - /* used by the kworker that offload work from the persistent purge */ + /* Statistics. */ + unsigned long st_print; + unsigned long long st_rd_req; + unsigned long long st_wr_req; + unsigned long long st_oo_req; + unsigned long long st_f_req; + unsigned long long st_ds_req; + unsigned long long st_rd_sect; + unsigned long long st_wr_sect; + + /* Used by the kworker that offload work from the persistent purge. */ struct list_head persistent_purge_list; struct work_struct persistent_purge_work; - /* buffer of free pages to map grant refs */ + /* Buffer of free pages to map grant refs. */ spinlock_t free_pages_lock; int free_pages_num; struct list_head free_pages; - /* List of all 'pending_req' available */ - struct list_head pending_free; - /* And its spinlock. */ - spinlock_t pending_free_lock; - wait_queue_head_t pending_free_wq; - - /* statistics */ - unsigned long st_print; - unsigned long long st_rd_req; - unsigned long long st_wr_req; - unsigned long long st_oo_req; - unsigned long long st_f_req; - unsigned long long st_ds_req; - unsigned long long st_rd_sect; - unsigned long long st_wr_sect; - struct work_struct free_work; /* Thread shutdown wait queue. */ wait_queue_head_t shutdown_wq; - unsigned int nr_ring_pages; + struct xen_blkif *blkif; +}; + +struct xen_blkif { + /* Unique identifier for this interface. */ + domid_t domid; + unsigned int handle; + /* Comms information. */ + enum blkif_protocol blk_protocol; + /* The VBD attached to this interface. */ + struct xen_vbd vbd; + /* Back pointer to the backend_info. */ + struct backend_info *be; + atomic_t refcnt; + /* for barrier (drain) requests */ + struct completion drain_complete; + atomic_t drain; + + struct work_struct free_work; + unsigned int nr_ring_pages; + /* All rings for this device. */ + struct xen_blkif_ring *rings; + unsigned int nr_rings; }; struct seg_buf { @@ -352,7 +364,7 @@ struct grant_page { * response queued for it, with the saved 'id' passed back. */ struct pending_req { - struct xen_blkif *blkif; + struct xen_blkif_ring *ring; u64 id; int nr_segs; atomic_t pendcnt; @@ -394,7 +406,7 @@ int xen_blkif_xenbus_init(void); irqreturn_t xen_blkif_be_int(int irq, void *dev_id); int xen_blkif_schedule(void *arg); int xen_blkif_purge_persistent(void *arg); -void xen_blkbk_free_caches(struct xen_blkif *blkif); +void xen_blkbk_free_caches(struct xen_blkif_ring *ring); int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, struct backend_info *be, int state); diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index f53cff42f8da..876763f7f13e 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -86,9 +86,11 @@ static void xen_update_blkif_status(struct xen_blkif *blkif) { int err; char name[BLKBACK_NAME_LEN]; + struct xen_blkif_ring *ring; + int i; /* Not ready to connect? */ - if (!blkif->irq || !blkif->vbd.bdev) + if (!blkif->rings || !blkif->rings[0].irq || !blkif->vbd.bdev) return; /* Already connected? */ @@ -113,13 +115,55 @@ static void xen_update_blkif_status(struct xen_blkif *blkif) } invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping); - blkif->xenblkd = kthread_run(xen_blkif_schedule, blkif, "%s", name); - if (IS_ERR(blkif->xenblkd)) { - err = PTR_ERR(blkif->xenblkd); - blkif->xenblkd = NULL; - xenbus_dev_error(blkif->be->dev, err, "start xenblkd"); - return; + for (i = 0; i < blkif->nr_rings; i++) { + ring = &blkif->rings[i]; + ring->xenblkd = kthread_run(xen_blkif_schedule, ring, "%s-%d", name, i); + if (IS_ERR(ring->xenblkd)) { + err = PTR_ERR(ring->xenblkd); + ring->xenblkd = NULL; + xenbus_dev_fatal(blkif->be->dev, err, + "start %s-%d xenblkd", name, i); + goto out; + } + } + return; + +out: + while (--i >= 0) { + ring = &blkif->rings[i]; + kthread_stop(ring->xenblkd); + } + return; +} + +static int xen_blkif_alloc_rings(struct xen_blkif *blkif) +{ + unsigned int r; + + blkif->rings = kzalloc(blkif->nr_rings * sizeof(struct xen_blkif_ring), GFP_KERNEL); + if (!blkif->rings) + return -ENOMEM; + + for (r = 0; r < blkif->nr_rings; r++) { + struct xen_blkif_ring *ring = &blkif->rings[r]; + + spin_lock_init(&ring->blk_ring_lock); + init_waitqueue_head(&ring->wq); + INIT_LIST_HEAD(&ring->pending_free); + INIT_LIST_HEAD(&ring->persistent_purge_list); + INIT_WORK(&ring->persistent_purge_work, xen_blkbk_unmap_purged_grants); + spin_lock_init(&ring->free_pages_lock); + INIT_LIST_HEAD(&ring->free_pages); + + spin_lock_init(&ring->pending_free_lock); + init_waitqueue_head(&ring->pending_free_wq); + init_waitqueue_head(&ring->shutdown_wq); + ring->blkif = blkif; + ring->st_print = jiffies; + xen_blkif_get(blkif); } + + return 0; } static struct xen_blkif *xen_blkif_alloc(domid_t domid) @@ -133,41 +177,25 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid) return ERR_PTR(-ENOMEM); blkif->domid = domid; - spin_lock_init(&blkif->blk_ring_lock); atomic_set(&blkif->refcnt, 1); - init_waitqueue_head(&blkif->wq); init_completion(&blkif->drain_complete); - atomic_set(&blkif->drain, 0); - blkif->st_print = jiffies; - blkif->persistent_gnts.rb_node = NULL; - spin_lock_init(&blkif->free_pages_lock); - INIT_LIST_HEAD(&blkif->free_pages); - INIT_LIST_HEAD(&blkif->persistent_purge_list); - blkif->free_pages_num = 0; - atomic_set(&blkif->persistent_gnt_in_use, 0); - atomic_set(&blkif->inflight, 0); - INIT_WORK(&blkif->persistent_purge_work, xen_blkbk_unmap_purged_grants); - - INIT_LIST_HEAD(&blkif->pending_free); INIT_WORK(&blkif->free_work, xen_blkif_deferred_free); - spin_lock_init(&blkif->pending_free_lock); - init_waitqueue_head(&blkif->pending_free_wq); - init_waitqueue_head(&blkif->shutdown_wq); return blkif; } -static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, +static int xen_blkif_map(struct xen_blkif_ring *ring, grant_ref_t *gref, unsigned int nr_grefs, unsigned int evtchn) { int err; + struct xen_blkif *blkif = ring->blkif; /* Already connected through? */ - if (blkif->irq) + if (ring->irq) return 0; err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs, - &blkif->blk_ring); + &ring->blk_ring); if (err < 0) return err; @@ -175,24 +203,24 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, case BLKIF_PROTOCOL_NATIVE: { struct blkif_sring *sring; - sring = (struct blkif_sring *)blkif->blk_ring; - BACK_RING_INIT(&blkif->blk_rings.native, sring, + sring = (struct blkif_sring *)ring->blk_ring; + BACK_RING_INIT(&ring->blk_rings.native, sring, XEN_PAGE_SIZE * nr_grefs); break; } case BLKIF_PROTOCOL_X86_32: { struct blkif_x86_32_sring *sring_x86_32; - sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring; - BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, + sring_x86_32 = (struct blkif_x86_32_sring *)ring->blk_ring; + BACK_RING_INIT(&ring->blk_rings.x86_32, sring_x86_32, XEN_PAGE_SIZE * nr_grefs); break; } case BLKIF_PROTOCOL_X86_64: { struct blkif_x86_64_sring *sring_x86_64; - sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring; - BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, + sring_x86_64 = (struct blkif_x86_64_sring *)ring->blk_ring; + BACK_RING_INIT(&ring->blk_rings.x86_64, sring_x86_64, XEN_PAGE_SIZE * nr_grefs); break; } @@ -202,13 +230,13 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, err = bind_interdomain_evtchn_to_irqhandler(blkif->domid, evtchn, xen_blkif_be_int, 0, - "blkif-backend", blkif); + "blkif-backend", ring); if (err < 0) { - xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring); - blkif->blk_rings.common.sring = NULL; + xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring); + ring->blk_rings.common.sring = NULL; return err; } - blkif->irq = err; + ring->irq = err; return 0; } @@ -216,50 +244,69 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, static int xen_blkif_disconnect(struct xen_blkif *blkif) { struct pending_req *req, *n; - int i = 0, j; + unsigned int j, r; - if (blkif->xenblkd) { - kthread_stop(blkif->xenblkd); - wake_up(&blkif->shutdown_wq); - blkif->xenblkd = NULL; - } + for (r = 0; r < blkif->nr_rings; r++) { + struct xen_blkif_ring *ring = &blkif->rings[r]; + unsigned int i = 0; - /* The above kthread_stop() guarantees that at this point we - * don't have any discard_io or other_io requests. So, checking - * for inflight IO is enough. - */ - if (atomic_read(&blkif->inflight) > 0) - return -EBUSY; + if (ring->xenblkd) { + kthread_stop(ring->xenblkd); + wake_up(&ring->shutdown_wq); + ring->xenblkd = NULL; + } - if (blkif->irq) { - unbind_from_irqhandler(blkif->irq, blkif); - blkif->irq = 0; - } + /* The above kthread_stop() guarantees that at this point we + * don't have any discard_io or other_io requests. So, checking + * for inflight IO is enough. + */ + if (atomic_read(&ring->inflight) > 0) + return -EBUSY; - if (blkif->blk_rings.common.sring) { - xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring); - blkif->blk_rings.common.sring = NULL; - } + if (ring->irq) { + unbind_from_irqhandler(ring->irq, ring); + ring->irq = 0; + } - /* Remove all persistent grants and the cache of ballooned pages. */ - xen_blkbk_free_caches(blkif); + if (ring->blk_rings.common.sring) { + xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring); + ring->blk_rings.common.sring = NULL; + } - /* Check that there is no request in use */ - list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) { - list_del(&req->free_list); + /* Remove all persistent grants and the cache of ballooned pages. */ + xen_blkbk_free_caches(ring); - for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) - kfree(req->segments[j]); + /* Check that there is no request in use */ + list_for_each_entry_safe(req, n, &ring->pending_free, free_list) { + list_del(&req->free_list); - for (j = 0; j < MAX_INDIRECT_PAGES; j++) - kfree(req->indirect_pages[j]); + for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) + kfree(req->segments[j]); - kfree(req); - i++; - } + for (j = 0; j < MAX_INDIRECT_PAGES; j++) + kfree(req->indirect_pages[j]); + + kfree(req); + i++; + } - WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages)); + BUG_ON(atomic_read(&ring->persistent_gnt_in_use) != 0); + BUG_ON(!list_empty(&ring->persistent_purge_list)); + BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts)); + BUG_ON(!list_empty(&ring->free_pages)); + BUG_ON(ring->free_pages_num != 0); + BUG_ON(ring->persistent_gnt_c != 0); + WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages)); + xen_blkif_put(blkif); + } blkif->nr_ring_pages = 0; + /* + * blkif->rings was allocated in connect_ring, so we should free it in + * here. + */ + kfree(blkif->rings); + blkif->rings = NULL; + blkif->nr_rings = 0; return 0; } @@ -271,13 +318,6 @@ static void xen_blkif_free(struct xen_blkif *blkif) xen_vbd_free(&blkif->vbd); /* Make sure everything is drained before shutting down */ - BUG_ON(blkif->persistent_gnt_c != 0); - BUG_ON(atomic_read(&blkif->persistent_gnt_in_use) != 0); - BUG_ON(blkif->free_pages_num != 0); - BUG_ON(!list_empty(&blkif->persistent_purge_list)); - BUG_ON(!list_empty(&blkif->free_pages)); - BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); - kmem_cache_free(xen_blkif_cachep, blkif); } @@ -296,25 +336,38 @@ int __init xen_blkif_interface_init(void) * sysfs interface for VBD I/O requests */ -#define VBD_SHOW(name, format, args...) \ +#define VBD_SHOW_ALLRING(name, format) \ static ssize_t show_##name(struct device *_dev, \ struct device_attribute *attr, \ char *buf) \ { \ struct xenbus_device *dev = to_xenbus_device(_dev); \ struct backend_info *be = dev_get_drvdata(&dev->dev); \ + struct xen_blkif *blkif = be->blkif; \ + unsigned int i; \ + unsigned long long result = 0; \ \ - return sprintf(buf, format, ##args); \ + if (!blkif->rings) \ + goto out; \ + \ + for (i = 0; i < blkif->nr_rings; i++) { \ + struct xen_blkif_ring *ring = &blkif->rings[i]; \ + \ + result += ring->st_##name; \ + } \ + \ +out: \ + return sprintf(buf, format, result); \ } \ static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) -VBD_SHOW(oo_req, "%llu\n", be->blkif->st_oo_req); -VBD_SHOW(rd_req, "%llu\n", be->blkif->st_rd_req); -VBD_SHOW(wr_req, "%llu\n", be->blkif->st_wr_req); -VBD_SHOW(f_req, "%llu\n", be->blkif->st_f_req); -VBD_SHOW(ds_req, "%llu\n", be->blkif->st_ds_req); -VBD_SHOW(rd_sect, "%llu\n", be->blkif->st_rd_sect); -VBD_SHOW(wr_sect, "%llu\n", be->blkif->st_wr_sect); +VBD_SHOW_ALLRING(oo_req, "%llu\n"); +VBD_SHOW_ALLRING(rd_req, "%llu\n"); +VBD_SHOW_ALLRING(wr_req, "%llu\n"); +VBD_SHOW_ALLRING(f_req, "%llu\n"); +VBD_SHOW_ALLRING(ds_req, "%llu\n"); +VBD_SHOW_ALLRING(rd_sect, "%llu\n"); +VBD_SHOW_ALLRING(wr_sect, "%llu\n"); static struct attribute *xen_vbdstat_attrs[] = { &dev_attr_oo_req.attr, @@ -332,6 +385,18 @@ static struct attribute_group xen_vbdstat_group = { .attrs = xen_vbdstat_attrs, }; +#define VBD_SHOW(name, format, args...) \ + static ssize_t show_##name(struct device *_dev, \ + struct device_attribute *attr, \ + char *buf) \ + { \ + struct xenbus_device *dev = to_xenbus_device(_dev); \ + struct backend_info *be = dev_get_drvdata(&dev->dev); \ + \ + return sprintf(buf, format, ##args); \ + } \ + static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) + VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor); VBD_SHOW(mode, "%s\n", be->mode); @@ -440,11 +505,11 @@ static int xen_blkbk_remove(struct xenbus_device *dev) dev_set_drvdata(&dev->dev, NULL); - if (be->blkif) { + if (be->blkif) xen_blkif_disconnect(be->blkif); - xen_blkif_put(be->blkif); - } + /* Put the reference we set in xen_blkif_alloc(). */ + xen_blkif_put(be->blkif); kfree(be->mode); kfree(be); return 0; @@ -553,6 +618,12 @@ static int xen_blkbk_probe(struct xenbus_device *dev, goto fail; } + /* Multi-queue: advertise how many queues are supported by us.*/ + err = xenbus_printf(XBT_NIL, dev->nodename, + "multi-queue-max-queues", "%u", xenblk_max_queues); + if (err) + pr_warn("Error writing multi-queue-max-queues\n"); + /* setup back pointer */ be->blkif->be = be; @@ -708,8 +779,14 @@ static void frontend_changed(struct xenbus_device *dev, } err = connect_ring(be); - if (err) + if (err) { + /* + * Clean up so that memory resources can be used by + * other devices. connect_ring reported already error. + */ + xen_blkif_disconnect(be->blkif); break; + } xen_update_blkif_status(be->blkif); break; @@ -825,50 +902,43 @@ again: xenbus_transaction_end(xbt, 1); } - -static int connect_ring(struct backend_info *be) +/* + * Each ring may have multi pages, depends on "ring-page-order". + */ +static int read_per_ring_refs(struct xen_blkif_ring *ring, const char *dir) { - struct xenbus_device *dev = be->dev; unsigned int ring_ref[XENBUS_MAX_RING_GRANTS]; - unsigned int evtchn, nr_grefs, ring_page_order; - unsigned int pers_grants; - char protocol[64] = ""; struct pending_req *req, *n; int err, i, j; + struct xen_blkif *blkif = ring->blkif; + struct xenbus_device *dev = blkif->be->dev; + unsigned int ring_page_order, nr_grefs, evtchn; - pr_debug("%s %s\n", __func__, dev->otherend); - - err = xenbus_scanf(XBT_NIL, dev->otherend, "event-channel", "%u", + err = xenbus_scanf(XBT_NIL, dir, "event-channel", "%u", &evtchn); if (err != 1) { err = -EINVAL; - xenbus_dev_fatal(dev, err, "reading %s/event-channel", - dev->otherend); + xenbus_dev_fatal(dev, err, "reading %s/event-channel", dir); return err; } - pr_info("event-channel %u\n", evtchn); err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u", &ring_page_order); if (err != 1) { - err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref", - "%u", &ring_ref[0]); + err = xenbus_scanf(XBT_NIL, dir, "ring-ref", "%u", &ring_ref[0]); if (err != 1) { err = -EINVAL; - xenbus_dev_fatal(dev, err, "reading %s/ring-ref", - dev->otherend); + xenbus_dev_fatal(dev, err, "reading %s/ring-ref", dir); return err; } nr_grefs = 1; - pr_info("%s:using single page: ring-ref %d\n", dev->otherend, - ring_ref[0]); } else { unsigned int i; if (ring_page_order > xen_blkif_max_ring_order) { err = -EINVAL; xenbus_dev_fatal(dev, err, "%s/request %d ring page order exceed max:%d", - dev->otherend, ring_page_order, + dir, ring_page_order, xen_blkif_max_ring_order); return err; } @@ -878,52 +948,23 @@ static int connect_ring(struct backend_info *be) char ring_ref_name[RINGREF_NAME_LEN]; snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); - err = xenbus_scanf(XBT_NIL, dev->otherend, ring_ref_name, + err = xenbus_scanf(XBT_NIL, dir, ring_ref_name, "%u", &ring_ref[i]); if (err != 1) { err = -EINVAL; xenbus_dev_fatal(dev, err, "reading %s/%s", - dev->otherend, ring_ref_name); + dir, ring_ref_name); return err; } - pr_info("ring-ref%u: %u\n", i, ring_ref[i]); } } - - be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT; - err = xenbus_gather(XBT_NIL, dev->otherend, "protocol", - "%63s", protocol, NULL); - if (err) - strcpy(protocol, "unspecified, assuming default"); - else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE)) - be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; - else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32)) - be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32; - else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64)) - be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64; - else { - xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol); - return -1; - } - err = xenbus_gather(XBT_NIL, dev->otherend, - "feature-persistent", "%u", - &pers_grants, NULL); - if (err) - pers_grants = 0; - - be->blkif->vbd.feature_gnt_persistent = pers_grants; - be->blkif->vbd.overflow_max_grants = 0; - be->blkif->nr_ring_pages = nr_grefs; - - pr_info("ring-pages:%d, event-channel %d, protocol %d (%s) %s\n", - nr_grefs, evtchn, be->blkif->blk_protocol, protocol, - pers_grants ? "persistent grants" : ""); + blkif->nr_ring_pages = nr_grefs; for (i = 0; i < nr_grefs * XEN_BLKIF_REQS_PER_PAGE; i++) { req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) goto fail; - list_add_tail(&req->free_list, &be->blkif->pending_free); + list_add_tail(&req->free_list, &ring->pending_free); for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { req->segments[j] = kzalloc(sizeof(*req->segments[0]), GFP_KERNEL); if (!req->segments[j]) @@ -938,7 +979,7 @@ static int connect_ring(struct backend_info *be) } /* Map the shared frame, irq etc. */ - err = xen_blkif_map(be->blkif, ring_ref, nr_grefs, evtchn); + err = xen_blkif_map(ring, ring_ref, nr_grefs, evtchn); if (err) { xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn); return err; @@ -947,7 +988,7 @@ static int connect_ring(struct backend_info *be) return 0; fail: - list_for_each_entry_safe(req, n, &be->blkif->pending_free, free_list) { + list_for_each_entry_safe(req, n, &ring->pending_free, free_list) { list_del(&req->free_list); for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { if (!req->segments[j]) @@ -962,6 +1003,93 @@ fail: kfree(req); } return -ENOMEM; + +} + +static int connect_ring(struct backend_info *be) +{ + struct xenbus_device *dev = be->dev; + unsigned int pers_grants; + char protocol[64] = ""; + int err, i; + char *xspath; + size_t xspathsize; + const size_t xenstore_path_ext_size = 11; /* sufficient for "/queue-NNN" */ + unsigned int requested_num_queues = 0; + + pr_debug("%s %s\n", __func__, dev->otherend); + + be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT; + err = xenbus_gather(XBT_NIL, dev->otherend, "protocol", + "%63s", protocol, NULL); + if (err) + strcpy(protocol, "unspecified, assuming default"); + else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE)) + be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; + else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32)) + be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32; + else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64)) + be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64; + else { + xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol); + return -ENOSYS; + } + err = xenbus_gather(XBT_NIL, dev->otherend, + "feature-persistent", "%u", + &pers_grants, NULL); + if (err) + pers_grants = 0; + + be->blkif->vbd.feature_gnt_persistent = pers_grants; + be->blkif->vbd.overflow_max_grants = 0; + + /* + * Read the number of hardware queues from frontend. + */ + err = xenbus_scanf(XBT_NIL, dev->otherend, "multi-queue-num-queues", + "%u", &requested_num_queues); + if (err < 0) { + requested_num_queues = 1; + } else { + if (requested_num_queues > xenblk_max_queues + || requested_num_queues == 0) { + /* Buggy or malicious guest. */ + xenbus_dev_fatal(dev, err, + "guest requested %u queues, exceeding the maximum of %u.", + requested_num_queues, xenblk_max_queues); + return -ENOSYS; + } + } + be->blkif->nr_rings = requested_num_queues; + if (xen_blkif_alloc_rings(be->blkif)) + return -ENOMEM; + + pr_info("%s: using %d queues, protocol %d (%s) %s\n", dev->nodename, + be->blkif->nr_rings, be->blkif->blk_protocol, protocol, + pers_grants ? "persistent grants" : ""); + + if (be->blkif->nr_rings == 1) + return read_per_ring_refs(&be->blkif->rings[0], dev->otherend); + else { + xspathsize = strlen(dev->otherend) + xenstore_path_ext_size; + xspath = kmalloc(xspathsize, GFP_KERNEL); + if (!xspath) { + xenbus_dev_fatal(dev, -ENOMEM, "reading ring references"); + return -ENOMEM; + } + + for (i = 0; i < be->blkif->nr_rings; i++) { + memset(xspath, 0, xspathsize); + snprintf(xspath, xspathsize, "%s/queue-%u", dev->otherend, i); + err = read_per_ring_refs(&be->blkif->rings[i], xspath); + if (err) { + kfree(xspath); + return err; + } + } + kfree(xspath); + } + return 0; } static const struct xenbus_device_id xen_blkbk_ids[] = { diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 2fee2eef988d..8a8dc91c39f7 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -60,6 +60,20 @@ #include <asm/xen/hypervisor.h> +/* + * The minimal size of segment supported by the block framework is PAGE_SIZE. + * When Linux is using a different page size than Xen, it may not be possible + * to put all the data in a single segment. + * This can happen when the backend doesn't support indirect descriptor and + * therefore the maximum amount of data that a request can carry is + * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE = 44KB + * + * Note that we only support one extra request. So the Linux page size + * should be <= ( 2 * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) = + * 88KB. + */ +#define HAS_EXTRA_REQ (BLKIF_MAX_SEGMENTS_PER_REQUEST < XEN_PFN_PER_PAGE) + enum blkif_state { BLKIF_STATE_DISCONNECTED, BLKIF_STATE_CONNECTED, @@ -72,6 +86,13 @@ struct grant { struct list_head node; }; +enum blk_req_status { + REQ_WAITING, + REQ_DONE, + REQ_ERROR, + REQ_EOPNOTSUPP, +}; + struct blk_shadow { struct blkif_request req; struct request *request; @@ -79,6 +100,14 @@ struct blk_shadow { struct grant **indirect_grants; struct scatterlist *sg; unsigned int num_sg; + enum blk_req_status status; + + #define NO_ASSOCIATED_ID ~0UL + /* + * Id of the sibling if we ever need 2 requests when handling a + * block I/O request + */ + unsigned long associated_id; }; struct split_bio { @@ -99,6 +128,10 @@ static unsigned int xen_blkif_max_segments = 32; module_param_named(max, xen_blkif_max_segments, int, S_IRUGO); MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)"); +static unsigned int xen_blkif_max_queues = 4; +module_param_named(max_queues, xen_blkif_max_queues, uint, S_IRUGO); +MODULE_PARM_DESC(max_queues, "Maximum number of hardware queues/rings used per virtual disk"); + /* * Maximum order of pages to be used for the shared ring between front and * backend, 4KB page granularity is used. @@ -114,10 +147,35 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the __CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * XENBUS_MAX_RING_GRANTS) /* - * ring-ref%i i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19 - * characters are enough. Define to 20 to keep consist with backend. + * ring-ref%u i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19 + * characters are enough. Define to 20 to keep consistent with backend. */ #define RINGREF_NAME_LEN (20) +/* + * queue-%u would take 7 + 10(UINT_MAX) = 17 characters. + */ +#define QUEUE_NAME_LEN (17) + +/* + * Per-ring info. + * Every blkfront device can associate with one or more blkfront_ring_info, + * depending on how many hardware queues/rings to be used. + */ +struct blkfront_ring_info { + /* Lock to protect data in every ring buffer. */ + spinlock_t ring_lock; + struct blkif_front_ring ring; + unsigned int ring_ref[XENBUS_MAX_RING_GRANTS]; + unsigned int evtchn, irq; + struct work_struct work; + struct gnttab_free_callback callback; + struct blk_shadow shadow[BLK_MAX_RING_SIZE]; + struct list_head indirect_pages; + struct list_head grants; + unsigned int persistent_gnts_c; + unsigned long shadow_free; + struct blkfront_info *dev_info; +}; /* * We have one of these per vbd, whether ide, scsi or 'other'. They @@ -126,25 +184,15 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the */ struct blkfront_info { - spinlock_t io_lock; struct mutex mutex; struct xenbus_device *xbdev; struct gendisk *gd; int vdevice; blkif_vdev_t handle; enum blkif_state connected; - int ring_ref[XENBUS_MAX_RING_GRANTS]; + /* Number of pages per ring buffer. */ unsigned int nr_ring_pages; - struct blkif_front_ring ring; - unsigned int evtchn, irq; struct request_queue *rq; - struct work_struct work; - struct gnttab_free_callback callback; - struct blk_shadow shadow[BLK_MAX_RING_SIZE]; - struct list_head grants; - struct list_head indirect_pages; - unsigned int persistent_gnts_c; - unsigned long shadow_free; unsigned int feature_flush; unsigned int feature_discard:1; unsigned int feature_secdiscard:1; @@ -155,6 +203,8 @@ struct blkfront_info unsigned int max_indirect_segments; int is_ready; struct blk_mq_tag_set tag_set; + struct blkfront_ring_info *rinfo; + unsigned int nr_rings; }; static unsigned int nr_minors; @@ -198,38 +248,40 @@ static DEFINE_SPINLOCK(minor_lock); #define GREFS(_psegs) ((_psegs) * GRANTS_PER_PSEG) -static int blkfront_setup_indirect(struct blkfront_info *info); -static int blkfront_gather_backend_features(struct blkfront_info *info); +static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo); +static void blkfront_gather_backend_features(struct blkfront_info *info); -static int get_id_from_freelist(struct blkfront_info *info) +static int get_id_from_freelist(struct blkfront_ring_info *rinfo) { - unsigned long free = info->shadow_free; - BUG_ON(free >= BLK_RING_SIZE(info)); - info->shadow_free = info->shadow[free].req.u.rw.id; - info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */ + unsigned long free = rinfo->shadow_free; + + BUG_ON(free >= BLK_RING_SIZE(rinfo->dev_info)); + rinfo->shadow_free = rinfo->shadow[free].req.u.rw.id; + rinfo->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */ return free; } -static int add_id_to_freelist(struct blkfront_info *info, - unsigned long id) +static int add_id_to_freelist(struct blkfront_ring_info *rinfo, + unsigned long id) { - if (info->shadow[id].req.u.rw.id != id) + if (rinfo->shadow[id].req.u.rw.id != id) return -EINVAL; - if (info->shadow[id].request == NULL) + if (rinfo->shadow[id].request == NULL) return -EINVAL; - info->shadow[id].req.u.rw.id = info->shadow_free; - info->shadow[id].request = NULL; - info->shadow_free = id; + rinfo->shadow[id].req.u.rw.id = rinfo->shadow_free; + rinfo->shadow[id].request = NULL; + rinfo->shadow_free = id; return 0; } -static int fill_grant_buffer(struct blkfront_info *info, int num) +static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num) { + struct blkfront_info *info = rinfo->dev_info; struct page *granted_page; struct grant *gnt_list_entry, *n; int i = 0; - while(i < num) { + while (i < num) { gnt_list_entry = kzalloc(sizeof(struct grant), GFP_NOIO); if (!gnt_list_entry) goto out_of_memory; @@ -244,7 +296,7 @@ static int fill_grant_buffer(struct blkfront_info *info, int num) } gnt_list_entry->gref = GRANT_INVALID_REF; - list_add(&gnt_list_entry->node, &info->grants); + list_add(&gnt_list_entry->node, &rinfo->grants); i++; } @@ -252,7 +304,7 @@ static int fill_grant_buffer(struct blkfront_info *info, int num) out_of_memory: list_for_each_entry_safe(gnt_list_entry, n, - &info->grants, node) { + &rinfo->grants, node) { list_del(&gnt_list_entry->node); if (info->feature_persistent) __free_page(gnt_list_entry->page); @@ -263,17 +315,17 @@ out_of_memory: return -ENOMEM; } -static struct grant *get_free_grant(struct blkfront_info *info) +static struct grant *get_free_grant(struct blkfront_ring_info *rinfo) { struct grant *gnt_list_entry; - BUG_ON(list_empty(&info->grants)); - gnt_list_entry = list_first_entry(&info->grants, struct grant, + BUG_ON(list_empty(&rinfo->grants)); + gnt_list_entry = list_first_entry(&rinfo->grants, struct grant, node); list_del(&gnt_list_entry->node); if (gnt_list_entry->gref != GRANT_INVALID_REF) - info->persistent_gnts_c--; + rinfo->persistent_gnts_c--; return gnt_list_entry; } @@ -289,9 +341,10 @@ static inline void grant_foreign_access(const struct grant *gnt_list_entry, static struct grant *get_grant(grant_ref_t *gref_head, unsigned long gfn, - struct blkfront_info *info) + struct blkfront_ring_info *rinfo) { - struct grant *gnt_list_entry = get_free_grant(info); + struct grant *gnt_list_entry = get_free_grant(rinfo); + struct blkfront_info *info = rinfo->dev_info; if (gnt_list_entry->gref != GRANT_INVALID_REF) return gnt_list_entry; @@ -312,9 +365,10 @@ static struct grant *get_grant(grant_ref_t *gref_head, } static struct grant *get_indirect_grant(grant_ref_t *gref_head, - struct blkfront_info *info) + struct blkfront_ring_info *rinfo) { - struct grant *gnt_list_entry = get_free_grant(info); + struct grant *gnt_list_entry = get_free_grant(rinfo); + struct blkfront_info *info = rinfo->dev_info; if (gnt_list_entry->gref != GRANT_INVALID_REF) return gnt_list_entry; @@ -326,8 +380,8 @@ static struct grant *get_indirect_grant(grant_ref_t *gref_head, struct page *indirect_page; /* Fetch a pre-allocated page to use for indirect grefs */ - BUG_ON(list_empty(&info->indirect_pages)); - indirect_page = list_first_entry(&info->indirect_pages, + BUG_ON(list_empty(&rinfo->indirect_pages)); + indirect_page = list_first_entry(&rinfo->indirect_pages, struct page, lru); list_del(&indirect_page->lru); gnt_list_entry->page = indirect_page; @@ -403,8 +457,8 @@ static void xlbd_release_minors(unsigned int minor, unsigned int nr) static void blkif_restart_queue_callback(void *arg) { - struct blkfront_info *info = (struct blkfront_info *)arg; - schedule_work(&info->work); + struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)arg; + schedule_work(&rinfo->work); } static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg) @@ -456,16 +510,33 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode, return 0; } -static int blkif_queue_discard_req(struct request *req) +static unsigned long blkif_ring_get_request(struct blkfront_ring_info *rinfo, + struct request *req, + struct blkif_request **ring_req) { - struct blkfront_info *info = req->rq_disk->private_data; + unsigned long id; + + *ring_req = RING_GET_REQUEST(&rinfo->ring, rinfo->ring.req_prod_pvt); + rinfo->ring.req_prod_pvt++; + + id = get_id_from_freelist(rinfo); + rinfo->shadow[id].request = req; + rinfo->shadow[id].status = REQ_WAITING; + rinfo->shadow[id].associated_id = NO_ASSOCIATED_ID; + + (*ring_req)->u.rw.id = id; + + return id; +} + +static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_info *rinfo) +{ + struct blkfront_info *info = rinfo->dev_info; struct blkif_request *ring_req; unsigned long id; /* Fill out a communications ring structure. */ - ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); - id = get_id_from_freelist(info); - info->shadow[id].request = req; + id = blkif_ring_get_request(rinfo, req, &ring_req); ring_req->operation = BLKIF_OP_DISCARD; ring_req->u.discard.nr_sectors = blk_rq_sectors(req); @@ -476,10 +547,8 @@ static int blkif_queue_discard_req(struct request *req) else ring_req->u.discard.flag = 0; - info->ring.req_prod_pvt++; - /* Keep a private copy so we can reissue requests when recovering. */ - info->shadow[id].req = *ring_req; + rinfo->shadow[id].req = *ring_req; return 0; } @@ -487,7 +556,7 @@ static int blkif_queue_discard_req(struct request *req) struct setup_rw_req { unsigned int grant_idx; struct blkif_request_segment *segments; - struct blkfront_info *info; + struct blkfront_ring_info *rinfo; struct blkif_request *ring_req; grant_ref_t gref_head; unsigned int id; @@ -495,6 +564,9 @@ struct setup_rw_req { bool need_copy; unsigned int bvec_off; char *bvec_data; + + bool require_extra_req; + struct blkif_request *extra_ring_req; }; static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, @@ -507,8 +579,24 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, /* Convenient aliases */ unsigned int grant_idx = setup->grant_idx; struct blkif_request *ring_req = setup->ring_req; - struct blkfront_info *info = setup->info; - struct blk_shadow *shadow = &info->shadow[setup->id]; + struct blkfront_ring_info *rinfo = setup->rinfo; + /* + * We always use the shadow of the first request to store the list + * of grant associated to the block I/O request. This made the + * completion more easy to handle even if the block I/O request is + * split. + */ + struct blk_shadow *shadow = &rinfo->shadow[setup->id]; + + if (unlikely(setup->require_extra_req && + grant_idx >= BLKIF_MAX_SEGMENTS_PER_REQUEST)) { + /* + * We are using the second request, setup grant_idx + * to be the index of the segment array. + */ + grant_idx -= BLKIF_MAX_SEGMENTS_PER_REQUEST; + ring_req = setup->extra_ring_req; + } if ((ring_req->operation == BLKIF_OP_INDIRECT) && (grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) { @@ -516,15 +604,19 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, kunmap_atomic(setup->segments); n = grant_idx / GRANTS_PER_INDIRECT_FRAME; - gnt_list_entry = get_indirect_grant(&setup->gref_head, info); + gnt_list_entry = get_indirect_grant(&setup->gref_head, rinfo); shadow->indirect_grants[n] = gnt_list_entry; setup->segments = kmap_atomic(gnt_list_entry->page); ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref; } - gnt_list_entry = get_grant(&setup->gref_head, gfn, info); + gnt_list_entry = get_grant(&setup->gref_head, gfn, rinfo); ref = gnt_list_entry->gref; - shadow->grants_used[grant_idx] = gnt_list_entry; + /* + * All the grants are stored in the shadow of the first + * request. Therefore we have to use the global index. + */ + shadow->grants_used[setup->grant_idx] = gnt_list_entry; if (setup->need_copy) { void *shared_data; @@ -566,16 +658,36 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, (setup->grant_idx)++; } -static int blkif_queue_rw_req(struct request *req) +static void blkif_setup_extra_req(struct blkif_request *first, + struct blkif_request *second) { - struct blkfront_info *info = req->rq_disk->private_data; - struct blkif_request *ring_req; - unsigned long id; + uint16_t nr_segments = first->u.rw.nr_segments; + + /* + * The second request is only present when the first request uses + * all its segments. It's always the continuity of the first one. + */ + first->u.rw.nr_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST; + + second->u.rw.nr_segments = nr_segments - BLKIF_MAX_SEGMENTS_PER_REQUEST; + second->u.rw.sector_number = first->u.rw.sector_number + + (BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) / 512; + + second->u.rw.handle = first->u.rw.handle; + second->operation = first->operation; +} + +static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *rinfo) +{ + struct blkfront_info *info = rinfo->dev_info; + struct blkif_request *ring_req, *extra_ring_req = NULL; + unsigned long id, extra_id = NO_ASSOCIATED_ID; + bool require_extra_req = false; int i; struct setup_rw_req setup = { .grant_idx = 0, .segments = NULL, - .info = info, + .rinfo = rinfo, .need_copy = rq_data_dir(req) && info->feature_persistent, }; @@ -584,7 +696,6 @@ static int blkif_queue_rw_req(struct request *req) * existing persistent grants, or if we have to get new grants, * as there are not sufficiently many free. */ - bool new_persistent_gnts; struct scatterlist *sg; int num_sg, max_grefs, num_grant; @@ -596,41 +707,36 @@ static int blkif_queue_rw_req(struct request *req) */ max_grefs += INDIRECT_GREFS(max_grefs); - /* Check if we have enough grants to allocate a requests */ - if (info->persistent_gnts_c < max_grefs) { - new_persistent_gnts = 1; - if (gnttab_alloc_grant_references( - max_grefs - info->persistent_gnts_c, - &setup.gref_head) < 0) { + /* + * We have to reserve 'max_grefs' grants because persistent + * grants are shared by all rings. + */ + if (max_grefs > 0) + if (gnttab_alloc_grant_references(max_grefs, &setup.gref_head) < 0) { gnttab_request_free_callback( - &info->callback, + &rinfo->callback, blkif_restart_queue_callback, - info, + rinfo, max_grefs); return 1; } - } else - new_persistent_gnts = 0; /* Fill out a communications ring structure. */ - ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); - id = get_id_from_freelist(info); - info->shadow[id].request = req; - - BUG_ON(info->max_indirect_segments == 0 && - GREFS(req->nr_phys_segments) > BLKIF_MAX_SEGMENTS_PER_REQUEST); - BUG_ON(info->max_indirect_segments && - GREFS(req->nr_phys_segments) > info->max_indirect_segments); + id = blkif_ring_get_request(rinfo, req, &ring_req); - num_sg = blk_rq_map_sg(req->q, req, info->shadow[id].sg); + num_sg = blk_rq_map_sg(req->q, req, rinfo->shadow[id].sg); num_grant = 0; /* Calculate the number of grant used */ - for_each_sg(info->shadow[id].sg, sg, num_sg, i) + for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) num_grant += gnttab_count_grant(sg->offset, sg->length); - ring_req->u.rw.id = id; - info->shadow[id].num_sg = num_sg; - if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST) { + require_extra_req = info->max_indirect_segments == 0 && + num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST; + BUG_ON(!HAS_EXTRA_REQ && require_extra_req); + + rinfo->shadow[id].num_sg = num_sg; + if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST && + likely(!require_extra_req)) { /* * The indirect operation can only be a BLKIF_OP_READ or * BLKIF_OP_WRITE @@ -670,11 +776,31 @@ static int blkif_queue_rw_req(struct request *req) } } ring_req->u.rw.nr_segments = num_grant; + if (unlikely(require_extra_req)) { + extra_id = blkif_ring_get_request(rinfo, req, + &extra_ring_req); + /* + * Only the first request contains the scatter-gather + * list. + */ + rinfo->shadow[extra_id].num_sg = 0; + + blkif_setup_extra_req(ring_req, extra_ring_req); + + /* Link the 2 requests together */ + rinfo->shadow[extra_id].associated_id = id; + rinfo->shadow[id].associated_id = extra_id; + } } setup.ring_req = ring_req; setup.id = id; - for_each_sg(info->shadow[id].sg, sg, num_sg, i) { + + setup.require_extra_req = require_extra_req; + if (unlikely(require_extra_req)) + setup.extra_ring_req = extra_ring_req; + + for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) { BUG_ON(sg->offset + sg->length > PAGE_SIZE); if (setup.need_copy) { @@ -694,12 +820,12 @@ static int blkif_queue_rw_req(struct request *req) if (setup.segments) kunmap_atomic(setup.segments); - info->ring.req_prod_pvt++; - /* Keep a private copy so we can reissue requests when recovering. */ - info->shadow[id].req = *ring_req; + rinfo->shadow[id].req = *ring_req; + if (unlikely(require_extra_req)) + rinfo->shadow[extra_id].req = *extra_ring_req; - if (new_persistent_gnts) + if (max_grefs > 0) gnttab_free_grant_references(setup.gref_head); return 0; @@ -711,27 +837,25 @@ static int blkif_queue_rw_req(struct request *req) * * @req: a request struct */ -static int blkif_queue_request(struct request *req) +static int blkif_queue_request(struct request *req, struct blkfront_ring_info *rinfo) { - struct blkfront_info *info = req->rq_disk->private_data; - - if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) + if (unlikely(rinfo->dev_info->connected != BLKIF_STATE_CONNECTED)) return 1; if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) - return blkif_queue_discard_req(req); + return blkif_queue_discard_req(req, rinfo); else - return blkif_queue_rw_req(req); + return blkif_queue_rw_req(req, rinfo); } -static inline void flush_requests(struct blkfront_info *info) +static inline void flush_requests(struct blkfront_ring_info *rinfo) { int notify; - RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify); + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&rinfo->ring, notify); if (notify) - notify_remote_via_irq(info->irq); + notify_remote_via_irq(rinfo->irq); } static inline bool blkif_request_flush_invalid(struct request *req, @@ -745,38 +869,50 @@ static inline bool blkif_request_flush_invalid(struct request *req, } static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx, - const struct blk_mq_queue_data *qd) + const struct blk_mq_queue_data *qd) { - struct blkfront_info *info = qd->rq->rq_disk->private_data; + unsigned long flags; + struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)hctx->driver_data; blk_mq_start_request(qd->rq); - spin_lock_irq(&info->io_lock); - if (RING_FULL(&info->ring)) + spin_lock_irqsave(&rinfo->ring_lock, flags); + if (RING_FULL(&rinfo->ring)) goto out_busy; - if (blkif_request_flush_invalid(qd->rq, info)) + if (blkif_request_flush_invalid(qd->rq, rinfo->dev_info)) goto out_err; - if (blkif_queue_request(qd->rq)) + if (blkif_queue_request(qd->rq, rinfo)) goto out_busy; - flush_requests(info); - spin_unlock_irq(&info->io_lock); + flush_requests(rinfo); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); return BLK_MQ_RQ_QUEUE_OK; out_err: - spin_unlock_irq(&info->io_lock); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); return BLK_MQ_RQ_QUEUE_ERROR; out_busy: - spin_unlock_irq(&info->io_lock); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); blk_mq_stop_hw_queue(hctx); return BLK_MQ_RQ_QUEUE_BUSY; } +static int blk_mq_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int index) +{ + struct blkfront_info *info = (struct blkfront_info *)data; + + BUG_ON(info->nr_rings <= index); + hctx->driver_data = &info->rinfo[index]; + return 0; +} + static struct blk_mq_ops blkfront_mq_ops = { .queue_rq = blkif_queue_rq, .map_queue = blk_mq_map_queue, + .init_hctx = blk_mq_init_hctx, }; static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, @@ -788,19 +924,28 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, memset(&info->tag_set, 0, sizeof(info->tag_set)); info->tag_set.ops = &blkfront_mq_ops; - info->tag_set.nr_hw_queues = 1; - info->tag_set.queue_depth = BLK_RING_SIZE(info); + info->tag_set.nr_hw_queues = info->nr_rings; + if (HAS_EXTRA_REQ && info->max_indirect_segments == 0) { + /* + * When indirect descriptior is not supported, the I/O request + * will be split between multiple request in the ring. + * To avoid problems when sending the request, divide by + * 2 the depth of the queue. + */ + info->tag_set.queue_depth = BLK_RING_SIZE(info) / 2; + } else + info->tag_set.queue_depth = BLK_RING_SIZE(info); info->tag_set.numa_node = NUMA_NO_NODE; info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; info->tag_set.cmd_size = 0; info->tag_set.driver_data = info; if (blk_mq_alloc_tag_set(&info->tag_set)) - return -1; + return -EINVAL; rq = blk_mq_init_queue(&info->tag_set); if (IS_ERR(rq)) { blk_mq_free_tag_set(&info->tag_set); - return -1; + return PTR_ERR(rq); } queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq); @@ -1028,7 +1173,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, static void xlvbd_release_gendisk(struct blkfront_info *info) { - unsigned int minor, nr_minors; + unsigned int minor, nr_minors, i; if (info->rq == NULL) return; @@ -1036,11 +1181,15 @@ static void xlvbd_release_gendisk(struct blkfront_info *info) /* No more blkif_request(). */ blk_mq_stop_hw_queues(info->rq); - /* No more gnttab callback work. */ - gnttab_cancel_free_callback(&info->callback); + for (i = 0; i < info->nr_rings; i++) { + struct blkfront_ring_info *rinfo = &info->rinfo[i]; - /* Flush gnttab callback work. Must be done with no locks held. */ - flush_work(&info->work); + /* No more gnttab callback work. */ + gnttab_cancel_free_callback(&rinfo->callback); + + /* Flush gnttab callback work. Must be done with no locks held. */ + flush_work(&rinfo->work); + } del_gendisk(info->gd); @@ -1056,88 +1205,87 @@ static void xlvbd_release_gendisk(struct blkfront_info *info) info->gd = NULL; } -/* Must be called with io_lock holded */ -static void kick_pending_request_queues(struct blkfront_info *info) +/* Already hold rinfo->ring_lock. */ +static inline void kick_pending_request_queues_locked(struct blkfront_ring_info *rinfo) { - if (!RING_FULL(&info->ring)) - blk_mq_start_stopped_hw_queues(info->rq, true); + if (!RING_FULL(&rinfo->ring)) + blk_mq_start_stopped_hw_queues(rinfo->dev_info->rq, true); } -static void blkif_restart_queue(struct work_struct *work) +static void kick_pending_request_queues(struct blkfront_ring_info *rinfo) { - struct blkfront_info *info = container_of(work, struct blkfront_info, work); + unsigned long flags; - spin_lock_irq(&info->io_lock); - if (info->connected == BLKIF_STATE_CONNECTED) - kick_pending_request_queues(info); - spin_unlock_irq(&info->io_lock); + spin_lock_irqsave(&rinfo->ring_lock, flags); + kick_pending_request_queues_locked(rinfo); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); } -static void blkif_free(struct blkfront_info *info, int suspend) +static void blkif_restart_queue(struct work_struct *work) { - struct grant *persistent_gnt; - struct grant *n; - int i, j, segs; + struct blkfront_ring_info *rinfo = container_of(work, struct blkfront_ring_info, work); - /* Prevent new requests being issued until we fix things up. */ - spin_lock_irq(&info->io_lock); - info->connected = suspend ? - BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; - /* No more blkif_request(). */ - if (info->rq) - blk_mq_stop_hw_queues(info->rq); + if (rinfo->dev_info->connected == BLKIF_STATE_CONNECTED) + kick_pending_request_queues(rinfo); +} - /* Remove all persistent grants */ - if (!list_empty(&info->grants)) { - list_for_each_entry_safe(persistent_gnt, n, - &info->grants, node) { - list_del(&persistent_gnt->node); - if (persistent_gnt->gref != GRANT_INVALID_REF) { - gnttab_end_foreign_access(persistent_gnt->gref, - 0, 0UL); - info->persistent_gnts_c--; - } - if (info->feature_persistent) - __free_page(persistent_gnt->page); - kfree(persistent_gnt); - } - } - BUG_ON(info->persistent_gnts_c != 0); +static void blkif_free_ring(struct blkfront_ring_info *rinfo) +{ + struct grant *persistent_gnt, *n; + struct blkfront_info *info = rinfo->dev_info; + int i, j, segs; /* * Remove indirect pages, this only happens when using indirect * descriptors but not persistent grants */ - if (!list_empty(&info->indirect_pages)) { + if (!list_empty(&rinfo->indirect_pages)) { struct page *indirect_page, *n; BUG_ON(info->feature_persistent); - list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) { + list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) { list_del(&indirect_page->lru); __free_page(indirect_page); } } + /* Remove all persistent grants. */ + if (!list_empty(&rinfo->grants)) { + list_for_each_entry_safe(persistent_gnt, n, + &rinfo->grants, node) { + list_del(&persistent_gnt->node); + if (persistent_gnt->gref != GRANT_INVALID_REF) { + gnttab_end_foreign_access(persistent_gnt->gref, + 0, 0UL); + rinfo->persistent_gnts_c--; + } + if (info->feature_persistent) + __free_page(persistent_gnt->page); + kfree(persistent_gnt); + } + } + BUG_ON(rinfo->persistent_gnts_c != 0); + for (i = 0; i < BLK_RING_SIZE(info); i++) { /* * Clear persistent grants present in requests already * on the shared ring */ - if (!info->shadow[i].request) + if (!rinfo->shadow[i].request) goto free_shadow; - segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ? - info->shadow[i].req.u.indirect.nr_segments : - info->shadow[i].req.u.rw.nr_segments; + segs = rinfo->shadow[i].req.operation == BLKIF_OP_INDIRECT ? + rinfo->shadow[i].req.u.indirect.nr_segments : + rinfo->shadow[i].req.u.rw.nr_segments; for (j = 0; j < segs; j++) { - persistent_gnt = info->shadow[i].grants_used[j]; + persistent_gnt = rinfo->shadow[i].grants_used[j]; gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); if (info->feature_persistent) __free_page(persistent_gnt->page); kfree(persistent_gnt); } - if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT) + if (rinfo->shadow[i].req.operation != BLKIF_OP_INDIRECT) /* * If this is not an indirect operation don't try to * free indirect segments @@ -1145,42 +1293,59 @@ static void blkif_free(struct blkfront_info *info, int suspend) goto free_shadow; for (j = 0; j < INDIRECT_GREFS(segs); j++) { - persistent_gnt = info->shadow[i].indirect_grants[j]; + persistent_gnt = rinfo->shadow[i].indirect_grants[j]; gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); __free_page(persistent_gnt->page); kfree(persistent_gnt); } free_shadow: - kfree(info->shadow[i].grants_used); - info->shadow[i].grants_used = NULL; - kfree(info->shadow[i].indirect_grants); - info->shadow[i].indirect_grants = NULL; - kfree(info->shadow[i].sg); - info->shadow[i].sg = NULL; + kfree(rinfo->shadow[i].grants_used); + rinfo->shadow[i].grants_used = NULL; + kfree(rinfo->shadow[i].indirect_grants); + rinfo->shadow[i].indirect_grants = NULL; + kfree(rinfo->shadow[i].sg); + rinfo->shadow[i].sg = NULL; } /* No more gnttab callback work. */ - gnttab_cancel_free_callback(&info->callback); - spin_unlock_irq(&info->io_lock); + gnttab_cancel_free_callback(&rinfo->callback); /* Flush gnttab callback work. Must be done with no locks held. */ - flush_work(&info->work); + flush_work(&rinfo->work); /* Free resources associated with old device channel. */ for (i = 0; i < info->nr_ring_pages; i++) { - if (info->ring_ref[i] != GRANT_INVALID_REF) { - gnttab_end_foreign_access(info->ring_ref[i], 0, 0); - info->ring_ref[i] = GRANT_INVALID_REF; + if (rinfo->ring_ref[i] != GRANT_INVALID_REF) { + gnttab_end_foreign_access(rinfo->ring_ref[i], 0, 0); + rinfo->ring_ref[i] = GRANT_INVALID_REF; } } - free_pages((unsigned long)info->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE)); - info->ring.sring = NULL; + free_pages((unsigned long)rinfo->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE)); + rinfo->ring.sring = NULL; - if (info->irq) - unbind_from_irqhandler(info->irq, info); - info->evtchn = info->irq = 0; + if (rinfo->irq) + unbind_from_irqhandler(rinfo->irq, rinfo); + rinfo->evtchn = rinfo->irq = 0; +} +static void blkif_free(struct blkfront_info *info, int suspend) +{ + unsigned int i; + + /* Prevent new requests being issued until we fix things up. */ + info->connected = suspend ? + BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; + /* No more blkif_request(). */ + if (info->rq) + blk_mq_stop_hw_queues(info->rq); + + for (i = 0; i < info->nr_rings; i++) + blkif_free_ring(&info->rinfo[i]); + + kfree(info->rinfo); + info->rinfo = NULL; + info->nr_rings = 0; } struct copy_from_grant { @@ -1209,19 +1374,93 @@ static void blkif_copy_from_grant(unsigned long gfn, unsigned int offset, kunmap_atomic(shared_data); } -static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, +static enum blk_req_status blkif_rsp_to_req_status(int rsp) +{ + switch (rsp) + { + case BLKIF_RSP_OKAY: + return REQ_DONE; + case BLKIF_RSP_EOPNOTSUPP: + return REQ_EOPNOTSUPP; + case BLKIF_RSP_ERROR: + /* Fallthrough. */ + default: + return REQ_ERROR; + } +} + +/* + * Get the final status of the block request based on two ring response + */ +static int blkif_get_final_status(enum blk_req_status s1, + enum blk_req_status s2) +{ + BUG_ON(s1 == REQ_WAITING); + BUG_ON(s2 == REQ_WAITING); + + if (s1 == REQ_ERROR || s2 == REQ_ERROR) + return BLKIF_RSP_ERROR; + else if (s1 == REQ_EOPNOTSUPP || s2 == REQ_EOPNOTSUPP) + return BLKIF_RSP_EOPNOTSUPP; + return BLKIF_RSP_OKAY; +} + +static bool blkif_completion(unsigned long *id, + struct blkfront_ring_info *rinfo, struct blkif_response *bret) { int i = 0; struct scatterlist *sg; int num_sg, num_grant; + struct blkfront_info *info = rinfo->dev_info; + struct blk_shadow *s = &rinfo->shadow[*id]; struct copy_from_grant data = { - .s = s, .grant_idx = 0, }; num_grant = s->req.operation == BLKIF_OP_INDIRECT ? s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments; + + /* The I/O request may be split in two. */ + if (unlikely(s->associated_id != NO_ASSOCIATED_ID)) { + struct blk_shadow *s2 = &rinfo->shadow[s->associated_id]; + + /* Keep the status of the current response in shadow. */ + s->status = blkif_rsp_to_req_status(bret->status); + + /* Wait the second response if not yet here. */ + if (s2->status == REQ_WAITING) + return 0; + + bret->status = blkif_get_final_status(s->status, + s2->status); + + /* + * All the grants is stored in the first shadow in order + * to make the completion code simpler. + */ + num_grant += s2->req.u.rw.nr_segments; + + /* + * The two responses may not come in order. Only the + * first request will store the scatter-gather list. + */ + if (s2->num_sg != 0) { + /* Update "id" with the ID of the first response. */ + *id = s->associated_id; + s = s2; + } + + /* + * We don't need anymore the second request, so recycling + * it now. + */ + if (add_id_to_freelist(rinfo, s->associated_id)) + WARN(1, "%s: can't recycle the second part (id = %ld) of the request\n", + info->gd->disk_name, s->associated_id); + } + + data.s = s; num_sg = s->num_sg; if (bret->operation == BLKIF_OP_READ && info->feature_persistent) { @@ -1252,8 +1491,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, if (!info->feature_persistent) pr_alert_ratelimited("backed has not unmapped grant: %u\n", s->grants_used[i]->gref); - list_add(&s->grants_used[i]->node, &info->grants); - info->persistent_gnts_c++; + list_add(&s->grants_used[i]->node, &rinfo->grants); + rinfo->persistent_gnts_c++; } else { /* * If the grant is not mapped by the backend we end the @@ -1263,7 +1502,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, */ gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL); s->grants_used[i]->gref = GRANT_INVALID_REF; - list_add_tail(&s->grants_used[i]->node, &info->grants); + list_add_tail(&s->grants_used[i]->node, &rinfo->grants); } } if (s->req.operation == BLKIF_OP_INDIRECT) { @@ -1272,8 +1511,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, if (!info->feature_persistent) pr_alert_ratelimited("backed has not unmapped grant: %u\n", s->indirect_grants[i]->gref); - list_add(&s->indirect_grants[i]->node, &info->grants); - info->persistent_gnts_c++; + list_add(&s->indirect_grants[i]->node, &rinfo->grants); + rinfo->persistent_gnts_c++; } else { struct page *indirect_page; @@ -1284,13 +1523,15 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, */ if (!info->feature_persistent) { indirect_page = s->indirect_grants[i]->page; - list_add(&indirect_page->lru, &info->indirect_pages); + list_add(&indirect_page->lru, &rinfo->indirect_pages); } s->indirect_grants[i]->gref = GRANT_INVALID_REF; - list_add_tail(&s->indirect_grants[i]->node, &info->grants); + list_add_tail(&s->indirect_grants[i]->node, &rinfo->grants); } } } + + return 1; } static irqreturn_t blkif_interrupt(int irq, void *dev_id) @@ -1299,24 +1540,22 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) struct blkif_response *bret; RING_IDX i, rp; unsigned long flags; - struct blkfront_info *info = (struct blkfront_info *)dev_id; + struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)dev_id; + struct blkfront_info *info = rinfo->dev_info; int error; - spin_lock_irqsave(&info->io_lock, flags); - - if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) { - spin_unlock_irqrestore(&info->io_lock, flags); + if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) return IRQ_HANDLED; - } + spin_lock_irqsave(&rinfo->ring_lock, flags); again: - rp = info->ring.sring->rsp_prod; + rp = rinfo->ring.sring->rsp_prod; rmb(); /* Ensure we see queued responses up to 'rp'. */ - for (i = info->ring.rsp_cons; i != rp; i++) { + for (i = rinfo->ring.rsp_cons; i != rp; i++) { unsigned long id; - bret = RING_GET_RESPONSE(&info->ring, i); + bret = RING_GET_RESPONSE(&rinfo->ring, i); id = bret->id; /* * The backend has messed up and given us an id that we would @@ -1330,12 +1569,18 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) * the id is busted. */ continue; } - req = info->shadow[id].request; + req = rinfo->shadow[id].request; - if (bret->operation != BLKIF_OP_DISCARD) - blkif_completion(&info->shadow[id], info, bret); + if (bret->operation != BLKIF_OP_DISCARD) { + /* + * We may need to wait for an extra response if the + * I/O request is split in 2 + */ + if (!blkif_completion(&id, rinfo, bret)) + continue; + } - if (add_id_to_freelist(info, id)) { + if (add_id_to_freelist(rinfo, id)) { WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n", info->gd->disk_name, op_name(bret->operation), id); continue; @@ -1364,7 +1609,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) error = -EOPNOTSUPP; } if (unlikely(bret->status == BLKIF_RSP_ERROR && - info->shadow[id].req.u.rw.nr_segments == 0)) { + rinfo->shadow[id].req.u.rw.nr_segments == 0)) { printk(KERN_WARNING "blkfront: %s: empty %s op failed\n", info->gd->disk_name, op_name(bret->operation)); error = -EOPNOTSUPP; @@ -1389,34 +1634,35 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) } } - info->ring.rsp_cons = i; + rinfo->ring.rsp_cons = i; - if (i != info->ring.req_prod_pvt) { + if (i != rinfo->ring.req_prod_pvt) { int more_to_do; - RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do); + RING_FINAL_CHECK_FOR_RESPONSES(&rinfo->ring, more_to_do); if (more_to_do) goto again; } else - info->ring.sring->rsp_event = i + 1; + rinfo->ring.sring->rsp_event = i + 1; - kick_pending_request_queues(info); + kick_pending_request_queues_locked(rinfo); - spin_unlock_irqrestore(&info->io_lock, flags); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); return IRQ_HANDLED; } static int setup_blkring(struct xenbus_device *dev, - struct blkfront_info *info) + struct blkfront_ring_info *rinfo) { struct blkif_sring *sring; int err, i; + struct blkfront_info *info = rinfo->dev_info; unsigned long ring_size = info->nr_ring_pages * XEN_PAGE_SIZE; grant_ref_t gref[XENBUS_MAX_RING_GRANTS]; for (i = 0; i < info->nr_ring_pages; i++) - info->ring_ref[i] = GRANT_INVALID_REF; + rinfo->ring_ref[i] = GRANT_INVALID_REF; sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH, get_order(ring_size)); @@ -1425,29 +1671,29 @@ static int setup_blkring(struct xenbus_device *dev, return -ENOMEM; } SHARED_RING_INIT(sring); - FRONT_RING_INIT(&info->ring, sring, ring_size); + FRONT_RING_INIT(&rinfo->ring, sring, ring_size); - err = xenbus_grant_ring(dev, info->ring.sring, info->nr_ring_pages, gref); + err = xenbus_grant_ring(dev, rinfo->ring.sring, info->nr_ring_pages, gref); if (err < 0) { free_pages((unsigned long)sring, get_order(ring_size)); - info->ring.sring = NULL; + rinfo->ring.sring = NULL; goto fail; } for (i = 0; i < info->nr_ring_pages; i++) - info->ring_ref[i] = gref[i]; + rinfo->ring_ref[i] = gref[i]; - err = xenbus_alloc_evtchn(dev, &info->evtchn); + err = xenbus_alloc_evtchn(dev, &rinfo->evtchn); if (err) goto fail; - err = bind_evtchn_to_irqhandler(info->evtchn, blkif_interrupt, 0, - "blkif", info); + err = bind_evtchn_to_irqhandler(rinfo->evtchn, blkif_interrupt, 0, + "blkif", rinfo); if (err <= 0) { xenbus_dev_fatal(dev, err, "bind_evtchn_to_irqhandler failed"); goto fail; } - info->irq = err; + rinfo->irq = err; return 0; fail: @@ -1455,6 +1701,53 @@ fail: return err; } +/* + * Write out per-ring/queue nodes including ring-ref and event-channel, and each + * ring buffer may have multi pages depending on ->nr_ring_pages. + */ +static int write_per_ring_nodes(struct xenbus_transaction xbt, + struct blkfront_ring_info *rinfo, const char *dir) +{ + int err; + unsigned int i; + const char *message = NULL; + struct blkfront_info *info = rinfo->dev_info; + + if (info->nr_ring_pages == 1) { + err = xenbus_printf(xbt, dir, "ring-ref", "%u", rinfo->ring_ref[0]); + if (err) { + message = "writing ring-ref"; + goto abort_transaction; + } + } else { + for (i = 0; i < info->nr_ring_pages; i++) { + char ring_ref_name[RINGREF_NAME_LEN]; + + snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); + err = xenbus_printf(xbt, dir, ring_ref_name, + "%u", rinfo->ring_ref[i]); + if (err) { + message = "writing ring-ref"; + goto abort_transaction; + } + } + } + + err = xenbus_printf(xbt, dir, "event-channel", "%u", rinfo->evtchn); + if (err) { + message = "writing event-channel"; + goto abort_transaction; + } + + return 0; + +abort_transaction: + xenbus_transaction_end(xbt, 1); + if (message) + xenbus_dev_fatal(info->xbdev, err, "%s", message); + + return err; +} /* Common code used when first setting up, and when resuming. */ static int talk_to_blkback(struct xenbus_device *dev, @@ -1462,8 +1755,8 @@ static int talk_to_blkback(struct xenbus_device *dev, { const char *message = NULL; struct xenbus_transaction xbt; - int err, i; - unsigned int max_page_order = 0; + int err; + unsigned int i, max_page_order = 0; unsigned int ring_page_order = 0; err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, @@ -1475,10 +1768,14 @@ static int talk_to_blkback(struct xenbus_device *dev, info->nr_ring_pages = 1 << ring_page_order; } - /* Create shared ring, alloc event channel. */ - err = setup_blkring(dev, info); - if (err) - goto out; + for (i = 0; i < info->nr_rings; i++) { + struct blkfront_ring_info *rinfo = &info->rinfo[i]; + + /* Create shared ring, alloc event channel. */ + err = setup_blkring(dev, rinfo); + if (err) + goto destroy_blkring; + } again: err = xenbus_transaction_start(&xbt); @@ -1487,38 +1784,49 @@ again: goto destroy_blkring; } - if (info->nr_ring_pages == 1) { - err = xenbus_printf(xbt, dev->nodename, - "ring-ref", "%u", info->ring_ref[0]); + if (info->nr_ring_pages > 1) { + err = xenbus_printf(xbt, dev->nodename, "ring-page-order", "%u", + ring_page_order); if (err) { - message = "writing ring-ref"; + message = "writing ring-page-order"; goto abort_transaction; } + } + + /* We already got the number of queues/rings in _probe */ + if (info->nr_rings == 1) { + err = write_per_ring_nodes(xbt, &info->rinfo[0], dev->nodename); + if (err) + goto destroy_blkring; } else { - err = xenbus_printf(xbt, dev->nodename, - "ring-page-order", "%u", ring_page_order); + char *path; + size_t pathsize; + + err = xenbus_printf(xbt, dev->nodename, "multi-queue-num-queues", "%u", + info->nr_rings); if (err) { - message = "writing ring-page-order"; + message = "writing multi-queue-num-queues"; goto abort_transaction; } - for (i = 0; i < info->nr_ring_pages; i++) { - char ring_ref_name[RINGREF_NAME_LEN]; + pathsize = strlen(dev->nodename) + QUEUE_NAME_LEN; + path = kmalloc(pathsize, GFP_KERNEL); + if (!path) { + err = -ENOMEM; + message = "ENOMEM while writing ring references"; + goto abort_transaction; + } - snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); - err = xenbus_printf(xbt, dev->nodename, ring_ref_name, - "%u", info->ring_ref[i]); + for (i = 0; i < info->nr_rings; i++) { + memset(path, 0, pathsize); + snprintf(path, pathsize, "%s/queue-%u", dev->nodename, i); + err = write_per_ring_nodes(xbt, &info->rinfo[i], path); if (err) { - message = "writing ring-ref"; - goto abort_transaction; + kfree(path); + goto destroy_blkring; } } - } - err = xenbus_printf(xbt, dev->nodename, - "event-channel", "%u", info->evtchn); - if (err) { - message = "writing event-channel"; - goto abort_transaction; + kfree(path); } err = xenbus_printf(xbt, dev->nodename, "protocol", "%s", XEN_IO_PROTO_ABI_NATIVE); @@ -1540,9 +1848,14 @@ again: goto destroy_blkring; } - for (i = 0; i < BLK_RING_SIZE(info); i++) - info->shadow[i].req.u.rw.id = i+1; - info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; + for (i = 0; i < info->nr_rings; i++) { + unsigned int j; + struct blkfront_ring_info *rinfo = &info->rinfo[i]; + + for (j = 0; j < BLK_RING_SIZE(info); j++) + rinfo->shadow[j].req.u.rw.id = j + 1; + rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; + } xenbus_switch_state(dev, XenbusStateInitialised); return 0; @@ -1553,7 +1866,10 @@ again: xenbus_dev_fatal(dev, err, "%s", message); destroy_blkring: blkif_free(info, 0); - out: + + kfree(info); + dev_set_drvdata(&dev->dev, NULL); + return err; } @@ -1567,7 +1883,9 @@ static int blkfront_probe(struct xenbus_device *dev, const struct xenbus_device_id *id) { int err, vdevice; + unsigned int r_index; struct blkfront_info *info; + unsigned int backend_max_queues = 0; /* FIXME: Use dynamic device id if this is not set. */ err = xenbus_scanf(XBT_NIL, dev->nodename, @@ -1617,15 +1935,39 @@ static int blkfront_probe(struct xenbus_device *dev, return -ENOMEM; } - mutex_init(&info->mutex); - spin_lock_init(&info->io_lock); info->xbdev = dev; + /* Check if backend supports multiple queues. */ + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, + "multi-queue-max-queues", "%u", &backend_max_queues); + if (err < 0) + backend_max_queues = 1; + + info->nr_rings = min(backend_max_queues, xen_blkif_max_queues); + /* We need at least one ring. */ + if (!info->nr_rings) + info->nr_rings = 1; + + info->rinfo = kzalloc(sizeof(struct blkfront_ring_info) * info->nr_rings, GFP_KERNEL); + if (!info->rinfo) { + xenbus_dev_fatal(dev, -ENOMEM, "allocating ring_info structure"); + kfree(info); + return -ENOMEM; + } + + for (r_index = 0; r_index < info->nr_rings; r_index++) { + struct blkfront_ring_info *rinfo; + + rinfo = &info->rinfo[r_index]; + INIT_LIST_HEAD(&rinfo->indirect_pages); + INIT_LIST_HEAD(&rinfo->grants); + rinfo->dev_info = info; + INIT_WORK(&rinfo->work, blkif_restart_queue); + spin_lock_init(&rinfo->ring_lock); + } + + mutex_init(&info->mutex); info->vdevice = vdevice; - INIT_LIST_HEAD(&info->grants); - INIT_LIST_HEAD(&info->indirect_pages); - info->persistent_gnts_c = 0; info->connected = BLKIF_STATE_DISCONNECTED; - INIT_WORK(&info->work, blkif_restart_queue); /* Front end dir is a number, which is used as the id. */ info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0); @@ -1649,7 +1991,7 @@ static void split_bio_end(struct bio *bio) static int blkif_recover(struct blkfront_info *info) { - int i; + unsigned int i, r_index; struct request *req, *n; struct blk_shadow *copy; int rc; @@ -1660,64 +2002,73 @@ static int blkif_recover(struct blkfront_info *info) struct split_bio *split_bio; struct list_head requests; - /* Stage 1: Make a safe copy of the shadow state. */ - copy = kmemdup(info->shadow, sizeof(info->shadow), - GFP_NOIO | __GFP_REPEAT | __GFP_HIGH); - if (!copy) - return -ENOMEM; - - /* Stage 2: Set up free list. */ - memset(&info->shadow, 0, sizeof(info->shadow)); - for (i = 0; i < BLK_RING_SIZE(info); i++) - info->shadow[i].req.u.rw.id = i+1; - info->shadow_free = info->ring.req_prod_pvt; - info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; - - rc = blkfront_gather_backend_features(info); - if (rc) { - kfree(copy); - return rc; - } - + blkfront_gather_backend_features(info); segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST; blk_queue_max_segments(info->rq, segs); bio_list_init(&bio_list); INIT_LIST_HEAD(&requests); - for (i = 0; i < BLK_RING_SIZE(info); i++) { - /* Not in use? */ - if (!copy[i].request) - continue; - /* - * Get the bios in the request so we can re-queue them. - */ - if (copy[i].request->cmd_flags & - (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { + for (r_index = 0; r_index < info->nr_rings; r_index++) { + struct blkfront_ring_info *rinfo; + + rinfo = &info->rinfo[r_index]; + /* Stage 1: Make a safe copy of the shadow state. */ + copy = kmemdup(rinfo->shadow, sizeof(rinfo->shadow), + GFP_NOIO | __GFP_REPEAT | __GFP_HIGH); + if (!copy) + return -ENOMEM; + + /* Stage 2: Set up free list. */ + memset(&rinfo->shadow, 0, sizeof(rinfo->shadow)); + for (i = 0; i < BLK_RING_SIZE(info); i++) + rinfo->shadow[i].req.u.rw.id = i+1; + rinfo->shadow_free = rinfo->ring.req_prod_pvt; + rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; + + rc = blkfront_setup_indirect(rinfo); + if (rc) { + kfree(copy); + return rc; + } + + for (i = 0; i < BLK_RING_SIZE(info); i++) { + /* Not in use? */ + if (!copy[i].request) + continue; + /* - * Flush operations don't contain bios, so - * we need to requeue the whole request + * Get the bios in the request so we can re-queue them. */ - list_add(©[i].request->queuelist, &requests); - continue; + if (copy[i].request->cmd_flags & + (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { + /* + * Flush operations don't contain bios, so + * we need to requeue the whole request + */ + list_add(©[i].request->queuelist, &requests); + continue; + } + merge_bio.head = copy[i].request->bio; + merge_bio.tail = copy[i].request->biotail; + bio_list_merge(&bio_list, &merge_bio); + copy[i].request->bio = NULL; + blk_end_request_all(copy[i].request, 0); } - merge_bio.head = copy[i].request->bio; - merge_bio.tail = copy[i].request->biotail; - bio_list_merge(&bio_list, &merge_bio); - copy[i].request->bio = NULL; - blk_end_request_all(copy[i].request, 0); - } - - kfree(copy); + kfree(copy); + } xenbus_switch_state(info->xbdev, XenbusStateConnected); - spin_lock_irq(&info->io_lock); - /* Now safe for us to use the shared ring */ info->connected = BLKIF_STATE_CONNECTED; - /* Kick any other new requests queued since we resumed */ - kick_pending_request_queues(info); + for (r_index = 0; r_index < info->nr_rings; r_index++) { + struct blkfront_ring_info *rinfo; + + rinfo = &info->rinfo[r_index]; + /* Kick any other new requests queued since we resumed */ + kick_pending_request_queues(rinfo); + } list_for_each_entry_safe(req, n, &requests, queuelist) { /* Requeue pending requests (flush or discard) */ @@ -1725,7 +2076,6 @@ static int blkif_recover(struct blkfront_info *info) BUG_ON(req->nr_phys_segments > segs); blk_mq_requeue_request(req); } - spin_unlock_irq(&info->io_lock); blk_mq_kick_requeue_list(info->rq); while ((bio = bio_list_pop(&bio_list)) != NULL) { @@ -1790,8 +2140,7 @@ static int blkfront_resume(struct xenbus_device *dev) return err; } -static void -blkfront_closing(struct blkfront_info *info) +static void blkfront_closing(struct blkfront_info *info) { struct xenbus_device *xbdev = info->xbdev; struct block_device *bdev = NULL; @@ -1851,18 +2200,29 @@ static void blkfront_setup_discard(struct blkfront_info *info) info->feature_secdiscard = !!discard_secure; } -static int blkfront_setup_indirect(struct blkfront_info *info) +static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo) { unsigned int psegs, grants; int err, i; + struct blkfront_info *info = rinfo->dev_info; - if (info->max_indirect_segments == 0) - grants = BLKIF_MAX_SEGMENTS_PER_REQUEST; + if (info->max_indirect_segments == 0) { + if (!HAS_EXTRA_REQ) + grants = BLKIF_MAX_SEGMENTS_PER_REQUEST; + else { + /* + * When an extra req is required, the maximum + * grants supported is related to the size of the + * Linux block segment. + */ + grants = GRANTS_PER_PSEG; + } + } else grants = info->max_indirect_segments; psegs = grants / GRANTS_PER_PSEG; - err = fill_grant_buffer(info, + err = fill_grant_buffer(rinfo, (grants + INDIRECT_GREFS(grants)) * BLK_RING_SIZE(info)); if (err) goto out_of_memory; @@ -1875,31 +2235,31 @@ static int blkfront_setup_indirect(struct blkfront_info *info) */ int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info); - BUG_ON(!list_empty(&info->indirect_pages)); + BUG_ON(!list_empty(&rinfo->indirect_pages)); for (i = 0; i < num; i++) { struct page *indirect_page = alloc_page(GFP_NOIO); if (!indirect_page) goto out_of_memory; - list_add(&indirect_page->lru, &info->indirect_pages); + list_add(&indirect_page->lru, &rinfo->indirect_pages); } } for (i = 0; i < BLK_RING_SIZE(info); i++) { - info->shadow[i].grants_used = kzalloc( - sizeof(info->shadow[i].grants_used[0]) * grants, + rinfo->shadow[i].grants_used = kzalloc( + sizeof(rinfo->shadow[i].grants_used[0]) * grants, GFP_NOIO); - info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * psegs, GFP_NOIO); + rinfo->shadow[i].sg = kzalloc(sizeof(rinfo->shadow[i].sg[0]) * psegs, GFP_NOIO); if (info->max_indirect_segments) - info->shadow[i].indirect_grants = kzalloc( - sizeof(info->shadow[i].indirect_grants[0]) * + rinfo->shadow[i].indirect_grants = kzalloc( + sizeof(rinfo->shadow[i].indirect_grants[0]) * INDIRECT_GREFS(grants), GFP_NOIO); - if ((info->shadow[i].grants_used == NULL) || - (info->shadow[i].sg == NULL) || + if ((rinfo->shadow[i].grants_used == NULL) || + (rinfo->shadow[i].sg == NULL) || (info->max_indirect_segments && - (info->shadow[i].indirect_grants == NULL))) + (rinfo->shadow[i].indirect_grants == NULL))) goto out_of_memory; - sg_init_table(info->shadow[i].sg, psegs); + sg_init_table(rinfo->shadow[i].sg, psegs); } @@ -1907,16 +2267,16 @@ static int blkfront_setup_indirect(struct blkfront_info *info) out_of_memory: for (i = 0; i < BLK_RING_SIZE(info); i++) { - kfree(info->shadow[i].grants_used); - info->shadow[i].grants_used = NULL; - kfree(info->shadow[i].sg); - info->shadow[i].sg = NULL; - kfree(info->shadow[i].indirect_grants); - info->shadow[i].indirect_grants = NULL; - } - if (!list_empty(&info->indirect_pages)) { + kfree(rinfo->shadow[i].grants_used); + rinfo->shadow[i].grants_used = NULL; + kfree(rinfo->shadow[i].sg); + rinfo->shadow[i].sg = NULL; + kfree(rinfo->shadow[i].indirect_grants); + rinfo->shadow[i].indirect_grants = NULL; + } + if (!list_empty(&rinfo->indirect_pages)) { struct page *indirect_page, *n; - list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) { + list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) { list_del(&indirect_page->lru); __free_page(indirect_page); } @@ -1927,7 +2287,7 @@ out_of_memory: /* * Gather all backend feature-* */ -static int blkfront_gather_backend_features(struct blkfront_info *info) +static void blkfront_gather_backend_features(struct blkfront_info *info) { int err; int barrier, flush, discard, persistent; @@ -1982,8 +2342,6 @@ static int blkfront_gather_backend_features(struct blkfront_info *info) else info->max_indirect_segments = min(indirect_segments, xen_blkif_max_segments); - - return blkfront_setup_indirect(info); } /* @@ -1996,7 +2354,7 @@ static void blkfront_connect(struct blkfront_info *info) unsigned long sector_size; unsigned int physical_sector_size; unsigned int binfo; - int err; + int err, i; switch (info->connected) { case BLKIF_STATE_CONNECTED: @@ -2053,11 +2411,15 @@ static void blkfront_connect(struct blkfront_info *info) if (err != 1) physical_sector_size = sector_size; - err = blkfront_gather_backend_features(info); - if (err) { - xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s", - info->xbdev->otherend); - return; + blkfront_gather_backend_features(info); + for (i = 0; i < info->nr_rings; i++) { + err = blkfront_setup_indirect(&info->rinfo[i]); + if (err) { + xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s", + info->xbdev->otherend); + blkif_free(info, 0); + break; + } } err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size, @@ -2071,10 +2433,9 @@ static void blkfront_connect(struct blkfront_info *info) xenbus_switch_state(info->xbdev, XenbusStateConnected); /* Kick pending requests. */ - spin_lock_irq(&info->io_lock); info->connected = BLKIF_STATE_CONNECTED; - kick_pending_request_queues(info); - spin_unlock_irq(&info->io_lock); + for (i = 0; i < info->nr_rings; i++) + kick_pending_request_queues(&info->rinfo[i]); add_disk(info->gd); @@ -2095,11 +2456,8 @@ static void blkback_changed(struct xenbus_device *dev, case XenbusStateInitWait: if (dev->state != XenbusStateInitialising) break; - if (talk_to_blkback(dev, info)) { - kfree(info); - dev_set_drvdata(&dev->dev, NULL); + if (talk_to_blkback(dev, info)) break; - } case XenbusStateInitialising: case XenbusStateInitialised: case XenbusStateReconfiguring: @@ -2108,6 +2466,10 @@ static void blkback_changed(struct xenbus_device *dev, break; case XenbusStateConnected: + if (dev->state != XenbusStateInitialised) { + if (talk_to_blkback(dev, info)) + break; + } blkfront_connect(info); break; @@ -2281,6 +2643,7 @@ static struct xenbus_driver blkfront_driver = { static int __init xlblk_init(void) { int ret; + int nr_cpus = num_online_cpus(); if (!xen_domain()) return -ENODEV; @@ -2288,7 +2651,13 @@ static int __init xlblk_init(void) if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) { pr_info("Invalid max_ring_order (%d), will use default max: %d.\n", xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER); - xen_blkif_max_ring_order = 0; + xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER; + } + + if (xen_blkif_max_queues > nr_cpus) { + pr_info("Invalid max_queues (%d), will use default max: %d.\n", + xen_blkif_max_queues, nr_cpus); + xen_blkif_max_queues = nr_cpus; } if (!xen_has_pv_disk_devices()) diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile index 7e0f42acb737..a7a0a22cf1a5 100644 --- a/drivers/lightnvm/Makefile +++ b/drivers/lightnvm/Makefile @@ -2,6 +2,6 @@ # Makefile for Open-Channel SSDs. # -obj-$(CONFIG_NVM) := core.o +obj-$(CONFIG_NVM) := core.o sysblk.o obj-$(CONFIG_NVM_GENNVM) += gennvm.o obj-$(CONFIG_NVM_RRPC) += rrpc.o diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 8f41b245cd55..33224cb91c5b 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -27,6 +27,7 @@ #include <linux/module.h> #include <linux/miscdevice.h> #include <linux/lightnvm.h> +#include <linux/sched/sysctl.h> #include <uapi/linux/lightnvm.h> static LIST_HEAD(nvm_targets); @@ -105,6 +106,9 @@ struct nvmm_type *nvm_init_mgr(struct nvm_dev *dev) lockdep_assert_held(&nvm_lock); list_for_each_entry(mt, &nvm_mgrs, list) { + if (strncmp(dev->sb.mmtype, mt->name, NVM_MMTYPE_LEN)) + continue; + ret = mt->register_mgr(dev); if (ret < 0) { pr_err("nvm: media mgr failed to init (%d) on dev %s\n", @@ -166,6 +170,20 @@ static struct nvm_dev *nvm_find_nvm_dev(const char *name) return NULL; } +struct nvm_block *nvm_get_blk_unlocked(struct nvm_dev *dev, struct nvm_lun *lun, + unsigned long flags) +{ + return dev->mt->get_blk_unlocked(dev, lun, flags); +} +EXPORT_SYMBOL(nvm_get_blk_unlocked); + +/* Assumes that all valid pages have already been moved on release to bm */ +void nvm_put_blk_unlocked(struct nvm_dev *dev, struct nvm_block *blk) +{ + return dev->mt->put_blk_unlocked(dev, blk); +} +EXPORT_SYMBOL(nvm_put_blk_unlocked); + struct nvm_block *nvm_get_blk(struct nvm_dev *dev, struct nvm_lun *lun, unsigned long flags) { @@ -192,6 +210,206 @@ int nvm_erase_blk(struct nvm_dev *dev, struct nvm_block *blk) } EXPORT_SYMBOL(nvm_erase_blk); +void nvm_addr_to_generic_mode(struct nvm_dev *dev, struct nvm_rq *rqd) +{ + int i; + + if (rqd->nr_pages > 1) { + for (i = 0; i < rqd->nr_pages; i++) + rqd->ppa_list[i] = dev_to_generic_addr(dev, + rqd->ppa_list[i]); + } else { + rqd->ppa_addr = dev_to_generic_addr(dev, rqd->ppa_addr); + } +} +EXPORT_SYMBOL(nvm_addr_to_generic_mode); + +void nvm_generic_to_addr_mode(struct nvm_dev *dev, struct nvm_rq *rqd) +{ + int i; + + if (rqd->nr_pages > 1) { + for (i = 0; i < rqd->nr_pages; i++) + rqd->ppa_list[i] = generic_to_dev_addr(dev, + rqd->ppa_list[i]); + } else { + rqd->ppa_addr = generic_to_dev_addr(dev, rqd->ppa_addr); + } +} +EXPORT_SYMBOL(nvm_generic_to_addr_mode); + +int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd, + struct ppa_addr *ppas, int nr_ppas) +{ + int i, plane_cnt, pl_idx; + + if (dev->plane_mode == NVM_PLANE_SINGLE && nr_ppas == 1) { + rqd->nr_pages = 1; + rqd->ppa_addr = ppas[0]; + + return 0; + } + + plane_cnt = (1 << dev->plane_mode); + rqd->nr_pages = plane_cnt * nr_ppas; + + if (dev->ops->max_phys_sect < rqd->nr_pages) + return -EINVAL; + + rqd->ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, &rqd->dma_ppa_list); + if (!rqd->ppa_list) { + pr_err("nvm: failed to allocate dma memory\n"); + return -ENOMEM; + } + + for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { + for (i = 0; i < nr_ppas; i++) { + ppas[i].g.pl = pl_idx; + rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppas[i]; + } + } + + return 0; +} +EXPORT_SYMBOL(nvm_set_rqd_ppalist); + +void nvm_free_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd) +{ + if (!rqd->ppa_list) + return; + + nvm_dev_dma_free(dev, rqd->ppa_list, rqd->dma_ppa_list); +} +EXPORT_SYMBOL(nvm_free_rqd_ppalist); + +int nvm_erase_ppa(struct nvm_dev *dev, struct ppa_addr *ppas, int nr_ppas) +{ + struct nvm_rq rqd; + int ret; + + if (!dev->ops->erase_block) + return 0; + + memset(&rqd, 0, sizeof(struct nvm_rq)); + + ret = nvm_set_rqd_ppalist(dev, &rqd, ppas, nr_ppas); + if (ret) + return ret; + + nvm_generic_to_addr_mode(dev, &rqd); + + ret = dev->ops->erase_block(dev, &rqd); + + nvm_free_rqd_ppalist(dev, &rqd); + + return ret; +} +EXPORT_SYMBOL(nvm_erase_ppa); + +void nvm_end_io(struct nvm_rq *rqd, int error) +{ + rqd->error = error; + rqd->end_io(rqd); +} +EXPORT_SYMBOL(nvm_end_io); + +static void nvm_end_io_sync(struct nvm_rq *rqd) +{ + struct completion *waiting = rqd->wait; + + rqd->wait = NULL; + + complete(waiting); +} + +int nvm_submit_ppa(struct nvm_dev *dev, struct ppa_addr *ppa, int nr_ppas, + int opcode, int flags, void *buf, int len) +{ + DECLARE_COMPLETION_ONSTACK(wait); + struct nvm_rq rqd; + struct bio *bio; + int ret; + unsigned long hang_check; + + bio = bio_map_kern(dev->q, buf, len, GFP_KERNEL); + if (IS_ERR_OR_NULL(bio)) + return -ENOMEM; + + memset(&rqd, 0, sizeof(struct nvm_rq)); + ret = nvm_set_rqd_ppalist(dev, &rqd, ppa, nr_ppas); + if (ret) { + bio_put(bio); + return ret; + } + + rqd.opcode = opcode; + rqd.bio = bio; + rqd.wait = &wait; + rqd.dev = dev; + rqd.end_io = nvm_end_io_sync; + rqd.flags = flags; + nvm_generic_to_addr_mode(dev, &rqd); + + ret = dev->ops->submit_io(dev, &rqd); + + /* Prevent hang_check timer from firing at us during very long I/O */ + hang_check = sysctl_hung_task_timeout_secs; + if (hang_check) + while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2))); + else + wait_for_completion_io(&wait); + + nvm_free_rqd_ppalist(dev, &rqd); + + return rqd.error; +} +EXPORT_SYMBOL(nvm_submit_ppa); + +static int nvm_init_slc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp) +{ + int i; + + dev->lps_per_blk = dev->pgs_per_blk; + dev->lptbl = kcalloc(dev->lps_per_blk, sizeof(int), GFP_KERNEL); + if (!dev->lptbl) + return -ENOMEM; + + /* Just a linear array */ + for (i = 0; i < dev->lps_per_blk; i++) + dev->lptbl[i] = i; + + return 0; +} + +static int nvm_init_mlc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp) +{ + int i, p; + struct nvm_id_lp_mlc *mlc = &grp->lptbl.mlc; + + if (!mlc->num_pairs) + return 0; + + dev->lps_per_blk = mlc->num_pairs; + dev->lptbl = kcalloc(dev->lps_per_blk, sizeof(int), GFP_KERNEL); + if (!dev->lptbl) + return -ENOMEM; + + /* The lower page table encoding consists of a list of bytes, where each + * has a lower and an upper half. The first half byte maintains the + * increment value and every value after is an offset added to the + * previous incrementation value */ + dev->lptbl[0] = mlc->pairs[0] & 0xF; + for (i = 1; i < dev->lps_per_blk; i++) { + p = mlc->pairs[i >> 1]; + if (i & 0x1) /* upper */ + dev->lptbl[i] = dev->lptbl[i - 1] + ((p & 0xF0) >> 4); + else /* lower */ + dev->lptbl[i] = dev->lptbl[i - 1] + (p & 0xF); + } + + return 0; +} + static int nvm_core_init(struct nvm_dev *dev) { struct nvm_id *id = &dev->identity; @@ -206,6 +424,7 @@ static int nvm_core_init(struct nvm_dev *dev) dev->sec_size = grp->csecs; dev->oob_size = grp->sos; dev->sec_per_pg = grp->fpg_sz / grp->csecs; + dev->mccap = grp->mccap; memcpy(&dev->ppaf, &id->ppaf, sizeof(struct nvm_addr_format)); dev->plane_mode = NVM_PLANE_SINGLE; @@ -216,11 +435,23 @@ static int nvm_core_init(struct nvm_dev *dev) return -EINVAL; } - if (grp->fmtype != 0 && grp->fmtype != 1) { + switch (grp->fmtype) { + case NVM_ID_FMTYPE_SLC: + if (nvm_init_slc_tbl(dev, grp)) + return -ENOMEM; + break; + case NVM_ID_FMTYPE_MLC: + if (nvm_init_mlc_tbl(dev, grp)) + return -ENOMEM; + break; + default: pr_err("nvm: flash type not supported\n"); return -EINVAL; } + if (!dev->lps_per_blk) + pr_info("nvm: lower page programming table missing\n"); + if (grp->mpos & 0x020202) dev->plane_mode = NVM_PLANE_DOUBLE; if (grp->mpos & 0x040404) @@ -238,6 +469,7 @@ static int nvm_core_init(struct nvm_dev *dev) dev->nr_chnls; dev->total_pages = dev->total_blocks * dev->pgs_per_blk; INIT_LIST_HEAD(&dev->online_targets); + mutex_init(&dev->mlock); return 0; } @@ -249,6 +481,8 @@ static void nvm_free(struct nvm_dev *dev) if (dev->mt) dev->mt->unregister_mgr(dev); + + kfree(dev->lptbl); } static int nvm_init(struct nvm_dev *dev) @@ -338,9 +572,16 @@ int nvm_register(struct request_queue *q, char *disk_name, } } + ret = nvm_get_sysblock(dev, &dev->sb); + if (!ret) + pr_err("nvm: device not initialized.\n"); + else if (ret < 0) + pr_err("nvm: err (%d) on device initialization\n", ret); + /* register device with a supported media manager */ down_write(&nvm_lock); - dev->mt = nvm_init_mgr(dev); + if (ret > 0) + dev->mt = nvm_init_mgr(dev); list_add(&dev->devices, &nvm_devices); up_write(&nvm_lock); @@ -788,6 +1029,97 @@ static long nvm_ioctl_dev_remove(struct file *file, void __user *arg) return __nvm_configure_remove(&remove); } +static void nvm_setup_nvm_sb_info(struct nvm_sb_info *info) +{ + info->seqnr = 1; + info->erase_cnt = 0; + info->version = 1; +} + +static long __nvm_ioctl_dev_init(struct nvm_ioctl_dev_init *init) +{ + struct nvm_dev *dev; + struct nvm_sb_info info; + int ret; + + down_write(&nvm_lock); + dev = nvm_find_nvm_dev(init->dev); + up_write(&nvm_lock); + if (!dev) { + pr_err("nvm: device not found\n"); + return -EINVAL; + } + + nvm_setup_nvm_sb_info(&info); + + strncpy(info.mmtype, init->mmtype, NVM_MMTYPE_LEN); + info.fs_ppa.ppa = -1; + + ret = nvm_init_sysblock(dev, &info); + if (ret) + return ret; + + memcpy(&dev->sb, &info, sizeof(struct nvm_sb_info)); + + down_write(&nvm_lock); + dev->mt = nvm_init_mgr(dev); + up_write(&nvm_lock); + + return 0; +} + +static long nvm_ioctl_dev_init(struct file *file, void __user *arg) +{ + struct nvm_ioctl_dev_init init; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&init, arg, sizeof(struct nvm_ioctl_dev_init))) + return -EFAULT; + + if (init.flags != 0) { + pr_err("nvm: no flags supported\n"); + return -EINVAL; + } + + init.dev[DISK_NAME_LEN - 1] = '\0'; + + return __nvm_ioctl_dev_init(&init); +} + +static long nvm_ioctl_dev_factory(struct file *file, void __user *arg) +{ + struct nvm_ioctl_dev_factory fact; + struct nvm_dev *dev; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&fact, arg, sizeof(struct nvm_ioctl_dev_factory))) + return -EFAULT; + + fact.dev[DISK_NAME_LEN - 1] = '\0'; + + if (fact.flags & ~(NVM_FACTORY_NR_BITS - 1)) + return -EINVAL; + + down_write(&nvm_lock); + dev = nvm_find_nvm_dev(fact.dev); + up_write(&nvm_lock); + if (!dev) { + pr_err("nvm: device not found\n"); + return -EINVAL; + } + + if (dev->mt) { + dev->mt->unregister_mgr(dev); + dev->mt = NULL; + } + + return nvm_dev_factory(dev, fact.flags); +} + static long nvm_ctl_ioctl(struct file *file, uint cmd, unsigned long arg) { void __user *argp = (void __user *)arg; @@ -801,6 +1133,10 @@ static long nvm_ctl_ioctl(struct file *file, uint cmd, unsigned long arg) return nvm_ioctl_dev_create(file, argp); case NVM_DEV_REMOVE: return nvm_ioctl_dev_remove(file, argp); + case NVM_DEV_INIT: + return nvm_ioctl_dev_init(file, argp); + case NVM_DEV_FACTORY: + return nvm_ioctl_dev_factory(file, argp); } return 0; } diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index a54b339951a3..7fb725b16148 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -60,7 +60,8 @@ static int gennvm_luns_init(struct nvm_dev *dev, struct gen_nvm *gn) lun->vlun.lun_id = i % dev->luns_per_chnl; lun->vlun.chnl_id = i / dev->luns_per_chnl; lun->vlun.nr_free_blocks = dev->blks_per_lun; - lun->vlun.nr_inuse_blocks = 0; + lun->vlun.nr_open_blocks = 0; + lun->vlun.nr_closed_blocks = 0; lun->vlun.nr_bad_blocks = 0; } return 0; @@ -89,6 +90,7 @@ static int gennvm_block_bb(struct ppa_addr ppa, int nr_blocks, u8 *blks, list_move_tail(&blk->list, &lun->bb_list); lun->vlun.nr_bad_blocks++; + lun->vlun.nr_free_blocks--; } return 0; @@ -133,15 +135,15 @@ static int gennvm_block_map(u64 slba, u32 nlb, __le64 *entries, void *private) pba = pba - (dev->sec_per_lun * lun_id); blk = &lun->vlun.blocks[div_u64(pba, dev->sec_per_blk)]; - if (!blk->type) { + if (!blk->state) { /* at this point, we don't know anything about the * block. It's up to the FTL on top to re-etablish the - * block state + * block state. The block is assumed to be open. */ list_move_tail(&blk->list, &lun->used_list); - blk->type = 1; + blk->state = NVM_BLK_ST_OPEN; lun->vlun.nr_free_blocks--; - lun->vlun.nr_inuse_blocks++; + lun->vlun.nr_open_blocks++; } } @@ -255,14 +257,14 @@ static void gennvm_unregister(struct nvm_dev *dev) module_put(THIS_MODULE); } -static struct nvm_block *gennvm_get_blk(struct nvm_dev *dev, +static struct nvm_block *gennvm_get_blk_unlocked(struct nvm_dev *dev, struct nvm_lun *vlun, unsigned long flags) { struct gen_lun *lun = container_of(vlun, struct gen_lun, vlun); struct nvm_block *blk = NULL; int is_gc = flags & NVM_IOTYPE_GC; - spin_lock(&vlun->lock); + assert_spin_locked(&vlun->lock); if (list_empty(&lun->free_list)) { pr_err_ratelimited("gennvm: lun %u have no free pages available", @@ -275,83 +277,64 @@ static struct nvm_block *gennvm_get_blk(struct nvm_dev *dev, blk = list_first_entry(&lun->free_list, struct nvm_block, list); list_move_tail(&blk->list, &lun->used_list); - blk->type = 1; + blk->state = NVM_BLK_ST_OPEN; lun->vlun.nr_free_blocks--; - lun->vlun.nr_inuse_blocks++; + lun->vlun.nr_open_blocks++; out: + return blk; +} + +static struct nvm_block *gennvm_get_blk(struct nvm_dev *dev, + struct nvm_lun *vlun, unsigned long flags) +{ + struct nvm_block *blk; + + spin_lock(&vlun->lock); + blk = gennvm_get_blk_unlocked(dev, vlun, flags); spin_unlock(&vlun->lock); return blk; } -static void gennvm_put_blk(struct nvm_dev *dev, struct nvm_block *blk) +static void gennvm_put_blk_unlocked(struct nvm_dev *dev, struct nvm_block *blk) { struct nvm_lun *vlun = blk->lun; struct gen_lun *lun = container_of(vlun, struct gen_lun, vlun); - spin_lock(&vlun->lock); + assert_spin_locked(&vlun->lock); - switch (blk->type) { - case 1: + if (blk->state & NVM_BLK_ST_OPEN) { list_move_tail(&blk->list, &lun->free_list); + lun->vlun.nr_open_blocks--; lun->vlun.nr_free_blocks++; - lun->vlun.nr_inuse_blocks--; - blk->type = 0; - break; - case 2: + blk->state = NVM_BLK_ST_FREE; + } else if (blk->state & NVM_BLK_ST_CLOSED) { + list_move_tail(&blk->list, &lun->free_list); + lun->vlun.nr_closed_blocks--; + lun->vlun.nr_free_blocks++; + blk->state = NVM_BLK_ST_FREE; + } else if (blk->state & NVM_BLK_ST_BAD) { list_move_tail(&blk->list, &lun->bb_list); lun->vlun.nr_bad_blocks++; - lun->vlun.nr_inuse_blocks--; - break; - default: + blk->state = NVM_BLK_ST_BAD; + } else { WARN_ON_ONCE(1); pr_err("gennvm: erroneous block type (%lu -> %u)\n", - blk->id, blk->type); + blk->id, blk->state); list_move_tail(&blk->list, &lun->bb_list); lun->vlun.nr_bad_blocks++; - lun->vlun.nr_inuse_blocks--; - } - - spin_unlock(&vlun->lock); -} - -static void gennvm_addr_to_generic_mode(struct nvm_dev *dev, struct nvm_rq *rqd) -{ - int i; - - if (rqd->nr_pages > 1) { - for (i = 0; i < rqd->nr_pages; i++) - rqd->ppa_list[i] = dev_to_generic_addr(dev, - rqd->ppa_list[i]); - } else { - rqd->ppa_addr = dev_to_generic_addr(dev, rqd->ppa_addr); + blk->state = NVM_BLK_ST_BAD; } } -static void gennvm_generic_to_addr_mode(struct nvm_dev *dev, struct nvm_rq *rqd) -{ - int i; - - if (rqd->nr_pages > 1) { - for (i = 0; i < rqd->nr_pages; i++) - rqd->ppa_list[i] = generic_to_dev_addr(dev, - rqd->ppa_list[i]); - } else { - rqd->ppa_addr = generic_to_dev_addr(dev, rqd->ppa_addr); - } -} - -static int gennvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) +static void gennvm_put_blk(struct nvm_dev *dev, struct nvm_block *blk) { - if (!dev->ops->submit_io) - return 0; - - /* Convert address space */ - gennvm_generic_to_addr_mode(dev, rqd); + struct nvm_lun *vlun = blk->lun; - rqd->dev = dev; - return dev->ops->submit_io(dev, rqd); + spin_lock(&vlun->lock); + gennvm_put_blk_unlocked(dev, blk); + spin_unlock(&vlun->lock); } static void gennvm_blk_set_type(struct nvm_dev *dev, struct ppa_addr *ppa, @@ -376,7 +359,7 @@ static void gennvm_blk_set_type(struct nvm_dev *dev, struct ppa_addr *ppa, blk = &lun->vlun.blocks[ppa->g.blk]; /* will be moved to bb list on put_blk from target */ - blk->type = type; + blk->state = type; } /* mark block bad. It is expected the target recover from the error. */ @@ -390,77 +373,51 @@ static void gennvm_mark_blk_bad(struct nvm_dev *dev, struct nvm_rq *rqd) if (dev->ops->set_bb_tbl(dev, rqd, 1)) return; - gennvm_addr_to_generic_mode(dev, rqd); + nvm_addr_to_generic_mode(dev, rqd); /* look up blocks and mark them as bad */ if (rqd->nr_pages > 1) for (i = 0; i < rqd->nr_pages; i++) - gennvm_blk_set_type(dev, &rqd->ppa_list[i], 2); + gennvm_blk_set_type(dev, &rqd->ppa_list[i], + NVM_BLK_ST_BAD); else - gennvm_blk_set_type(dev, &rqd->ppa_addr, 2); + gennvm_blk_set_type(dev, &rqd->ppa_addr, NVM_BLK_ST_BAD); } -static int gennvm_end_io(struct nvm_rq *rqd, int error) +static void gennvm_end_io(struct nvm_rq *rqd) { struct nvm_tgt_instance *ins = rqd->ins; - int ret = 0; - switch (error) { + switch (rqd->error) { case NVM_RSP_SUCCESS: - break; case NVM_RSP_ERR_EMPTYPAGE: break; case NVM_RSP_ERR_FAILWRITE: gennvm_mark_blk_bad(rqd->dev, rqd); - default: - ret++; } - ret += ins->tt->end_io(rqd, error); - - return ret; + ins->tt->end_io(rqd); } -static int gennvm_erase_blk(struct nvm_dev *dev, struct nvm_block *blk, - unsigned long flags) +static int gennvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) { - int plane_cnt = 0, pl_idx, ret; - struct ppa_addr addr; - struct nvm_rq rqd; - - if (!dev->ops->erase_block) - return 0; - - addr = block_to_ppa(dev, blk); - - if (dev->plane_mode == NVM_PLANE_SINGLE) { - rqd.nr_pages = 1; - rqd.ppa_addr = addr; - } else { - plane_cnt = (1 << dev->plane_mode); - rqd.nr_pages = plane_cnt; - - rqd.ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, - &rqd.dma_ppa_list); - if (!rqd.ppa_list) { - pr_err("gennvm: failed to allocate dma memory\n"); - return -ENOMEM; - } - - for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { - addr.g.pl = pl_idx; - rqd.ppa_list[pl_idx] = addr; - } - } + if (!dev->ops->submit_io) + return -ENODEV; - gennvm_generic_to_addr_mode(dev, &rqd); + /* Convert address space */ + nvm_generic_to_addr_mode(dev, rqd); - ret = dev->ops->erase_block(dev, &rqd); + rqd->dev = dev; + rqd->end_io = gennvm_end_io; + return dev->ops->submit_io(dev, rqd); +} - if (plane_cnt) - nvm_dev_dma_free(dev, rqd.ppa_list, rqd.dma_ppa_list); +static int gennvm_erase_blk(struct nvm_dev *dev, struct nvm_block *blk, + unsigned long flags) +{ + struct ppa_addr addr = block_to_ppa(dev, blk); - return ret; + return nvm_erase_ppa(dev, &addr, 1); } static struct nvm_lun *gennvm_get_lun(struct nvm_dev *dev, int lunid) @@ -480,10 +437,11 @@ static void gennvm_lun_info_print(struct nvm_dev *dev) gennvm_for_each_lun(gn, lun, i) { spin_lock(&lun->vlun.lock); - pr_info("%s: lun%8u\t%u\t%u\t%u\n", + pr_info("%s: lun%8u\t%u\t%u\t%u\t%u\n", dev->name, i, lun->vlun.nr_free_blocks, - lun->vlun.nr_inuse_blocks, + lun->vlun.nr_open_blocks, + lun->vlun.nr_closed_blocks, lun->vlun.nr_bad_blocks); spin_unlock(&lun->vlun.lock); @@ -491,21 +449,23 @@ static void gennvm_lun_info_print(struct nvm_dev *dev) } static struct nvmm_type gennvm = { - .name = "gennvm", - .version = {0, 1, 0}, + .name = "gennvm", + .version = {0, 1, 0}, + + .register_mgr = gennvm_register, + .unregister_mgr = gennvm_unregister, - .register_mgr = gennvm_register, - .unregister_mgr = gennvm_unregister, + .get_blk_unlocked = gennvm_get_blk_unlocked, + .put_blk_unlocked = gennvm_put_blk_unlocked, - .get_blk = gennvm_get_blk, - .put_blk = gennvm_put_blk, + .get_blk = gennvm_get_blk, + .put_blk = gennvm_put_blk, - .submit_io = gennvm_submit_io, - .end_io = gennvm_end_io, - .erase_blk = gennvm_erase_blk, + .submit_io = gennvm_submit_io, + .erase_blk = gennvm_erase_blk, - .get_lun = gennvm_get_lun, - .lun_info_print = gennvm_lun_info_print, + .get_lun = gennvm_get_lun, + .lun_info_print = gennvm_lun_info_print, }; static int __init gennvm_module_init(void) diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index 134e4faba482..d8c75958ced3 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -179,16 +179,23 @@ static void rrpc_set_lun_cur(struct rrpc_lun *rlun, struct rrpc_block *rblk) static struct rrpc_block *rrpc_get_blk(struct rrpc *rrpc, struct rrpc_lun *rlun, unsigned long flags) { + struct nvm_lun *lun = rlun->parent; struct nvm_block *blk; struct rrpc_block *rblk; - blk = nvm_get_blk(rrpc->dev, rlun->parent, flags); - if (!blk) + spin_lock(&lun->lock); + blk = nvm_get_blk_unlocked(rrpc->dev, rlun->parent, flags); + if (!blk) { + pr_err("nvm: rrpc: cannot get new block from media manager\n"); + spin_unlock(&lun->lock); return NULL; + } rblk = &rlun->blocks[blk->id]; - blk->priv = rblk; + list_add_tail(&rblk->list, &rlun->open_list); + spin_unlock(&lun->lock); + blk->priv = rblk; bitmap_zero(rblk->invalid_pages, rrpc->dev->pgs_per_blk); rblk->next_page = 0; rblk->nr_invalid_pages = 0; @@ -199,7 +206,13 @@ static struct rrpc_block *rrpc_get_blk(struct rrpc *rrpc, struct rrpc_lun *rlun, static void rrpc_put_blk(struct rrpc *rrpc, struct rrpc_block *rblk) { - nvm_put_blk(rrpc->dev, rblk->parent); + struct rrpc_lun *rlun = rblk->rlun; + struct nvm_lun *lun = rlun->parent; + + spin_lock(&lun->lock); + nvm_put_blk_unlocked(rrpc->dev, rblk->parent); + list_del(&rblk->list); + spin_unlock(&lun->lock); } static void rrpc_put_blks(struct rrpc *rrpc) @@ -287,6 +300,8 @@ static int rrpc_move_valid_pages(struct rrpc *rrpc, struct rrpc_block *rblk) } page = mempool_alloc(rrpc->page_pool, GFP_NOIO); + if (!page) + return -ENOMEM; while ((slot = find_first_zero_bit(rblk->invalid_pages, nr_pgs_per_blk)) < nr_pgs_per_blk) { @@ -328,6 +343,10 @@ try: goto finished; } wait_for_completion_io(&wait); + if (bio->bi_error) { + rrpc_inflight_laddr_release(rrpc, rqd); + goto finished; + } bio_reset(bio); reinit_completion(&wait); @@ -350,6 +369,8 @@ try: wait_for_completion_io(&wait); rrpc_inflight_laddr_release(rrpc, rqd); + if (bio->bi_error) + goto finished; bio_reset(bio); } @@ -373,16 +394,26 @@ static void rrpc_block_gc(struct work_struct *work) struct rrpc *rrpc = gcb->rrpc; struct rrpc_block *rblk = gcb->rblk; struct nvm_dev *dev = rrpc->dev; + struct nvm_lun *lun = rblk->parent->lun; + struct rrpc_lun *rlun = &rrpc->luns[lun->id - rrpc->lun_offset]; + mempool_free(gcb, rrpc->gcb_pool); pr_debug("nvm: block '%lu' being reclaimed\n", rblk->parent->id); if (rrpc_move_valid_pages(rrpc, rblk)) - goto done; + goto put_back; + + if (nvm_erase_blk(dev, rblk->parent)) + goto put_back; - nvm_erase_blk(dev, rblk->parent); rrpc_put_blk(rrpc, rblk); -done: - mempool_free(gcb, rrpc->gcb_pool); + + return; + +put_back: + spin_lock(&rlun->lock); + list_add_tail(&rblk->prio, &rlun->prio_list); + spin_unlock(&rlun->lock); } /* the block with highest number of invalid pages, will be in the beginning @@ -427,7 +458,7 @@ static void rrpc_lun_gc(struct work_struct *work) if (nr_blocks_need < rrpc->nr_luns) nr_blocks_need = rrpc->nr_luns; - spin_lock(&lun->lock); + spin_lock(&rlun->lock); while (nr_blocks_need > lun->nr_free_blocks && !list_empty(&rlun->prio_list)) { struct rrpc_block *rblock = block_prio_find_max(rlun); @@ -436,16 +467,16 @@ static void rrpc_lun_gc(struct work_struct *work) if (!rblock->nr_invalid_pages) break; + gcb = mempool_alloc(rrpc->gcb_pool, GFP_ATOMIC); + if (!gcb) + break; + list_del_init(&rblock->prio); BUG_ON(!block_is_full(rrpc, rblock)); pr_debug("rrpc: selected block '%lu' for GC\n", block->id); - gcb = mempool_alloc(rrpc->gcb_pool, GFP_ATOMIC); - if (!gcb) - break; - gcb->rrpc = rrpc; gcb->rblk = rblock; INIT_WORK(&gcb->ws_gc, rrpc_block_gc); @@ -454,7 +485,7 @@ static void rrpc_lun_gc(struct work_struct *work) nr_blocks_need--; } - spin_unlock(&lun->lock); + spin_unlock(&rlun->lock); /* TODO: Hint that request queue can be started again */ } @@ -635,12 +666,24 @@ static void rrpc_end_io_write(struct rrpc *rrpc, struct rrpc_rq *rrqd, lun = rblk->parent->lun; cmnt_size = atomic_inc_return(&rblk->data_cmnt_size); - if (unlikely(cmnt_size == rrpc->dev->pgs_per_blk)) + if (unlikely(cmnt_size == rrpc->dev->pgs_per_blk)) { + struct nvm_block *blk = rblk->parent; + struct rrpc_lun *rlun = rblk->rlun; + + spin_lock(&lun->lock); + lun->nr_open_blocks--; + lun->nr_closed_blocks++; + blk->state &= ~NVM_BLK_ST_OPEN; + blk->state |= NVM_BLK_ST_CLOSED; + list_move_tail(&rblk->list, &rlun->closed_list); + spin_unlock(&lun->lock); + rrpc_run_gc(rrpc, rblk); + } } } -static int rrpc_end_io(struct nvm_rq *rqd, int error) +static void rrpc_end_io(struct nvm_rq *rqd) { struct rrpc *rrpc = container_of(rqd->ins, struct rrpc, instance); struct rrpc_rq *rrqd = nvm_rq_to_pdu(rqd); @@ -650,11 +693,12 @@ static int rrpc_end_io(struct nvm_rq *rqd, int error) if (bio_data_dir(rqd->bio) == WRITE) rrpc_end_io_write(rrpc, rrqd, laddr, npages); + bio_put(rqd->bio); + if (rrqd->flags & NVM_IOTYPE_GC) - return 0; + return; rrpc_unlock_rq(rrpc, rqd); - bio_put(rqd->bio); if (npages > 1) nvm_dev_dma_free(rrpc->dev, rqd->ppa_list, rqd->dma_ppa_list); @@ -662,8 +706,6 @@ static int rrpc_end_io(struct nvm_rq *rqd, int error) nvm_dev_dma_free(rrpc->dev, rqd->metadata, rqd->dma_metadata); mempool_free(rqd, rrpc->rq_pool); - - return 0; } static int rrpc_read_ppalist_rq(struct rrpc *rrpc, struct bio *bio, @@ -841,6 +883,13 @@ static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio, err = nvm_submit_io(rrpc->dev, rqd); if (err) { pr_err("rrpc: I/O submission failed: %d\n", err); + bio_put(bio); + if (!(flags & NVM_IOTYPE_GC)) { + rrpc_unlock_rq(rrpc, rqd); + if (rqd->nr_pages > 1) + nvm_dev_dma_free(rrpc->dev, + rqd->ppa_list, rqd->dma_ppa_list); + } return NVM_IO_ERR; } @@ -1090,6 +1139,11 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end) struct rrpc_lun *rlun; int i, j; + if (dev->pgs_per_blk > MAX_INVALID_PAGES_STORAGE * BITS_PER_LONG) { + pr_err("rrpc: number of pages per block too high."); + return -EINVAL; + } + spin_lock_init(&rrpc->rev_lock); rrpc->luns = kcalloc(rrpc->nr_luns, sizeof(struct rrpc_lun), @@ -1101,16 +1155,13 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end) for (i = 0; i < rrpc->nr_luns; i++) { struct nvm_lun *lun = dev->mt->get_lun(dev, lun_begin + i); - if (dev->pgs_per_blk > - MAX_INVALID_PAGES_STORAGE * BITS_PER_LONG) { - pr_err("rrpc: number of pages per block too high."); - goto err; - } - rlun = &rrpc->luns[i]; rlun->rrpc = rrpc; rlun->parent = lun; INIT_LIST_HEAD(&rlun->prio_list); + INIT_LIST_HEAD(&rlun->open_list); + INIT_LIST_HEAD(&rlun->closed_list); + INIT_WORK(&rlun->ws_gc, rrpc_lun_gc); spin_lock_init(&rlun->lock); @@ -1127,6 +1178,7 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end) struct nvm_block *blk = &lun->blocks[j]; rblk->parent = blk; + rblk->rlun = rlun; INIT_LIST_HEAD(&rblk->prio); spin_lock_init(&rblk->lock); } diff --git a/drivers/lightnvm/rrpc.h b/drivers/lightnvm/rrpc.h index a9696a06c38c..ef13ac7700c8 100644 --- a/drivers/lightnvm/rrpc.h +++ b/drivers/lightnvm/rrpc.h @@ -54,7 +54,9 @@ struct rrpc_rq { struct rrpc_block { struct nvm_block *parent; + struct rrpc_lun *rlun; struct list_head prio; + struct list_head list; #define MAX_INVALID_PAGES_STORAGE 8 /* Bitmap for invalid page intries */ @@ -73,7 +75,16 @@ struct rrpc_lun { struct nvm_lun *parent; struct rrpc_block *cur, *gc_cur; struct rrpc_block *blocks; /* Reference to block allocation */ - struct list_head prio_list; /* Blocks that may be GC'ed */ + + struct list_head prio_list; /* Blocks that may be GC'ed */ + struct list_head open_list; /* In-use open blocks. These are blocks + * that can be both written to and read + * from + */ + struct list_head closed_list; /* In-use closed blocks. These are + * blocks that can _only_ be read from + */ + struct work_struct ws_gc; spinlock_t lock; diff --git a/drivers/lightnvm/sysblk.c b/drivers/lightnvm/sysblk.c new file mode 100644 index 000000000000..321de1f154c5 --- /dev/null +++ b/drivers/lightnvm/sysblk.c @@ -0,0 +1,741 @@ +/* + * Copyright (C) 2015 Matias Bjorling. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, + * USA. + * + */ + +#include <linux/lightnvm.h> + +#define MAX_SYSBLKS 3 /* remember to update mapping scheme on change */ +#define MAX_BLKS_PR_SYSBLK 2 /* 2 blks with 256 pages and 3000 erases + * enables ~1.5M updates per sysblk unit + */ + +struct sysblk_scan { + /* A row is a collection of flash blocks for a system block. */ + int nr_rows; + int row; + int act_blk[MAX_SYSBLKS]; + + int nr_ppas; + struct ppa_addr ppas[MAX_SYSBLKS * MAX_BLKS_PR_SYSBLK];/* all sysblks */ +}; + +static inline int scan_ppa_idx(int row, int blkid) +{ + return (row * MAX_BLKS_PR_SYSBLK) + blkid; +} + +void nvm_sysblk_to_cpu(struct nvm_sb_info *info, struct nvm_system_block *sb) +{ + info->seqnr = be32_to_cpu(sb->seqnr); + info->erase_cnt = be32_to_cpu(sb->erase_cnt); + info->version = be16_to_cpu(sb->version); + strncpy(info->mmtype, sb->mmtype, NVM_MMTYPE_LEN); + info->fs_ppa.ppa = be64_to_cpu(sb->fs_ppa); +} + +void nvm_cpu_to_sysblk(struct nvm_system_block *sb, struct nvm_sb_info *info) +{ + sb->magic = cpu_to_be32(NVM_SYSBLK_MAGIC); + sb->seqnr = cpu_to_be32(info->seqnr); + sb->erase_cnt = cpu_to_be32(info->erase_cnt); + sb->version = cpu_to_be16(info->version); + strncpy(sb->mmtype, info->mmtype, NVM_MMTYPE_LEN); + sb->fs_ppa = cpu_to_be64(info->fs_ppa.ppa); +} + +static int nvm_setup_sysblks(struct nvm_dev *dev, struct ppa_addr *sysblk_ppas) +{ + int nr_rows = min_t(int, MAX_SYSBLKS, dev->nr_chnls); + int i; + + for (i = 0; i < nr_rows; i++) + sysblk_ppas[i].ppa = 0; + + /* if possible, place sysblk at first channel, middle channel and last + * channel of the device. If not, create only one or two sys blocks + */ + switch (dev->nr_chnls) { + case 2: + sysblk_ppas[1].g.ch = 1; + /* fall-through */ + case 1: + sysblk_ppas[0].g.ch = 0; + break; + default: + sysblk_ppas[0].g.ch = 0; + sysblk_ppas[1].g.ch = dev->nr_chnls / 2; + sysblk_ppas[2].g.ch = dev->nr_chnls - 1; + break; + } + + return nr_rows; +} + +void nvm_setup_sysblk_scan(struct nvm_dev *dev, struct sysblk_scan *s, + struct ppa_addr *sysblk_ppas) +{ + memset(s, 0, sizeof(struct sysblk_scan)); + s->nr_rows = nvm_setup_sysblks(dev, sysblk_ppas); +} + +static int sysblk_get_host_blks(struct ppa_addr ppa, int nr_blks, u8 *blks, + void *private) +{ + struct sysblk_scan *s = private; + int i, nr_sysblk = 0; + + for (i = 0; i < nr_blks; i++) { + if (blks[i] != NVM_BLK_T_HOST) + continue; + + if (s->nr_ppas == MAX_BLKS_PR_SYSBLK * MAX_SYSBLKS) { + pr_err("nvm: too many host blks\n"); + return -EINVAL; + } + + ppa.g.blk = i; + + s->ppas[scan_ppa_idx(s->row, nr_sysblk)] = ppa; + s->nr_ppas++; + nr_sysblk++; + } + + return 0; +} + +static int nvm_get_all_sysblks(struct nvm_dev *dev, struct sysblk_scan *s, + struct ppa_addr *ppas, nvm_bb_update_fn *fn) +{ + struct ppa_addr dppa; + int i, ret; + + s->nr_ppas = 0; + + for (i = 0; i < s->nr_rows; i++) { + dppa = generic_to_dev_addr(dev, ppas[i]); + s->row = i; + + ret = dev->ops->get_bb_tbl(dev, dppa, dev->blks_per_lun, fn, s); + if (ret) { + pr_err("nvm: failed bb tbl for ppa (%u %u)\n", + ppas[i].g.ch, + ppas[i].g.blk); + return ret; + } + } + + return ret; +} + +/* + * scans a block for latest sysblk. + * Returns: + * 0 - newer sysblk not found. PPA is updated to latest page. + * 1 - newer sysblk found and stored in *cur. PPA is updated to + * next valid page. + * <0- error. + */ +static int nvm_scan_block(struct nvm_dev *dev, struct ppa_addr *ppa, + struct nvm_system_block *sblk) +{ + struct nvm_system_block *cur; + int pg, cursz, ret, found = 0; + + /* the full buffer for a flash page is allocated. Only the first of it + * contains the system block information + */ + cursz = dev->sec_size * dev->sec_per_pg * dev->nr_planes; + cur = kmalloc(cursz, GFP_KERNEL); + if (!cur) + return -ENOMEM; + + /* perform linear scan through the block */ + for (pg = 0; pg < dev->lps_per_blk; pg++) { + ppa->g.pg = ppa_to_slc(dev, pg); + + ret = nvm_submit_ppa(dev, ppa, 1, NVM_OP_PREAD, NVM_IO_SLC_MODE, + cur, cursz); + if (ret) { + if (ret == NVM_RSP_ERR_EMPTYPAGE) { + pr_debug("nvm: sysblk scan empty ppa (%u %u %u %u)\n", + ppa->g.ch, + ppa->g.lun, + ppa->g.blk, + ppa->g.pg); + break; + } + pr_err("nvm: read failed (%x) for ppa (%u %u %u %u)", + ret, + ppa->g.ch, + ppa->g.lun, + ppa->g.blk, + ppa->g.pg); + break; /* if we can't read a page, continue to the + * next blk + */ + } + + if (be32_to_cpu(cur->magic) != NVM_SYSBLK_MAGIC) { + pr_debug("nvm: scan break for ppa (%u %u %u %u)\n", + ppa->g.ch, + ppa->g.lun, + ppa->g.blk, + ppa->g.pg); + break; /* last valid page already found */ + } + + if (be32_to_cpu(cur->seqnr) < be32_to_cpu(sblk->seqnr)) + continue; + + memcpy(sblk, cur, sizeof(struct nvm_system_block)); + found = 1; + } + + kfree(cur); + + return found; +} + +static int nvm_set_bb_tbl(struct nvm_dev *dev, struct sysblk_scan *s, int type) +{ + struct nvm_rq rqd; + int ret; + + if (s->nr_ppas > dev->ops->max_phys_sect) { + pr_err("nvm: unable to update all sysblocks atomically\n"); + return -EINVAL; + } + + memset(&rqd, 0, sizeof(struct nvm_rq)); + + nvm_set_rqd_ppalist(dev, &rqd, s->ppas, s->nr_ppas); + nvm_generic_to_addr_mode(dev, &rqd); + + ret = dev->ops->set_bb_tbl(dev, &rqd, type); + nvm_free_rqd_ppalist(dev, &rqd); + if (ret) { + pr_err("nvm: sysblk failed bb mark\n"); + return -EINVAL; + } + + return 0; +} + +static int sysblk_get_free_blks(struct ppa_addr ppa, int nr_blks, u8 *blks, + void *private) +{ + struct sysblk_scan *s = private; + struct ppa_addr *sppa; + int i, blkid = 0; + + for (i = 0; i < nr_blks; i++) { + if (blks[i] == NVM_BLK_T_HOST) + return -EEXIST; + + if (blks[i] != NVM_BLK_T_FREE) + continue; + + sppa = &s->ppas[scan_ppa_idx(s->row, blkid)]; + sppa->g.ch = ppa.g.ch; + sppa->g.lun = ppa.g.lun; + sppa->g.blk = i; + s->nr_ppas++; + blkid++; + + pr_debug("nvm: use (%u %u %u) as sysblk\n", + sppa->g.ch, sppa->g.lun, sppa->g.blk); + if (blkid > MAX_BLKS_PR_SYSBLK - 1) + return 0; + } + + pr_err("nvm: sysblk failed get sysblk\n"); + return -EINVAL; +} + +static int nvm_write_and_verify(struct nvm_dev *dev, struct nvm_sb_info *info, + struct sysblk_scan *s) +{ + struct nvm_system_block nvmsb; + void *buf; + int i, sect, ret, bufsz; + struct ppa_addr *ppas; + + nvm_cpu_to_sysblk(&nvmsb, info); + + /* buffer for flash page */ + bufsz = dev->sec_size * dev->sec_per_pg * dev->nr_planes; + buf = kzalloc(bufsz, GFP_KERNEL); + if (!buf) + return -ENOMEM; + memcpy(buf, &nvmsb, sizeof(struct nvm_system_block)); + + ppas = kcalloc(dev->sec_per_pg, sizeof(struct ppa_addr), GFP_KERNEL); + if (!ppas) { + ret = -ENOMEM; + goto err; + } + + /* Write and verify */ + for (i = 0; i < s->nr_rows; i++) { + ppas[0] = s->ppas[scan_ppa_idx(i, s->act_blk[i])]; + + pr_debug("nvm: writing sysblk to ppa (%u %u %u %u)\n", + ppas[0].g.ch, + ppas[0].g.lun, + ppas[0].g.blk, + ppas[0].g.pg); + + /* Expand to all sectors within a flash page */ + if (dev->sec_per_pg > 1) { + for (sect = 1; sect < dev->sec_per_pg; sect++) { + ppas[sect].ppa = ppas[0].ppa; + ppas[sect].g.sec = sect; + } + } + + ret = nvm_submit_ppa(dev, ppas, dev->sec_per_pg, NVM_OP_PWRITE, + NVM_IO_SLC_MODE, buf, bufsz); + if (ret) { + pr_err("nvm: sysblk failed program (%u %u %u)\n", + ppas[0].g.ch, + ppas[0].g.lun, + ppas[0].g.blk); + break; + } + + ret = nvm_submit_ppa(dev, ppas, dev->sec_per_pg, NVM_OP_PREAD, + NVM_IO_SLC_MODE, buf, bufsz); + if (ret) { + pr_err("nvm: sysblk failed read (%u %u %u)\n", + ppas[0].g.ch, + ppas[0].g.lun, + ppas[0].g.blk); + break; + } + + if (memcmp(buf, &nvmsb, sizeof(struct nvm_system_block))) { + pr_err("nvm: sysblk failed verify (%u %u %u)\n", + ppas[0].g.ch, + ppas[0].g.lun, + ppas[0].g.blk); + ret = -EINVAL; + break; + } + } + + kfree(ppas); +err: + kfree(buf); + + return ret; +} + +static int nvm_prepare_new_sysblks(struct nvm_dev *dev, struct sysblk_scan *s) +{ + int i, ret; + unsigned long nxt_blk; + struct ppa_addr *ppa; + + for (i = 0; i < s->nr_rows; i++) { + nxt_blk = (s->act_blk[i] + 1) % MAX_BLKS_PR_SYSBLK; + ppa = &s->ppas[scan_ppa_idx(i, nxt_blk)]; + ppa->g.pg = ppa_to_slc(dev, 0); + + ret = nvm_erase_ppa(dev, ppa, 1); + if (ret) + return ret; + + s->act_blk[i] = nxt_blk; + } + + return 0; +} + +int nvm_get_sysblock(struct nvm_dev *dev, struct nvm_sb_info *info) +{ + struct ppa_addr sysblk_ppas[MAX_SYSBLKS]; + struct sysblk_scan s; + struct nvm_system_block *cur; + int i, j, found = 0; + int ret = -ENOMEM; + + /* + * 1. setup sysblk locations + * 2. get bad block list + * 3. filter on host-specific (type 3) + * 4. iterate through all and find the highest seq nr. + * 5. return superblock information + */ + + if (!dev->ops->get_bb_tbl) + return -EINVAL; + + nvm_setup_sysblk_scan(dev, &s, sysblk_ppas); + + mutex_lock(&dev->mlock); + ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, sysblk_get_host_blks); + if (ret) + goto err_sysblk; + + /* no sysblocks initialized */ + if (!s.nr_ppas) + goto err_sysblk; + + cur = kzalloc(sizeof(struct nvm_system_block), GFP_KERNEL); + if (!cur) + goto err_sysblk; + + /* find the latest block across all sysblocks */ + for (i = 0; i < s.nr_rows; i++) { + for (j = 0; j < MAX_BLKS_PR_SYSBLK; j++) { + struct ppa_addr ppa = s.ppas[scan_ppa_idx(i, j)]; + + ret = nvm_scan_block(dev, &ppa, cur); + if (ret > 0) + found = 1; + else if (ret < 0) + break; + } + } + + nvm_sysblk_to_cpu(info, cur); + + kfree(cur); +err_sysblk: + mutex_unlock(&dev->mlock); + + if (found) + return 1; + return ret; +} + +int nvm_update_sysblock(struct nvm_dev *dev, struct nvm_sb_info *new) +{ + /* 1. for each latest superblock + * 2. if room + * a. write new flash page entry with the updated information + * 3. if no room + * a. find next available block on lun (linear search) + * if none, continue to next lun + * if none at all, report error. also report that it wasn't + * possible to write to all superblocks. + * c. write data to block. + */ + struct ppa_addr sysblk_ppas[MAX_SYSBLKS]; + struct sysblk_scan s; + struct nvm_system_block *cur; + int i, j, ppaidx, found = 0; + int ret = -ENOMEM; + + if (!dev->ops->get_bb_tbl) + return -EINVAL; + + nvm_setup_sysblk_scan(dev, &s, sysblk_ppas); + + mutex_lock(&dev->mlock); + ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, sysblk_get_host_blks); + if (ret) + goto err_sysblk; + + cur = kzalloc(sizeof(struct nvm_system_block), GFP_KERNEL); + if (!cur) + goto err_sysblk; + + /* Get the latest sysblk for each sysblk row */ + for (i = 0; i < s.nr_rows; i++) { + found = 0; + for (j = 0; j < MAX_BLKS_PR_SYSBLK; j++) { + ppaidx = scan_ppa_idx(i, j); + ret = nvm_scan_block(dev, &s.ppas[ppaidx], cur); + if (ret > 0) { + s.act_blk[i] = j; + found = 1; + } else if (ret < 0) + break; + } + } + + if (!found) { + pr_err("nvm: no valid sysblks found to update\n"); + ret = -EINVAL; + goto err_cur; + } + + /* + * All sysblocks found. Check that they have same page id in their flash + * blocks + */ + for (i = 1; i < s.nr_rows; i++) { + struct ppa_addr l = s.ppas[scan_ppa_idx(0, s.act_blk[0])]; + struct ppa_addr r = s.ppas[scan_ppa_idx(i, s.act_blk[i])]; + + if (l.g.pg != r.g.pg) { + pr_err("nvm: sysblks not on same page. Previous update failed.\n"); + ret = -EINVAL; + goto err_cur; + } + } + + /* + * Check that there haven't been another update to the seqnr since we + * began + */ + if ((new->seqnr - 1) != be32_to_cpu(cur->seqnr)) { + pr_err("nvm: seq is not sequential\n"); + ret = -EINVAL; + goto err_cur; + } + + /* + * When all pages in a block has been written, a new block is selected + * and writing is performed on the new block. + */ + if (s.ppas[scan_ppa_idx(0, s.act_blk[0])].g.pg == + dev->lps_per_blk - 1) { + ret = nvm_prepare_new_sysblks(dev, &s); + if (ret) + goto err_cur; + } + + ret = nvm_write_and_verify(dev, new, &s); +err_cur: + kfree(cur); +err_sysblk: + mutex_unlock(&dev->mlock); + + return ret; +} + +int nvm_init_sysblock(struct nvm_dev *dev, struct nvm_sb_info *info) +{ + struct ppa_addr sysblk_ppas[MAX_SYSBLKS]; + struct sysblk_scan s; + int ret; + + /* + * 1. select master blocks and select first available blks + * 2. get bad block list + * 3. mark MAX_SYSBLKS block as host-based device allocated. + * 4. write and verify data to block + */ + + if (!dev->ops->get_bb_tbl || !dev->ops->set_bb_tbl) + return -EINVAL; + + if (!(dev->mccap & NVM_ID_CAP_SLC) || !dev->lps_per_blk) { + pr_err("nvm: memory does not support SLC access\n"); + return -EINVAL; + } + + /* Index all sysblocks and mark them as host-driven */ + nvm_setup_sysblk_scan(dev, &s, sysblk_ppas); + + mutex_lock(&dev->mlock); + ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, sysblk_get_free_blks); + if (ret) + goto err_mark; + + ret = nvm_set_bb_tbl(dev, &s, NVM_BLK_T_HOST); + if (ret) + goto err_mark; + + /* Write to the first block of each row */ + ret = nvm_write_and_verify(dev, info, &s); +err_mark: + mutex_unlock(&dev->mlock); + return ret; +} + +struct factory_blks { + struct nvm_dev *dev; + int flags; + unsigned long *blks; +}; + +static int factory_nblks(int nblks) +{ + /* Round up to nearest BITS_PER_LONG */ + return (nblks + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); +} + +static unsigned int factory_blk_offset(struct nvm_dev *dev, int ch, int lun) +{ + int nblks = factory_nblks(dev->blks_per_lun); + + return ((ch * dev->luns_per_chnl * nblks) + (lun * nblks)) / + BITS_PER_LONG; +} + +static int nvm_factory_blks(struct ppa_addr ppa, int nr_blks, u8 *blks, + void *private) +{ + struct factory_blks *f = private; + struct nvm_dev *dev = f->dev; + int i, lunoff; + + lunoff = factory_blk_offset(dev, ppa.g.ch, ppa.g.lun); + + /* non-set bits correspond to the block must be erased */ + for (i = 0; i < nr_blks; i++) { + switch (blks[i]) { + case NVM_BLK_T_FREE: + if (f->flags & NVM_FACTORY_ERASE_ONLY_USER) + set_bit(i, &f->blks[lunoff]); + break; + case NVM_BLK_T_HOST: + if (!(f->flags & NVM_FACTORY_RESET_HOST_BLKS)) + set_bit(i, &f->blks[lunoff]); + break; + case NVM_BLK_T_GRWN_BAD: + if (!(f->flags & NVM_FACTORY_RESET_GRWN_BBLKS)) + set_bit(i, &f->blks[lunoff]); + break; + default: + set_bit(i, &f->blks[lunoff]); + break; + } + } + + return 0; +} + +static int nvm_fact_get_blks(struct nvm_dev *dev, struct ppa_addr *erase_list, + int max_ppas, struct factory_blks *f) +{ + struct ppa_addr ppa; + int ch, lun, blkid, idx, done = 0, ppa_cnt = 0; + unsigned long *offset; + + while (!done) { + done = 1; + for (ch = 0; ch < dev->nr_chnls; ch++) { + for (lun = 0; lun < dev->luns_per_chnl; lun++) { + idx = factory_blk_offset(dev, ch, lun); + offset = &f->blks[idx]; + + blkid = find_first_zero_bit(offset, + dev->blks_per_lun); + if (blkid >= dev->blks_per_lun) + continue; + set_bit(blkid, offset); + + ppa.ppa = 0; + ppa.g.ch = ch; + ppa.g.lun = lun; + ppa.g.blk = blkid; + pr_debug("nvm: erase ppa (%u %u %u)\n", + ppa.g.ch, + ppa.g.lun, + ppa.g.blk); + + erase_list[ppa_cnt] = ppa; + ppa_cnt++; + done = 0; + + if (ppa_cnt == max_ppas) + return ppa_cnt; + } + } + } + + return ppa_cnt; +} + +static int nvm_fact_get_bb_tbl(struct nvm_dev *dev, struct ppa_addr ppa, + nvm_bb_update_fn *fn, void *priv) +{ + struct ppa_addr dev_ppa; + int ret; + + dev_ppa = generic_to_dev_addr(dev, ppa); + + ret = dev->ops->get_bb_tbl(dev, dev_ppa, dev->blks_per_lun, fn, priv); + if (ret) + pr_err("nvm: failed bb tbl for ch%u lun%u\n", + ppa.g.ch, ppa.g.blk); + return ret; +} + +static int nvm_fact_select_blks(struct nvm_dev *dev, struct factory_blks *f) +{ + int ch, lun, ret; + struct ppa_addr ppa; + + ppa.ppa = 0; + for (ch = 0; ch < dev->nr_chnls; ch++) { + for (lun = 0; lun < dev->luns_per_chnl; lun++) { + ppa.g.ch = ch; + ppa.g.lun = lun; + + ret = nvm_fact_get_bb_tbl(dev, ppa, nvm_factory_blks, + f); + if (ret) + return ret; + } + } + + return 0; +} + +int nvm_dev_factory(struct nvm_dev *dev, int flags) +{ + struct factory_blks f; + struct ppa_addr *ppas; + int ppa_cnt, ret = -ENOMEM; + int max_ppas = dev->ops->max_phys_sect / dev->nr_planes; + struct ppa_addr sysblk_ppas[MAX_SYSBLKS]; + struct sysblk_scan s; + + f.blks = kzalloc(factory_nblks(dev->blks_per_lun) * dev->nr_luns, + GFP_KERNEL); + if (!f.blks) + return ret; + + ppas = kcalloc(max_ppas, sizeof(struct ppa_addr), GFP_KERNEL); + if (!ppas) + goto err_blks; + + f.dev = dev; + f.flags = flags; + + /* create list of blks to be erased */ + ret = nvm_fact_select_blks(dev, &f); + if (ret) + goto err_ppas; + + /* continue to erase until list of blks until empty */ + while ((ppa_cnt = nvm_fact_get_blks(dev, ppas, max_ppas, &f)) > 0) + nvm_erase_ppa(dev, ppas, ppa_cnt); + + /* mark host reserved blocks free */ + if (flags & NVM_FACTORY_RESET_HOST_BLKS) { + nvm_setup_sysblk_scan(dev, &s, sysblk_ppas); + mutex_lock(&dev->mlock); + ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, + sysblk_get_host_blks); + if (!ret) + ret = nvm_set_bb_tbl(dev, &s, NVM_BLK_T_FREE); + mutex_unlock(&dev->mlock); + } +err_ppas: + kfree(ppas); +err_blks: + kfree(f.blks); + return ret; +} +EXPORT_SYMBOL(nvm_dev_factory); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 83392f856dfd..22b9e34ceb75 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1741,6 +1741,7 @@ static void bch_btree_gc(struct cache_set *c) do { ret = btree_root(gc_root, c, &op, &writes, &stats); closure_sync(&writes); + cond_resched(); if (ret && ret != -EAGAIN) pr_warn("gc failed!"); @@ -2162,8 +2163,10 @@ int bch_btree_insert_check_key(struct btree *b, struct btree_op *op, rw_lock(true, b, b->level); if (b->key.ptr[0] != btree_ptr || - b->seq != seq + 1) + b->seq != seq + 1) { + op->lock = b->level; goto out; + } } SET_KEY_PTRS(check_key, 1); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 679a093a3bf6..8d0ead98eb6e 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -685,6 +685,8 @@ static void bcache_device_link(struct bcache_device *d, struct cache_set *c, WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") || sysfs_create_link(&c->kobj, &d->kobj, d->name), "Couldn't create device <-> cache set symlinks"); + + clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags); } static void bcache_device_detach(struct bcache_device *d) @@ -847,8 +849,11 @@ void bch_cached_dev_run(struct cached_dev *dc) buf[SB_LABEL_SIZE] = '\0'; env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf); - if (atomic_xchg(&dc->running, 1)) + if (atomic_xchg(&dc->running, 1)) { + kfree(env[1]); + kfree(env[2]); return; + } if (!d->c && BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) { @@ -1933,6 +1938,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, else err = "device busy"; mutex_unlock(&bch_register_lock); + if (attr == &ksysfs_register_quiet) + goto out; } goto err; } @@ -1971,8 +1978,7 @@ out: err_close: blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); err: - if (attr != &ksysfs_register_quiet) - pr_info("error opening %s: %s", path, err); + pr_info("error opening %s: %s", path, err); ret = -EINVAL; goto out; } @@ -2066,8 +2072,10 @@ static int __init bcache_init(void) closure_debug_init(); bcache_major = register_blkdev(0, "bcache"); - if (bcache_major < 0) + if (bcache_major < 0) { + unregister_reboot_notifier(&reboot); return bcache_major; + } if (!(bcache_wq = create_workqueue("bcache")) || !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index b23f88d9f18c..b9346cd9cda1 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -323,6 +323,10 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, static bool dirty_pred(struct keybuf *buf, struct bkey *k) { + struct cached_dev *dc = container_of(buf, struct cached_dev, writeback_keys); + + BUG_ON(KEY_INODE(k) != dc->disk.id); + return KEY_DIRTY(k); } @@ -372,11 +376,24 @@ next: } } +/* + * Returns true if we scanned the entire disk + */ static bool refill_dirty(struct cached_dev *dc) { struct keybuf *buf = &dc->writeback_keys; + struct bkey start = KEY(dc->disk.id, 0, 0); struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0); - bool searched_from_start = false; + struct bkey start_pos; + + /* + * make sure keybuf pos is inside the range for this disk - at bringup + * we might not be attached yet so this disk's inode nr isn't + * initialized then + */ + if (bkey_cmp(&buf->last_scanned, &start) < 0 || + bkey_cmp(&buf->last_scanned, &end) > 0) + buf->last_scanned = start; if (dc->partial_stripes_expensive) { refill_full_stripes(dc); @@ -384,14 +401,20 @@ static bool refill_dirty(struct cached_dev *dc) return false; } - if (bkey_cmp(&buf->last_scanned, &end) >= 0) { - buf->last_scanned = KEY(dc->disk.id, 0, 0); - searched_from_start = true; - } - + start_pos = buf->last_scanned; bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); - return bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start; + if (bkey_cmp(&buf->last_scanned, &end) < 0) + return false; + + /* + * If we get to the end start scanning again from the beginning, and + * only scan up to where we initially started scanning from: + */ + buf->last_scanned = start; + bch_refill_keybuf(dc->disk.c, buf, &start_pos, dirty_pred); + + return bkey_cmp(&buf->last_scanned, &start_pos) >= 0; } static int bch_writeback_thread(void *arg) diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 0a9dab187b79..073a042aed24 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -63,7 +63,8 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, static inline void bch_writeback_queue(struct cached_dev *dc) { - wake_up_process(dc->writeback_thread); + if (!IS_ERR_OR_NULL(dc->writeback_thread)) + wake_up_process(dc->writeback_thread); } static inline void bch_writeback_add(struct cached_dev *dc) diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index 002a94abdbc4..5d6237391dcd 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -8,3 +8,14 @@ config BLK_DEV_NVME To compile this driver as a module, choose M here: the module will be called nvme. + +config BLK_DEV_NVME_SCSI + bool "SCSI emulation for NVMe device nodes" + depends on BLK_DEV_NVME + ---help--- + This adds support for the SG_IO ioctl on the NVMe character + and block devices nodes, as well a a translation for a small + number of selected SCSI commands to NVMe commands to the NVMe + driver. If you don't know what this means you probably want + to say N here, and if you know what it means you probably + want to say N as well. diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index a5fe23952586..51bf90871549 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -1,5 +1,6 @@ obj-$(CONFIG_BLK_DEV_NVME) += nvme.o -lightnvm-$(CONFIG_NVM) := lightnvm.o -nvme-y += pci.o scsi.o $(lightnvm-y) +lightnvm-$(CONFIG_NVM) := lightnvm.o +nvme-y += core.o pci.o $(lightnvm-y) +nvme-$(CONFIG_BLK_DEV_NVME_SCSI) += scsi.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c new file mode 100644 index 000000000000..c5bf001af559 --- /dev/null +++ b/drivers/nvme/host/core.c @@ -0,0 +1,1472 @@ +/* + * NVM Express device driver + * Copyright (c) 2011-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include <linux/blkdev.h> +#include <linux/blk-mq.h> +#include <linux/delay.h> +#include <linux/errno.h> +#include <linux/hdreg.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/list_sort.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/pr.h> +#include <linux/ptrace.h> +#include <linux/nvme_ioctl.h> +#include <linux/t10-pi.h> +#include <scsi/sg.h> +#include <asm/unaligned.h> + +#include "nvme.h" + +#define NVME_MINORS (1U << MINORBITS) + +static int nvme_major; +module_param(nvme_major, int, 0); + +static int nvme_char_major; +module_param(nvme_char_major, int, 0); + +static LIST_HEAD(nvme_ctrl_list); +DEFINE_SPINLOCK(dev_list_lock); + +static struct class *nvme_class; + +static void nvme_free_ns(struct kref *kref) +{ + struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref); + + if (ns->type == NVME_NS_LIGHTNVM) + nvme_nvm_unregister(ns->queue, ns->disk->disk_name); + + spin_lock(&dev_list_lock); + ns->disk->private_data = NULL; + spin_unlock(&dev_list_lock); + + nvme_put_ctrl(ns->ctrl); + put_disk(ns->disk); + kfree(ns); +} + +static void nvme_put_ns(struct nvme_ns *ns) +{ + kref_put(&ns->kref, nvme_free_ns); +} + +static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk) +{ + struct nvme_ns *ns; + + spin_lock(&dev_list_lock); + ns = disk->private_data; + if (ns && !kref_get_unless_zero(&ns->kref)) + ns = NULL; + spin_unlock(&dev_list_lock); + + return ns; +} + +void nvme_requeue_req(struct request *req) +{ + unsigned long flags; + + blk_mq_requeue_request(req); + spin_lock_irqsave(req->q->queue_lock, flags); + if (!blk_queue_stopped(req->q)) + blk_mq_kick_requeue_list(req->q); + spin_unlock_irqrestore(req->q->queue_lock, flags); +} + +struct request *nvme_alloc_request(struct request_queue *q, + struct nvme_command *cmd, unsigned int flags) +{ + bool write = cmd->common.opcode & 1; + struct request *req; + + req = blk_mq_alloc_request(q, write, flags); + if (IS_ERR(req)) + return req; + + req->cmd_type = REQ_TYPE_DRV_PRIV; + req->cmd_flags |= REQ_FAILFAST_DRIVER; + req->__data_len = 0; + req->__sector = (sector_t) -1; + req->bio = req->biotail = NULL; + + req->cmd = (unsigned char *)cmd; + req->cmd_len = sizeof(struct nvme_command); + req->special = (void *)0; + + return req; +} + +/* + * Returns 0 on success. If the result is negative, it's a Linux error code; + * if the result is positive, it's an NVM Express status code + */ +int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + void *buffer, unsigned bufflen, u32 *result, unsigned timeout) +{ + struct request *req; + int ret; + + req = nvme_alloc_request(q, cmd, 0); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->timeout = timeout ? timeout : ADMIN_TIMEOUT; + + if (buffer && bufflen) { + ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL); + if (ret) + goto out; + } + + blk_execute_rq(req->q, NULL, req, 0); + if (result) + *result = (u32)(uintptr_t)req->special; + ret = req->errors; + out: + blk_mq_free_request(req); + return ret; +} + +int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + void *buffer, unsigned bufflen) +{ + return __nvme_submit_sync_cmd(q, cmd, buffer, bufflen, NULL, 0); +} + +int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, + void __user *ubuffer, unsigned bufflen, + void __user *meta_buffer, unsigned meta_len, u32 meta_seed, + u32 *result, unsigned timeout) +{ + bool write = cmd->common.opcode & 1; + struct nvme_ns *ns = q->queuedata; + struct gendisk *disk = ns ? ns->disk : NULL; + struct request *req; + struct bio *bio = NULL; + void *meta = NULL; + int ret; + + req = nvme_alloc_request(q, cmd, 0); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->timeout = timeout ? timeout : ADMIN_TIMEOUT; + + if (ubuffer && bufflen) { + ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, + GFP_KERNEL); + if (ret) + goto out; + bio = req->bio; + + if (!disk) + goto submit; + bio->bi_bdev = bdget_disk(disk, 0); + if (!bio->bi_bdev) { + ret = -ENODEV; + goto out_unmap; + } + + if (meta_buffer) { + struct bio_integrity_payload *bip; + + meta = kmalloc(meta_len, GFP_KERNEL); + if (!meta) { + ret = -ENOMEM; + goto out_unmap; + } + + if (write) { + if (copy_from_user(meta, meta_buffer, + meta_len)) { + ret = -EFAULT; + goto out_free_meta; + } + } + + bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); + if (IS_ERR(bip)) { + ret = PTR_ERR(bip); + goto out_free_meta; + } + + bip->bip_iter.bi_size = meta_len; + bip->bip_iter.bi_sector = meta_seed; + + ret = bio_integrity_add_page(bio, virt_to_page(meta), + meta_len, offset_in_page(meta)); + if (ret != meta_len) { + ret = -ENOMEM; + goto out_free_meta; + } + } + } + submit: + blk_execute_rq(req->q, disk, req, 0); + ret = req->errors; + if (result) + *result = (u32)(uintptr_t)req->special; + if (meta && !ret && !write) { + if (copy_to_user(meta_buffer, meta, meta_len)) + ret = -EFAULT; + } + out_free_meta: + kfree(meta); + out_unmap: + if (bio) { + if (disk && bio->bi_bdev) + bdput(bio->bi_bdev); + blk_rq_unmap_user(bio); + } + out: + blk_mq_free_request(req); + return ret; +} + +int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, + void __user *ubuffer, unsigned bufflen, u32 *result, + unsigned timeout) +{ + return __nvme_submit_user_cmd(q, cmd, ubuffer, bufflen, NULL, 0, 0, + result, timeout); +} + +int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) +{ + struct nvme_command c = { }; + int error; + + /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ + c.identify.opcode = nvme_admin_identify; + c.identify.cns = cpu_to_le32(1); + + *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL); + if (!*id) + return -ENOMEM; + + error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, + sizeof(struct nvme_id_ctrl)); + if (error) + kfree(*id); + return error; +} + +static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list) +{ + struct nvme_command c = { }; + + c.identify.opcode = nvme_admin_identify; + c.identify.cns = cpu_to_le32(2); + c.identify.nsid = cpu_to_le32(nsid); + return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000); +} + +int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, + struct nvme_id_ns **id) +{ + struct nvme_command c = { }; + int error; + + /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ + c.identify.opcode = nvme_admin_identify, + c.identify.nsid = cpu_to_le32(nsid), + + *id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL); + if (!*id) + return -ENOMEM; + + error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, + sizeof(struct nvme_id_ns)); + if (error) + kfree(*id); + return error; +} + +int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, + dma_addr_t dma_addr, u32 *result) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.features.opcode = nvme_admin_get_features; + c.features.nsid = cpu_to_le32(nsid); + c.features.prp1 = cpu_to_le64(dma_addr); + c.features.fid = cpu_to_le32(fid); + + return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0); +} + +int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, + dma_addr_t dma_addr, u32 *result) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.features.opcode = nvme_admin_set_features; + c.features.prp1 = cpu_to_le64(dma_addr); + c.features.fid = cpu_to_le32(fid); + c.features.dword11 = cpu_to_le32(dword11); + + return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0); +} + +int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log) +{ + struct nvme_command c = { }; + int error; + + c.common.opcode = nvme_admin_get_log_page, + c.common.nsid = cpu_to_le32(0xFFFFFFFF), + c.common.cdw10[0] = cpu_to_le32( + (((sizeof(struct nvme_smart_log) / 4) - 1) << 16) | + NVME_LOG_SMART), + + *log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL); + if (!*log) + return -ENOMEM; + + error = nvme_submit_sync_cmd(dev->admin_q, &c, *log, + sizeof(struct nvme_smart_log)); + if (error) + kfree(*log); + return error; +} + +int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) +{ + u32 q_count = (*count - 1) | ((*count - 1) << 16); + u32 result; + int status, nr_io_queues; + + status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, 0, + &result); + if (status) + return status; + + nr_io_queues = min(result & 0xffff, result >> 16) + 1; + *count = min(*count, nr_io_queues); + return 0; +} + +static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) +{ + struct nvme_user_io io; + struct nvme_command c; + unsigned length, meta_len; + void __user *metadata; + + if (copy_from_user(&io, uio, sizeof(io))) + return -EFAULT; + + switch (io.opcode) { + case nvme_cmd_write: + case nvme_cmd_read: + case nvme_cmd_compare: + break; + default: + return -EINVAL; + } + + length = (io.nblocks + 1) << ns->lba_shift; + meta_len = (io.nblocks + 1) * ns->ms; + metadata = (void __user *)(uintptr_t)io.metadata; + + if (ns->ext) { + length += meta_len; + meta_len = 0; + } else if (meta_len) { + if ((io.metadata & 3) || !io.metadata) + return -EINVAL; + } + + memset(&c, 0, sizeof(c)); + c.rw.opcode = io.opcode; + c.rw.flags = io.flags; + c.rw.nsid = cpu_to_le32(ns->ns_id); + c.rw.slba = cpu_to_le64(io.slba); + c.rw.length = cpu_to_le16(io.nblocks); + c.rw.control = cpu_to_le16(io.control); + c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); + c.rw.reftag = cpu_to_le32(io.reftag); + c.rw.apptag = cpu_to_le16(io.apptag); + c.rw.appmask = cpu_to_le16(io.appmask); + + return __nvme_submit_user_cmd(ns->queue, &c, + (void __user *)(uintptr_t)io.addr, length, + metadata, meta_len, io.slba, NULL, 0); +} + +static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + struct nvme_passthru_cmd __user *ucmd) +{ + struct nvme_passthru_cmd cmd; + struct nvme_command c; + unsigned timeout = 0; + int status; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (copy_from_user(&cmd, ucmd, sizeof(cmd))) + return -EFAULT; + + memset(&c, 0, sizeof(c)); + c.common.opcode = cmd.opcode; + c.common.flags = cmd.flags; + c.common.nsid = cpu_to_le32(cmd.nsid); + c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); + c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); + c.common.cdw10[0] = cpu_to_le32(cmd.cdw10); + c.common.cdw10[1] = cpu_to_le32(cmd.cdw11); + c.common.cdw10[2] = cpu_to_le32(cmd.cdw12); + c.common.cdw10[3] = cpu_to_le32(cmd.cdw13); + c.common.cdw10[4] = cpu_to_le32(cmd.cdw14); + c.common.cdw10[5] = cpu_to_le32(cmd.cdw15); + + if (cmd.timeout_ms) + timeout = msecs_to_jiffies(cmd.timeout_ms); + + status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, + (void __user *)(uintptr_t)cmd.addr, cmd.data_len, + &cmd.result, timeout); + if (status >= 0) { + if (put_user(cmd.result, &ucmd->result)) + return -EFAULT; + } + + return status; +} + +static int nvme_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct nvme_ns *ns = bdev->bd_disk->private_data; + + switch (cmd) { + case NVME_IOCTL_ID: + force_successful_syscall_return(); + return ns->ns_id; + case NVME_IOCTL_ADMIN_CMD: + return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg); + case NVME_IOCTL_IO_CMD: + return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg); + case NVME_IOCTL_SUBMIT_IO: + return nvme_submit_io(ns, (void __user *)arg); +#ifdef CONFIG_BLK_DEV_NVME_SCSI + case SG_GET_VERSION_NUM: + return nvme_sg_get_version_num((void __user *)arg); + case SG_IO: + return nvme_sg_io(ns, (void __user *)arg); +#endif + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case SG_IO: + return -ENOIOCTLCMD; + } + return nvme_ioctl(bdev, mode, cmd, arg); +} +#else +#define nvme_compat_ioctl NULL +#endif + +static int nvme_open(struct block_device *bdev, fmode_t mode) +{ + return nvme_get_ns_from_disk(bdev->bd_disk) ? 0 : -ENXIO; +} + +static void nvme_release(struct gendisk *disk, fmode_t mode) +{ + nvme_put_ns(disk->private_data); +} + +static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + /* some standard values */ + geo->heads = 1 << 6; + geo->sectors = 1 << 5; + geo->cylinders = get_capacity(bdev->bd_disk) >> 11; + return 0; +} + +#ifdef CONFIG_BLK_DEV_INTEGRITY +static void nvme_init_integrity(struct nvme_ns *ns) +{ + struct blk_integrity integrity; + + switch (ns->pi_type) { + case NVME_NS_DPS_PI_TYPE3: + integrity.profile = &t10_pi_type3_crc; + break; + case NVME_NS_DPS_PI_TYPE1: + case NVME_NS_DPS_PI_TYPE2: + integrity.profile = &t10_pi_type1_crc; + break; + default: + integrity.profile = NULL; + break; + } + integrity.tuple_size = ns->ms; + blk_integrity_register(ns->disk, &integrity); + blk_queue_max_integrity_segments(ns->queue, 1); +} +#else +static void nvme_init_integrity(struct nvme_ns *ns) +{ +} +#endif /* CONFIG_BLK_DEV_INTEGRITY */ + +static void nvme_config_discard(struct nvme_ns *ns) +{ + u32 logical_block_size = queue_logical_block_size(ns->queue); + ns->queue->limits.discard_zeroes_data = 0; + ns->queue->limits.discard_alignment = logical_block_size; + ns->queue->limits.discard_granularity = logical_block_size; + blk_queue_max_discard_sectors(ns->queue, 0xffffffff); + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); +} + +static int nvme_revalidate_disk(struct gendisk *disk) +{ + struct nvme_ns *ns = disk->private_data; + struct nvme_id_ns *id; + u8 lbaf, pi_type; + u16 old_ms; + unsigned short bs; + + if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) { + dev_warn(ns->ctrl->dev, "%s: Identify failure nvme%dn%d\n", + __func__, ns->ctrl->instance, ns->ns_id); + return -ENODEV; + } + if (id->ncap == 0) { + kfree(id); + return -ENODEV; + } + + if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) { + if (nvme_nvm_register(ns->queue, disk->disk_name)) { + dev_warn(ns->ctrl->dev, + "%s: LightNVM init failure\n", __func__); + kfree(id); + return -ENODEV; + } + ns->type = NVME_NS_LIGHTNVM; + } + + if (ns->ctrl->vs >= NVME_VS(1, 1)) + memcpy(ns->eui, id->eui64, sizeof(ns->eui)); + if (ns->ctrl->vs >= NVME_VS(1, 2)) + memcpy(ns->uuid, id->nguid, sizeof(ns->uuid)); + + old_ms = ns->ms; + lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; + ns->lba_shift = id->lbaf[lbaf].ds; + ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); + ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT); + + /* + * If identify namespace failed, use default 512 byte block size so + * block layer can use before failing read/write for 0 capacity. + */ + if (ns->lba_shift == 0) + ns->lba_shift = 9; + bs = 1 << ns->lba_shift; + /* XXX: PI implementation requires metadata equal t10 pi tuple size */ + pi_type = ns->ms == sizeof(struct t10_pi_tuple) ? + id->dps & NVME_NS_DPS_PI_MASK : 0; + + blk_mq_freeze_queue(disk->queue); + if (blk_get_integrity(disk) && (ns->pi_type != pi_type || + ns->ms != old_ms || + bs != queue_logical_block_size(disk->queue) || + (ns->ms && ns->ext))) + blk_integrity_unregister(disk); + + ns->pi_type = pi_type; + blk_queue_logical_block_size(ns->queue, bs); + + if (ns->ms && !blk_get_integrity(disk) && !ns->ext) + nvme_init_integrity(ns); + if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) + set_capacity(disk, 0); + else + set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); + + if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM) + nvme_config_discard(ns); + blk_mq_unfreeze_queue(disk->queue); + + kfree(id); + return 0; +} + +static char nvme_pr_type(enum pr_type type) +{ + switch (type) { + case PR_WRITE_EXCLUSIVE: + return 1; + case PR_EXCLUSIVE_ACCESS: + return 2; + case PR_WRITE_EXCLUSIVE_REG_ONLY: + return 3; + case PR_EXCLUSIVE_ACCESS_REG_ONLY: + return 4; + case PR_WRITE_EXCLUSIVE_ALL_REGS: + return 5; + case PR_EXCLUSIVE_ACCESS_ALL_REGS: + return 6; + default: + return 0; + } +}; + +static int nvme_pr_command(struct block_device *bdev, u32 cdw10, + u64 key, u64 sa_key, u8 op) +{ + struct nvme_ns *ns = bdev->bd_disk->private_data; + struct nvme_command c; + u8 data[16] = { 0, }; + + put_unaligned_le64(key, &data[0]); + put_unaligned_le64(sa_key, &data[8]); + + memset(&c, 0, sizeof(c)); + c.common.opcode = op; + c.common.nsid = cpu_to_le32(ns->ns_id); + c.common.cdw10[0] = cpu_to_le32(cdw10); + + return nvme_submit_sync_cmd(ns->queue, &c, data, 16); +} + +static int nvme_pr_register(struct block_device *bdev, u64 old, + u64 new, unsigned flags) +{ + u32 cdw10; + + if (flags & ~PR_FL_IGNORE_KEY) + return -EOPNOTSUPP; + + cdw10 = old ? 2 : 0; + cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0; + cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */ + return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register); +} + +static int nvme_pr_reserve(struct block_device *bdev, u64 key, + enum pr_type type, unsigned flags) +{ + u32 cdw10; + + if (flags & ~PR_FL_IGNORE_KEY) + return -EOPNOTSUPP; + + cdw10 = nvme_pr_type(type) << 8; + cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0); + return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire); +} + +static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new, + enum pr_type type, bool abort) +{ + u32 cdw10 = nvme_pr_type(type) << 8 | abort ? 2 : 1; + return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire); +} + +static int nvme_pr_clear(struct block_device *bdev, u64 key) +{ + u32 cdw10 = 1 | (key ? 1 << 3 : 0); + return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register); +} + +static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type) +{ + u32 cdw10 = nvme_pr_type(type) << 8 | key ? 1 << 3 : 0; + return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release); +} + +static const struct pr_ops nvme_pr_ops = { + .pr_register = nvme_pr_register, + .pr_reserve = nvme_pr_reserve, + .pr_release = nvme_pr_release, + .pr_preempt = nvme_pr_preempt, + .pr_clear = nvme_pr_clear, +}; + +static const struct block_device_operations nvme_fops = { + .owner = THIS_MODULE, + .ioctl = nvme_ioctl, + .compat_ioctl = nvme_compat_ioctl, + .open = nvme_open, + .release = nvme_release, + .getgeo = nvme_getgeo, + .revalidate_disk= nvme_revalidate_disk, + .pr_ops = &nvme_pr_ops, +}; + +static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) +{ + unsigned long timeout = + ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; + u32 csts, bit = enabled ? NVME_CSTS_RDY : 0; + int ret; + + while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { + if ((csts & NVME_CSTS_RDY) == bit) + break; + + msleep(100); + if (fatal_signal_pending(current)) + return -EINTR; + if (time_after(jiffies, timeout)) { + dev_err(ctrl->dev, + "Device not ready; aborting %s\n", enabled ? + "initialisation" : "reset"); + return -ENODEV; + } + } + + return ret; +} + +/* + * If the device has been passed off to us in an enabled state, just clear + * the enabled bit. The spec says we should set the 'shutdown notification + * bits', but doing so may cause the device to complete commands to the + * admin queue ... and we don't know what memory that might be pointing at! + */ +int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap) +{ + int ret; + + ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; + ctrl->ctrl_config &= ~NVME_CC_ENABLE; + + ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); + if (ret) + return ret; + return nvme_wait_ready(ctrl, cap, false); +} + +int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap) +{ + /* + * Default to a 4K page size, with the intention to update this + * path in the future to accomodate architectures with differing + * kernel and IO page sizes. + */ + unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12; + int ret; + + if (page_shift < dev_page_min) { + dev_err(ctrl->dev, + "Minimum device page size %u too large for host (%u)\n", + 1 << dev_page_min, 1 << page_shift); + return -ENODEV; + } + + ctrl->page_size = 1 << page_shift; + + ctrl->ctrl_config = NVME_CC_CSS_NVM; + ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; + ctrl->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; + ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; + ctrl->ctrl_config |= NVME_CC_ENABLE; + + ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); + if (ret) + return ret; + return nvme_wait_ready(ctrl, cap, true); +} + +int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) +{ + unsigned long timeout = SHUTDOWN_TIMEOUT + jiffies; + u32 csts; + int ret; + + ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; + ctrl->ctrl_config |= NVME_CC_SHN_NORMAL; + + ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); + if (ret) + return ret; + + while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { + if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT) + break; + + msleep(100); + if (fatal_signal_pending(current)) + return -EINTR; + if (time_after(jiffies, timeout)) { + dev_err(ctrl->dev, + "Device shutdown incomplete; abort shutdown\n"); + return -ENODEV; + } + } + + return ret; +} + +/* + * Initialize the cached copies of the Identify data and various controller + * register in our nvme_ctrl structure. This should be called as soon as + * the admin queue is fully up and running. + */ +int nvme_init_identify(struct nvme_ctrl *ctrl) +{ + struct nvme_id_ctrl *id; + u64 cap; + int ret, page_shift; + + ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs); + if (ret) { + dev_err(ctrl->dev, "Reading VS failed (%d)\n", ret); + return ret; + } + + ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap); + if (ret) { + dev_err(ctrl->dev, "Reading CAP failed (%d)\n", ret); + return ret; + } + page_shift = NVME_CAP_MPSMIN(cap) + 12; + + if (ctrl->vs >= NVME_VS(1, 1)) + ctrl->subsystem = NVME_CAP_NSSRC(cap); + + ret = nvme_identify_ctrl(ctrl, &id); + if (ret) { + dev_err(ctrl->dev, "Identify Controller failed (%d)\n", ret); + return -EIO; + } + + ctrl->oncs = le16_to_cpup(&id->oncs); + atomic_set(&ctrl->abort_limit, id->acl + 1); + ctrl->vwc = id->vwc; + memcpy(ctrl->serial, id->sn, sizeof(id->sn)); + memcpy(ctrl->model, id->mn, sizeof(id->mn)); + memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr)); + if (id->mdts) + ctrl->max_hw_sectors = 1 << (id->mdts + page_shift - 9); + else + ctrl->max_hw_sectors = UINT_MAX; + + if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && id->vs[3]) { + unsigned int max_hw_sectors; + + ctrl->stripe_size = 1 << (id->vs[3] + page_shift); + max_hw_sectors = ctrl->stripe_size >> (page_shift - 9); + if (ctrl->max_hw_sectors) { + ctrl->max_hw_sectors = min(max_hw_sectors, + ctrl->max_hw_sectors); + } else { + ctrl->max_hw_sectors = max_hw_sectors; + } + } + + kfree(id); + return 0; +} + +static int nvme_dev_open(struct inode *inode, struct file *file) +{ + struct nvme_ctrl *ctrl; + int instance = iminor(inode); + int ret = -ENODEV; + + spin_lock(&dev_list_lock); + list_for_each_entry(ctrl, &nvme_ctrl_list, node) { + if (ctrl->instance != instance) + continue; + + if (!ctrl->admin_q) { + ret = -EWOULDBLOCK; + break; + } + if (!kref_get_unless_zero(&ctrl->kref)) + break; + file->private_data = ctrl; + ret = 0; + break; + } + spin_unlock(&dev_list_lock); + + return ret; +} + +static int nvme_dev_release(struct inode *inode, struct file *file) +{ + nvme_put_ctrl(file->private_data); + return 0; +} + +static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) +{ + struct nvme_ns *ns; + int ret; + + mutex_lock(&ctrl->namespaces_mutex); + if (list_empty(&ctrl->namespaces)) { + ret = -ENOTTY; + goto out_unlock; + } + + ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list); + if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) { + dev_warn(ctrl->dev, + "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n"); + ret = -EINVAL; + goto out_unlock; + } + + dev_warn(ctrl->dev, + "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n"); + kref_get(&ns->kref); + mutex_unlock(&ctrl->namespaces_mutex); + + ret = nvme_user_cmd(ctrl, ns, argp); + nvme_put_ns(ns); + return ret; + +out_unlock: + mutex_unlock(&ctrl->namespaces_mutex); + return ret; +} + +static long nvme_dev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct nvme_ctrl *ctrl = file->private_data; + void __user *argp = (void __user *)arg; + + switch (cmd) { + case NVME_IOCTL_ADMIN_CMD: + return nvme_user_cmd(ctrl, NULL, argp); + case NVME_IOCTL_IO_CMD: + return nvme_dev_user_cmd(ctrl, argp); + case NVME_IOCTL_RESET: + dev_warn(ctrl->dev, "resetting controller\n"); + return ctrl->ops->reset_ctrl(ctrl); + case NVME_IOCTL_SUBSYS_RESET: + return nvme_reset_subsystem(ctrl); + default: + return -ENOTTY; + } +} + +static const struct file_operations nvme_dev_fops = { + .owner = THIS_MODULE, + .open = nvme_dev_open, + .release = nvme_dev_release, + .unlocked_ioctl = nvme_dev_ioctl, + .compat_ioctl = nvme_dev_ioctl, +}; + +static ssize_t nvme_sysfs_reset(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + int ret; + + ret = ctrl->ops->reset_ctrl(ctrl); + if (ret < 0) + return ret; + return count; +} +static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset); + +static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_ns *ns = dev_to_disk(dev)->private_data; + return sprintf(buf, "%pU\n", ns->uuid); +} +static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL); + +static ssize_t eui_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_ns *ns = dev_to_disk(dev)->private_data; + return sprintf(buf, "%8phd\n", ns->eui); +} +static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL); + +static ssize_t nsid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_ns *ns = dev_to_disk(dev)->private_data; + return sprintf(buf, "%d\n", ns->ns_id); +} +static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL); + +static struct attribute *nvme_ns_attrs[] = { + &dev_attr_uuid.attr, + &dev_attr_eui.attr, + &dev_attr_nsid.attr, + NULL, +}; + +static umode_t nvme_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct nvme_ns *ns = dev_to_disk(dev)->private_data; + + if (a == &dev_attr_uuid.attr) { + if (!memchr_inv(ns->uuid, 0, sizeof(ns->uuid))) + return 0; + } + if (a == &dev_attr_eui.attr) { + if (!memchr_inv(ns->eui, 0, sizeof(ns->eui))) + return 0; + } + return a->mode; +} + +static const struct attribute_group nvme_ns_attr_group = { + .attrs = nvme_ns_attrs, + .is_visible = nvme_attrs_are_visible, +}; + +#define nvme_show_function(field) \ +static ssize_t field##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ + return sprintf(buf, "%.*s\n", (int)sizeof(ctrl->field), ctrl->field); \ +} \ +static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); + +nvme_show_function(model); +nvme_show_function(serial); +nvme_show_function(firmware_rev); + +static struct attribute *nvme_dev_attrs[] = { + &dev_attr_reset_controller.attr, + &dev_attr_model.attr, + &dev_attr_serial.attr, + &dev_attr_firmware_rev.attr, + NULL +}; + +static struct attribute_group nvme_dev_attrs_group = { + .attrs = nvme_dev_attrs, +}; + +static const struct attribute_group *nvme_dev_attr_groups[] = { + &nvme_dev_attrs_group, + NULL, +}; + +static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct nvme_ns *nsa = container_of(a, struct nvme_ns, list); + struct nvme_ns *nsb = container_of(b, struct nvme_ns, list); + + return nsa->ns_id - nsb->ns_id; +} + +static struct nvme_ns *nvme_find_ns(struct nvme_ctrl *ctrl, unsigned nsid) +{ + struct nvme_ns *ns; + + lockdep_assert_held(&ctrl->namespaces_mutex); + + list_for_each_entry(ns, &ctrl->namespaces, list) { + if (ns->ns_id == nsid) + return ns; + if (ns->ns_id > nsid) + break; + } + return NULL; +} + +static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) +{ + struct nvme_ns *ns; + struct gendisk *disk; + int node = dev_to_node(ctrl->dev); + + lockdep_assert_held(&ctrl->namespaces_mutex); + + ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); + if (!ns) + return; + + ns->queue = blk_mq_init_queue(ctrl->tagset); + if (IS_ERR(ns->queue)) + goto out_free_ns; + queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); + ns->queue->queuedata = ns; + ns->ctrl = ctrl; + + disk = alloc_disk_node(0, node); + if (!disk) + goto out_free_queue; + + kref_init(&ns->kref); + ns->ns_id = nsid; + ns->disk = disk; + ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ + + blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); + if (ctrl->max_hw_sectors) { + blk_queue_max_hw_sectors(ns->queue, ctrl->max_hw_sectors); + blk_queue_max_segments(ns->queue, + (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1); + } + if (ctrl->stripe_size) + blk_queue_chunk_sectors(ns->queue, ctrl->stripe_size >> 9); + if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) + blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA); + blk_queue_virt_boundary(ns->queue, ctrl->page_size - 1); + + disk->major = nvme_major; + disk->first_minor = 0; + disk->fops = &nvme_fops; + disk->private_data = ns; + disk->queue = ns->queue; + disk->driverfs_dev = ctrl->device; + disk->flags = GENHD_FL_EXT_DEVT; + sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, nsid); + + if (nvme_revalidate_disk(ns->disk)) + goto out_free_disk; + + list_add_tail(&ns->list, &ctrl->namespaces); + kref_get(&ctrl->kref); + if (ns->type == NVME_NS_LIGHTNVM) + return; + + add_disk(ns->disk); + if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj, + &nvme_ns_attr_group)) + pr_warn("%s: failed to create sysfs group for identification\n", + ns->disk->disk_name); + return; + out_free_disk: + kfree(disk); + out_free_queue: + blk_cleanup_queue(ns->queue); + out_free_ns: + kfree(ns); +} + +static void nvme_ns_remove(struct nvme_ns *ns) +{ + bool kill = nvme_io_incapable(ns->ctrl) && + !blk_queue_dying(ns->queue); + + lockdep_assert_held(&ns->ctrl->namespaces_mutex); + + if (kill) { + blk_set_queue_dying(ns->queue); + + /* + * The controller was shutdown first if we got here through + * device removal. The shutdown may requeue outstanding + * requests. These need to be aborted immediately so + * del_gendisk doesn't block indefinitely for their completion. + */ + blk_mq_abort_requeue_list(ns->queue); + } + if (ns->disk->flags & GENHD_FL_UP) { + if (blk_get_integrity(ns->disk)) + blk_integrity_unregister(ns->disk); + sysfs_remove_group(&disk_to_dev(ns->disk)->kobj, + &nvme_ns_attr_group); + del_gendisk(ns->disk); + } + if (kill || !blk_queue_dying(ns->queue)) { + blk_mq_abort_requeue_list(ns->queue); + blk_cleanup_queue(ns->queue); + } + list_del_init(&ns->list); + nvme_put_ns(ns); +} + +static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid) +{ + struct nvme_ns *ns; + + ns = nvme_find_ns(ctrl, nsid); + if (ns) { + if (revalidate_disk(ns->disk)) + nvme_ns_remove(ns); + } else + nvme_alloc_ns(ctrl, nsid); +} + +static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn) +{ + struct nvme_ns *ns; + __le32 *ns_list; + unsigned i, j, nsid, prev = 0, num_lists = DIV_ROUND_UP(nn, 1024); + int ret = 0; + + ns_list = kzalloc(0x1000, GFP_KERNEL); + if (!ns_list) + return -ENOMEM; + + for (i = 0; i < num_lists; i++) { + ret = nvme_identify_ns_list(ctrl, prev, ns_list); + if (ret) + goto out; + + for (j = 0; j < min(nn, 1024U); j++) { + nsid = le32_to_cpu(ns_list[j]); + if (!nsid) + goto out; + + nvme_validate_ns(ctrl, nsid); + + while (++prev < nsid) { + ns = nvme_find_ns(ctrl, prev); + if (ns) + nvme_ns_remove(ns); + } + } + nn -= j; + } + out: + kfree(ns_list); + return ret; +} + +static void __nvme_scan_namespaces(struct nvme_ctrl *ctrl, unsigned nn) +{ + struct nvme_ns *ns, *next; + unsigned i; + + lockdep_assert_held(&ctrl->namespaces_mutex); + + for (i = 1; i <= nn; i++) + nvme_validate_ns(ctrl, i); + + list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) { + if (ns->ns_id > nn) + nvme_ns_remove(ns); + } +} + +void nvme_scan_namespaces(struct nvme_ctrl *ctrl) +{ + struct nvme_id_ctrl *id; + unsigned nn; + + if (nvme_identify_ctrl(ctrl, &id)) + return; + + mutex_lock(&ctrl->namespaces_mutex); + nn = le32_to_cpu(id->nn); + if (ctrl->vs >= NVME_VS(1, 1) && + !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) { + if (!nvme_scan_ns_list(ctrl, nn)) + goto done; + } + __nvme_scan_namespaces(ctrl, le32_to_cpup(&id->nn)); + done: + list_sort(NULL, &ctrl->namespaces, ns_cmp); + mutex_unlock(&ctrl->namespaces_mutex); + kfree(id); +} + +void nvme_remove_namespaces(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns, *next; + + mutex_lock(&ctrl->namespaces_mutex); + list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) + nvme_ns_remove(ns); + mutex_unlock(&ctrl->namespaces_mutex); +} + +static DEFINE_IDA(nvme_instance_ida); + +static int nvme_set_instance(struct nvme_ctrl *ctrl) +{ + int instance, error; + + do { + if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL)) + return -ENODEV; + + spin_lock(&dev_list_lock); + error = ida_get_new(&nvme_instance_ida, &instance); + spin_unlock(&dev_list_lock); + } while (error == -EAGAIN); + + if (error) + return -ENODEV; + + ctrl->instance = instance; + return 0; +} + +static void nvme_release_instance(struct nvme_ctrl *ctrl) +{ + spin_lock(&dev_list_lock); + ida_remove(&nvme_instance_ida, ctrl->instance); + spin_unlock(&dev_list_lock); +} + +void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) + { + device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance)); + + spin_lock(&dev_list_lock); + list_del(&ctrl->node); + spin_unlock(&dev_list_lock); +} + +static void nvme_free_ctrl(struct kref *kref) +{ + struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref); + + put_device(ctrl->device); + nvme_release_instance(ctrl); + + ctrl->ops->free_ctrl(ctrl); +} + +void nvme_put_ctrl(struct nvme_ctrl *ctrl) +{ + kref_put(&ctrl->kref, nvme_free_ctrl); +} + +/* + * Initialize a NVMe controller structures. This needs to be called during + * earliest initialization so that we have the initialized structured around + * during probing. + */ +int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, + const struct nvme_ctrl_ops *ops, unsigned long quirks) +{ + int ret; + + INIT_LIST_HEAD(&ctrl->namespaces); + mutex_init(&ctrl->namespaces_mutex); + kref_init(&ctrl->kref); + ctrl->dev = dev; + ctrl->ops = ops; + ctrl->quirks = quirks; + + ret = nvme_set_instance(ctrl); + if (ret) + goto out; + + ctrl->device = device_create_with_groups(nvme_class, ctrl->dev, + MKDEV(nvme_char_major, ctrl->instance), + dev, nvme_dev_attr_groups, + "nvme%d", ctrl->instance); + if (IS_ERR(ctrl->device)) { + ret = PTR_ERR(ctrl->device); + goto out_release_instance; + } + get_device(ctrl->device); + dev_set_drvdata(ctrl->device, ctrl); + + spin_lock(&dev_list_lock); + list_add_tail(&ctrl->node, &nvme_ctrl_list); + spin_unlock(&dev_list_lock); + + return 0; +out_release_instance: + nvme_release_instance(ctrl); +out: + return ret; +} + +void nvme_stop_queues(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + mutex_lock(&ctrl->namespaces_mutex); + list_for_each_entry(ns, &ctrl->namespaces, list) { + spin_lock_irq(ns->queue->queue_lock); + queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue); + spin_unlock_irq(ns->queue->queue_lock); + + blk_mq_cancel_requeue_work(ns->queue); + blk_mq_stop_hw_queues(ns->queue); + } + mutex_unlock(&ctrl->namespaces_mutex); +} + +void nvme_start_queues(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + mutex_lock(&ctrl->namespaces_mutex); + list_for_each_entry(ns, &ctrl->namespaces, list) { + queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue); + blk_mq_start_stopped_hw_queues(ns->queue, true); + blk_mq_kick_requeue_list(ns->queue); + } + mutex_unlock(&ctrl->namespaces_mutex); +} + +int __init nvme_core_init(void) +{ + int result; + + result = register_blkdev(nvme_major, "nvme"); + if (result < 0) + return result; + else if (result > 0) + nvme_major = result; + + result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme", + &nvme_dev_fops); + if (result < 0) + goto unregister_blkdev; + else if (result > 0) + nvme_char_major = result; + + nvme_class = class_create(THIS_MODULE, "nvme"); + if (IS_ERR(nvme_class)) { + result = PTR_ERR(nvme_class); + goto unregister_chrdev; + } + + return 0; + + unregister_chrdev: + __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); + unregister_blkdev: + unregister_blkdev(nvme_major, "nvme"); + return result; +} + +void nvme_core_exit(void) +{ + unregister_blkdev(nvme_major, "nvme"); + class_destroy(nvme_class); + __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); +} diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 1af54ea20e7b..5cd3725e2fa4 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -146,6 +146,16 @@ struct nvme_nvm_command { }; }; +struct nvme_nvm_lp_mlc { + __u16 num_pairs; + __u8 pairs[886]; +}; + +struct nvme_nvm_lp_tbl { + __u8 id[8]; + struct nvme_nvm_lp_mlc mlc; +}; + struct nvme_nvm_id_group { __u8 mtype; __u8 fmtype; @@ -169,7 +179,8 @@ struct nvme_nvm_id_group { __le32 mpos; __le32 mccap; __le16 cpar; - __u8 reserved[906]; + __u8 reserved[10]; + struct nvme_nvm_lp_tbl lptbl; } __packed; struct nvme_nvm_addr_format { @@ -266,6 +277,15 @@ static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id) dst->mccap = le32_to_cpu(src->mccap); dst->cpar = le16_to_cpu(src->cpar); + + if (dst->fmtype == NVM_ID_FMTYPE_MLC) { + memcpy(dst->lptbl.id, src->lptbl.id, 8); + dst->lptbl.mlc.num_pairs = + le16_to_cpu(src->lptbl.mlc.num_pairs); + /* 4 bits per pair */ + memcpy(dst->lptbl.mlc.pairs, src->lptbl.mlc.pairs, + dst->lptbl.mlc.num_pairs >> 1); + } } return 0; @@ -274,7 +294,6 @@ static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id) static int nvme_nvm_identity(struct nvm_dev *nvmdev, struct nvm_id *nvm_id) { struct nvme_ns *ns = nvmdev->q->queuedata; - struct nvme_dev *dev = ns->dev; struct nvme_nvm_id *nvme_nvm_id; struct nvme_nvm_command c = {}; int ret; @@ -287,7 +306,7 @@ static int nvme_nvm_identity(struct nvm_dev *nvmdev, struct nvm_id *nvm_id) if (!nvme_nvm_id) return -ENOMEM; - ret = nvme_submit_sync_cmd(dev->admin_q, (struct nvme_command *)&c, + ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c, nvme_nvm_id, sizeof(struct nvme_nvm_id)); if (ret) { ret = -EIO; @@ -312,9 +331,8 @@ static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb, nvm_l2p_update_fn *update_l2p, void *priv) { struct nvme_ns *ns = nvmdev->q->queuedata; - struct nvme_dev *dev = ns->dev; struct nvme_nvm_command c = {}; - u32 len = queue_max_hw_sectors(dev->admin_q) << 9; + u32 len = queue_max_hw_sectors(ns->ctrl->admin_q) << 9; u32 nlb_pr_rq = len / sizeof(u64); u64 cmd_slba = slba; void *entries; @@ -332,10 +350,10 @@ static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb, c.l2p.slba = cpu_to_le64(cmd_slba); c.l2p.nlb = cpu_to_le32(cmd_nlb); - ret = nvme_submit_sync_cmd(dev->admin_q, + ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c, entries, len); if (ret) { - dev_err(dev->dev, "L2P table transfer failed (%d)\n", + dev_err(ns->ctrl->dev, "L2P table transfer failed (%d)\n", ret); ret = -EIO; goto out; @@ -361,7 +379,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, { struct request_queue *q = nvmdev->q; struct nvme_ns *ns = q->queuedata; - struct nvme_dev *dev = ns->dev; + struct nvme_ctrl *ctrl = ns->ctrl; struct nvme_nvm_command c = {}; struct nvme_nvm_bb_tbl *bb_tbl; int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blocks; @@ -375,41 +393,36 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, if (!bb_tbl) return -ENOMEM; - ret = nvme_submit_sync_cmd(dev->admin_q, (struct nvme_command *)&c, + ret = nvme_submit_sync_cmd(ctrl->admin_q, (struct nvme_command *)&c, bb_tbl, tblsz); if (ret) { - dev_err(dev->dev, "get bad block table failed (%d)\n", ret); + dev_err(ctrl->dev, "get bad block table failed (%d)\n", ret); ret = -EIO; goto out; } if (bb_tbl->tblid[0] != 'B' || bb_tbl->tblid[1] != 'B' || bb_tbl->tblid[2] != 'L' || bb_tbl->tblid[3] != 'T') { - dev_err(dev->dev, "bbt format mismatch\n"); + dev_err(ctrl->dev, "bbt format mismatch\n"); ret = -EINVAL; goto out; } if (le16_to_cpu(bb_tbl->verid) != 1) { ret = -EINVAL; - dev_err(dev->dev, "bbt version not supported\n"); + dev_err(ctrl->dev, "bbt version not supported\n"); goto out; } if (le32_to_cpu(bb_tbl->tblks) != nr_blocks) { ret = -EINVAL; - dev_err(dev->dev, "bbt unsuspected blocks returned (%u!=%u)", + dev_err(ctrl->dev, "bbt unsuspected blocks returned (%u!=%u)", le32_to_cpu(bb_tbl->tblks), nr_blocks); goto out; } ppa = dev_to_generic_addr(nvmdev, ppa); ret = update_bbtbl(ppa, nr_blocks, bb_tbl->blk, priv); - if (ret) { - ret = -EINTR; - goto out; - } - out: kfree(bb_tbl); return ret; @@ -419,7 +432,6 @@ static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct nvm_rq *rqd, int type) { struct nvme_ns *ns = nvmdev->q->queuedata; - struct nvme_dev *dev = ns->dev; struct nvme_nvm_command c = {}; int ret = 0; @@ -429,10 +441,10 @@ static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct nvm_rq *rqd, c.set_bb.nlb = cpu_to_le16(rqd->nr_pages - 1); c.set_bb.value = type; - ret = nvme_submit_sync_cmd(dev->admin_q, (struct nvme_command *)&c, + ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c, NULL, 0); if (ret) - dev_err(dev->dev, "set bad block table failed (%d)\n", ret); + dev_err(ns->ctrl->dev, "set bad block table failed (%d)\n", ret); return ret; } @@ -453,11 +465,8 @@ static inline void nvme_nvm_rqtocmd(struct request *rq, struct nvm_rq *rqd, static void nvme_nvm_end_io(struct request *rq, int error) { struct nvm_rq *rqd = rq->end_io_data; - struct nvm_dev *dev = rqd->dev; - if (dev->mt && dev->mt->end_io(rqd, error)) - pr_err("nvme: err status: %x result: %lx\n", - rq->errors, (unsigned long)rq->special); + nvm_end_io(rqd, error); kfree(rq->cmd); blk_mq_free_request(rq); @@ -520,9 +529,8 @@ static int nvme_nvm_erase_block(struct nvm_dev *dev, struct nvm_rq *rqd) static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name) { struct nvme_ns *ns = nvmdev->q->queuedata; - struct nvme_dev *dev = ns->dev; - return dma_pool_create(name, dev->dev, PAGE_SIZE, PAGE_SIZE, 0); + return dma_pool_create(name, ns->ctrl->dev, PAGE_SIZE, PAGE_SIZE, 0); } static void nvme_nvm_destroy_dma_pool(void *pool) @@ -580,8 +588,9 @@ void nvme_nvm_unregister(struct request_queue *q, char *disk_name) int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id) { - struct nvme_dev *dev = ns->dev; - struct pci_dev *pdev = to_pci_dev(dev->dev); + struct nvme_ctrl *ctrl = ns->ctrl; + /* XXX: this is poking into PCI structures from generic code! */ + struct pci_dev *pdev = to_pci_dev(ctrl->dev); /* QEMU NVMe simulator - PCI ID + Vendor specific bit */ if (pdev->vendor == PCI_VENDOR_ID_CNEX && diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 044253dca30a..4fb5bb737868 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -19,58 +19,77 @@ #include <linux/kref.h> #include <linux/blk-mq.h> +enum { + /* + * Driver internal status code for commands that were cancelled due + * to timeouts or controller shutdown. The value is negative so + * that it a) doesn't overlap with the unsigned hardware error codes, + * and b) can easily be tested for. + */ + NVME_SC_CANCELLED = -EINTR, +}; + extern unsigned char nvme_io_timeout; #define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) +extern unsigned char admin_timeout; +#define ADMIN_TIMEOUT (admin_timeout * HZ) + +extern unsigned char shutdown_timeout; +#define SHUTDOWN_TIMEOUT (shutdown_timeout * HZ) + enum { NVME_NS_LBA = 0, NVME_NS_LIGHTNVM = 1, }; /* - * Represents an NVM Express device. Each nvme_dev is a PCI function. + * List of workarounds for devices that required behavior not specified in + * the standard. */ -struct nvme_dev { - struct list_head node; - struct nvme_queue **queues; +enum nvme_quirks { + /* + * Prefers I/O aligned to a stripe size specified in a vendor + * specific Identify field. + */ + NVME_QUIRK_STRIPE_SIZE = (1 << 0), + + /* + * The controller doesn't handle Identify value others than 0 or 1 + * correctly. + */ + NVME_QUIRK_IDENTIFY_CNS = (1 << 1), +}; + +struct nvme_ctrl { + const struct nvme_ctrl_ops *ops; struct request_queue *admin_q; - struct blk_mq_tag_set tagset; - struct blk_mq_tag_set admin_tagset; - u32 __iomem *dbs; struct device *dev; - struct dma_pool *prp_page_pool; - struct dma_pool *prp_small_pool; + struct kref kref; int instance; - unsigned queue_count; - unsigned online_queues; - unsigned max_qid; - int q_depth; - u32 db_stride; - u32 ctrl_config; - struct msix_entry *entry; - struct nvme_bar __iomem *bar; + struct blk_mq_tag_set *tagset; struct list_head namespaces; - struct kref kref; - struct device *device; - struct work_struct reset_work; - struct work_struct probe_work; - struct work_struct scan_work; + struct mutex namespaces_mutex; + struct device *device; /* char device */ + struct list_head node; + char name[12]; char serial[20]; char model[40]; char firmware_rev[8]; - bool subsystem; + + u32 ctrl_config; + + u32 page_size; u32 max_hw_sectors; u32 stripe_size; - u32 page_size; - void __iomem *cmb; - dma_addr_t cmb_dma_addr; - u64 cmb_size; - u32 cmbsz; u16 oncs; - u16 abort_limit; + atomic_t abort_limit; u8 event_limit; u8 vwc; + u32 vs; + bool subsystem; + unsigned long quirks; }; /* @@ -79,11 +98,14 @@ struct nvme_dev { struct nvme_ns { struct list_head list; - struct nvme_dev *dev; + struct nvme_ctrl *ctrl; struct request_queue *queue; struct gendisk *disk; struct kref kref; + u8 eui[8]; + u8 uuid[16]; + unsigned ns_id; int lba_shift; u16 ms; @@ -94,41 +116,156 @@ struct nvme_ns { u32 mode_select_block_len; }; -/* - * The nvme_iod describes the data in an I/O, including the list of PRP - * entries. You can't see it in this data structure because C doesn't let - * me express that. Use nvme_alloc_iod to ensure there's enough space - * allocated to store the PRP list. - */ -struct nvme_iod { - unsigned long private; /* For the use of the submitter of the I/O */ - int npages; /* In the PRP list. 0 means small pool in use */ - int offset; /* Of PRP list */ - int nents; /* Used in scatterlist */ - int length; /* Of data, in bytes */ - dma_addr_t first_dma; - struct scatterlist meta_sg[1]; /* metadata requires single contiguous buffer */ - struct scatterlist sg[0]; +struct nvme_ctrl_ops { + int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); + int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); + int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); + bool (*io_incapable)(struct nvme_ctrl *ctrl); + int (*reset_ctrl)(struct nvme_ctrl *ctrl); + void (*free_ctrl)(struct nvme_ctrl *ctrl); }; +static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl) +{ + u32 val = 0; + + if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &val)) + return false; + return val & NVME_CSTS_RDY; +} + +static inline bool nvme_io_incapable(struct nvme_ctrl *ctrl) +{ + u32 val = 0; + + if (ctrl->ops->io_incapable(ctrl)) + return false; + if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &val)) + return false; + return val & NVME_CSTS_CFS; +} + +static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl) +{ + if (!ctrl->subsystem) + return -ENOTTY; + return ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65); +} + static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector) { return (sector >> (ns->lba_shift - 9)); } +static inline void nvme_setup_flush(struct nvme_ns *ns, + struct nvme_command *cmnd) +{ + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->common.opcode = nvme_cmd_flush; + cmnd->common.nsid = cpu_to_le32(ns->ns_id); +} + +static inline void nvme_setup_rw(struct nvme_ns *ns, struct request *req, + struct nvme_command *cmnd) +{ + u16 control = 0; + u32 dsmgmt = 0; + + if (req->cmd_flags & REQ_FUA) + control |= NVME_RW_FUA; + if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) + control |= NVME_RW_LR; + + if (req->cmd_flags & REQ_RAHEAD) + dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; + + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read); + cmnd->rw.command_id = req->tag; + cmnd->rw.nsid = cpu_to_le32(ns->ns_id); + cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); + cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); + + if (ns->ms) { + switch (ns->pi_type) { + case NVME_NS_DPS_PI_TYPE3: + control |= NVME_RW_PRINFO_PRCHK_GUARD; + break; + case NVME_NS_DPS_PI_TYPE1: + case NVME_NS_DPS_PI_TYPE2: + control |= NVME_RW_PRINFO_PRCHK_GUARD | + NVME_RW_PRINFO_PRCHK_REF; + cmnd->rw.reftag = cpu_to_le32( + nvme_block_nr(ns, blk_rq_pos(req))); + break; + } + if (!blk_integrity_rq(req)) + control |= NVME_RW_PRINFO_PRACT; + } + + cmnd->rw.control = cpu_to_le16(control); + cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); +} + + +static inline int nvme_error_status(u16 status) +{ + switch (status & 0x7ff) { + case NVME_SC_SUCCESS: + return 0; + case NVME_SC_CAP_EXCEEDED: + return -ENOSPC; + default: + return -EIO; + } +} + +static inline bool nvme_req_needs_retry(struct request *req, u16 status) +{ + return !(status & NVME_SC_DNR || blk_noretry_request(req)) && + (jiffies - req->start_time) < req->timeout; +} + +int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap); +int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap); +int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl); +int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, + const struct nvme_ctrl_ops *ops, unsigned long quirks); +void nvme_uninit_ctrl(struct nvme_ctrl *ctrl); +void nvme_put_ctrl(struct nvme_ctrl *ctrl); +int nvme_init_identify(struct nvme_ctrl *ctrl); + +void nvme_scan_namespaces(struct nvme_ctrl *ctrl); +void nvme_remove_namespaces(struct nvme_ctrl *ctrl); + +void nvme_stop_queues(struct nvme_ctrl *ctrl); +void nvme_start_queues(struct nvme_ctrl *ctrl); + +struct request *nvme_alloc_request(struct request_queue *q, + struct nvme_command *cmd, unsigned int flags); +void nvme_requeue_req(struct request *req); int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buf, unsigned bufflen); int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, - void *buffer, void __user *ubuffer, unsigned bufflen, + void *buffer, unsigned bufflen, u32 *result, unsigned timeout); +int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, + void __user *ubuffer, unsigned bufflen, u32 *result, + unsigned timeout); +int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, + void __user *ubuffer, unsigned bufflen, + void __user *meta_buffer, unsigned meta_len, u32 meta_seed, u32 *result, unsigned timeout); -int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id); -int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid, +int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id); +int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, struct nvme_id_ns **id); -int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log); -int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, +int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log); +int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, dma_addr_t dma_addr, u32 *result); -int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, +int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, dma_addr_t dma_addr, u32 *result); +int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); + +extern spinlock_t dev_list_lock; struct sg_io_hdr; @@ -154,4 +291,7 @@ static inline int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *i } #endif /* CONFIG_NVM */ +int __init nvme_core_init(void); +void nvme_core_exit(void); + #endif /* _NVME_H */ diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f5c0e2613c7c..72ef8322d32a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -12,6 +12,7 @@ * more details. */ +#include <linux/aer.h> #include <linux/bitops.h> #include <linux/blkdev.h> #include <linux/blk-mq.h> @@ -28,10 +29,10 @@ #include <linux/kdev_t.h> #include <linux/kthread.h> #include <linux/kernel.h> -#include <linux/list_sort.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/moduleparam.h> +#include <linux/mutex.h> #include <linux/pci.h> #include <linux/poison.h> #include <linux/ptrace.h> @@ -39,23 +40,24 @@ #include <linux/slab.h> #include <linux/t10-pi.h> #include <linux/types.h> -#include <linux/pr.h> -#include <scsi/sg.h> #include <linux/io-64-nonatomic-lo-hi.h> #include <asm/unaligned.h> -#include <uapi/linux/nvme_ioctl.h> #include "nvme.h" -#define NVME_MINORS (1U << MINORBITS) #define NVME_Q_DEPTH 1024 #define NVME_AQ_DEPTH 256 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) -#define ADMIN_TIMEOUT (admin_timeout * HZ) -#define SHUTDOWN_TIMEOUT (shutdown_timeout * HZ) + +/* + * We handle AEN commands ourselves and don't even let the + * block layer know about them. + */ +#define NVME_NR_AEN_COMMANDS 1 +#define NVME_AQ_BLKMQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS) -static unsigned char admin_timeout = 60; +unsigned char admin_timeout = 60; module_param(admin_timeout, byte, 0644); MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands"); @@ -63,16 +65,10 @@ unsigned char nvme_io_timeout = 30; module_param_named(io_timeout, nvme_io_timeout, byte, 0644); MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); -static unsigned char shutdown_timeout = 5; +unsigned char shutdown_timeout = 5; module_param(shutdown_timeout, byte, 0644); MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); -static int nvme_major; -module_param(nvme_major, int, 0); - -static int nvme_char_major; -module_param(nvme_char_major, int, 0); - static int use_threaded_interrupts; module_param(use_threaded_interrupts, int, 0); @@ -80,28 +76,60 @@ static bool use_cmb_sqes = true; module_param(use_cmb_sqes, bool, 0644); MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes"); -static DEFINE_SPINLOCK(dev_list_lock); static LIST_HEAD(dev_list); static struct task_struct *nvme_thread; static struct workqueue_struct *nvme_workq; static wait_queue_head_t nvme_kthread_wait; -static struct class *nvme_class; +struct nvme_dev; +struct nvme_queue; -static int __nvme_reset(struct nvme_dev *dev); static int nvme_reset(struct nvme_dev *dev); static void nvme_process_cq(struct nvme_queue *nvmeq); -static void nvme_dead_ctrl(struct nvme_dev *dev); +static void nvme_remove_dead_ctrl(struct nvme_dev *dev); +static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); -struct async_cmd_info { - struct kthread_work work; - struct kthread_worker *worker; - struct request *req; - u32 result; - int status; - void *ctx; +/* + * Represents an NVM Express device. Each nvme_dev is a PCI function. + */ +struct nvme_dev { + struct list_head node; + struct nvme_queue **queues; + struct blk_mq_tag_set tagset; + struct blk_mq_tag_set admin_tagset; + u32 __iomem *dbs; + struct device *dev; + struct dma_pool *prp_page_pool; + struct dma_pool *prp_small_pool; + unsigned queue_count; + unsigned online_queues; + unsigned max_qid; + int q_depth; + u32 db_stride; + struct msix_entry *entry; + void __iomem *bar; + struct work_struct reset_work; + struct work_struct scan_work; + struct work_struct remove_work; + struct mutex shutdown_lock; + bool subsystem; + void __iomem *cmb; + dma_addr_t cmb_dma_addr; + u64 cmb_size; + u32 cmbsz; + unsigned long flags; + +#define NVME_CTRL_RESETTING 0 + + struct nvme_ctrl ctrl; + struct completion ioq_wait; }; +static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl) +{ + return container_of(ctrl, struct nvme_dev, ctrl); +} + /* * An NVM Express queue. Each device has at least two (one for admin * commands and one for I/O commands). @@ -126,7 +154,24 @@ struct nvme_queue { u16 qid; u8 cq_phase; u8 cqe_seen; - struct async_cmd_info cmdinfo; +}; + +/* + * The nvme_iod describes the data in an I/O, including the list of PRP + * entries. You can't see it in this data structure because C doesn't let + * me express that. Use nvme_init_iod to ensure there's enough space + * allocated to store the PRP list. + */ +struct nvme_iod { + struct nvme_queue *nvmeq; + int aborted; + int npages; /* In the PRP list. 0 means small pool in use */ + int nents; /* Used in scatterlist */ + int length; /* Of data, in bytes */ + dma_addr_t first_dma; + struct scatterlist meta_sg; /* metadata requires single contiguous buffer */ + struct scatterlist *sg; + struct scatterlist inline_sg[0]; }; /* @@ -148,23 +193,11 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); } -typedef void (*nvme_completion_fn)(struct nvme_queue *, void *, - struct nvme_completion *); - -struct nvme_cmd_info { - nvme_completion_fn fn; - void *ctx; - int aborted; - struct nvme_queue *nvmeq; - struct nvme_iod iod[0]; -}; - /* * Max size of iod being embedded in the request payload */ #define NVME_INT_PAGES 2 -#define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->page_size) -#define NVME_INT_MASK 0x01 +#define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->ctrl.page_size) /* * Will slightly overestimate the number of pages needed. This is OK @@ -173,19 +206,22 @@ struct nvme_cmd_info { */ static int nvme_npages(unsigned size, struct nvme_dev *dev) { - unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size); + unsigned nprps = DIV_ROUND_UP(size + dev->ctrl.page_size, + dev->ctrl.page_size); return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); } -static unsigned int nvme_cmd_size(struct nvme_dev *dev) +static unsigned int nvme_iod_alloc_size(struct nvme_dev *dev, + unsigned int size, unsigned int nseg) { - unsigned int ret = sizeof(struct nvme_cmd_info); - - ret += sizeof(struct nvme_iod); - ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev); - ret += sizeof(struct scatterlist) * NVME_INT_PAGES; + return sizeof(__le64 *) * nvme_npages(size, dev) + + sizeof(struct scatterlist) * nseg; +} - return ret; +static unsigned int nvme_cmd_size(struct nvme_dev *dev) +{ + return sizeof(struct nvme_iod) + + nvme_iod_alloc_size(dev, NVME_INT_BYTES(dev), NVME_INT_PAGES); } static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, @@ -215,11 +251,11 @@ static int nvme_admin_init_request(void *data, struct request *req, unsigned int numa_node) { struct nvme_dev *dev = data; - struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct nvme_queue *nvmeq = dev->queues[0]; BUG_ON(!nvmeq); - cmd->nvmeq = nvmeq; + iod->nvmeq = nvmeq; return 0; } @@ -242,148 +278,36 @@ static int nvme_init_request(void *data, struct request *req, unsigned int numa_node) { struct nvme_dev *dev = data; - struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1]; BUG_ON(!nvmeq); - cmd->nvmeq = nvmeq; + iod->nvmeq = nvmeq; return 0; } -static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx, - nvme_completion_fn handler) -{ - cmd->fn = handler; - cmd->ctx = ctx; - cmd->aborted = 0; - blk_mq_start_request(blk_mq_rq_from_pdu(cmd)); -} - -static void *iod_get_private(struct nvme_iod *iod) -{ - return (void *) (iod->private & ~0x1UL); -} - -/* - * If bit 0 is set, the iod is embedded in the request payload. - */ -static bool iod_should_kfree(struct nvme_iod *iod) -{ - return (iod->private & NVME_INT_MASK) == 0; -} - -/* Special values must be less than 0x1000 */ -#define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA) -#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) -#define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) -#define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) - -static void special_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) -{ - if (ctx == CMD_CTX_CANCELLED) - return; - if (ctx == CMD_CTX_COMPLETED) { - dev_warn(nvmeq->q_dmadev, - "completed id %d twice on queue %d\n", - cqe->command_id, le16_to_cpup(&cqe->sq_id)); - return; - } - if (ctx == CMD_CTX_INVALID) { - dev_warn(nvmeq->q_dmadev, - "invalid id %d completed on queue %d\n", - cqe->command_id, le16_to_cpup(&cqe->sq_id)); - return; - } - dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx); -} - -static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn) -{ - void *ctx; - - if (fn) - *fn = cmd->fn; - ctx = cmd->ctx; - cmd->fn = special_completion; - cmd->ctx = CMD_CTX_CANCELLED; - return ctx; -} - -static void async_req_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) +static void nvme_complete_async_event(struct nvme_dev *dev, + struct nvme_completion *cqe) { - u32 result = le32_to_cpup(&cqe->result); - u16 status = le16_to_cpup(&cqe->status) >> 1; + u16 status = le16_to_cpu(cqe->status) >> 1; + u32 result = le32_to_cpu(cqe->result); if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) - ++nvmeq->dev->event_limit; + ++dev->ctrl.event_limit; if (status != NVME_SC_SUCCESS) return; switch (result & 0xff07) { case NVME_AER_NOTICE_NS_CHANGED: - dev_info(nvmeq->q_dmadev, "rescanning\n"); - schedule_work(&nvmeq->dev->scan_work); + dev_info(dev->dev, "rescanning\n"); + queue_work(nvme_workq, &dev->scan_work); default: - dev_warn(nvmeq->q_dmadev, "async event result %08x\n", result); + dev_warn(dev->dev, "async event result %08x\n", result); } } -static void abort_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) -{ - struct request *req = ctx; - - u16 status = le16_to_cpup(&cqe->status) >> 1; - u32 result = le32_to_cpup(&cqe->result); - - blk_mq_free_request(req); - - dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result); - ++nvmeq->dev->abort_limit; -} - -static void async_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) -{ - struct async_cmd_info *cmdinfo = ctx; - cmdinfo->result = le32_to_cpup(&cqe->result); - cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; - queue_kthread_work(cmdinfo->worker, &cmdinfo->work); - blk_mq_free_request(cmdinfo->req); -} - -static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq, - unsigned int tag) -{ - struct request *req = blk_mq_tag_to_rq(*nvmeq->tags, tag); - - return blk_mq_rq_to_pdu(req); -} - -/* - * Called with local interrupts disabled and the q_lock held. May not sleep. - */ -static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag, - nvme_completion_fn *fn) -{ - struct nvme_cmd_info *cmd = get_cmd_from_tag(nvmeq, tag); - void *ctx; - if (tag >= nvmeq->q_depth) { - *fn = special_completion; - return CMD_CTX_INVALID; - } - if (fn) - *fn = cmd->fn; - ctx = cmd->ctx; - cmd->fn = special_completion; - cmd->ctx = CMD_CTX_COMPLETED; - return ctx; -} - /** - * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell + * __nvme_submit_cmd() - Copy a command into a queue and ring the doorbell * @nvmeq: The queue to use * @cmd: The command to send * @@ -405,69 +329,44 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq, nvmeq->sq_tail = tail; } -static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) -{ - unsigned long flags; - spin_lock_irqsave(&nvmeq->q_lock, flags); - __nvme_submit_cmd(nvmeq, cmd); - spin_unlock_irqrestore(&nvmeq->q_lock, flags); -} - -static __le64 **iod_list(struct nvme_iod *iod) +static __le64 **iod_list(struct request *req) { - return ((void *)iod) + iod->offset; + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + return (__le64 **)(iod->sg + req->nr_phys_segments); } -static inline void iod_init(struct nvme_iod *iod, unsigned nbytes, - unsigned nseg, unsigned long private) +static int nvme_init_iod(struct request *rq, struct nvme_dev *dev) { - iod->private = private; - iod->offset = offsetof(struct nvme_iod, sg[nseg]); - iod->npages = -1; - iod->length = nbytes; - iod->nents = 0; -} - -static struct nvme_iod * -__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev, - unsigned long priv, gfp_t gfp) -{ - struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) + - sizeof(__le64 *) * nvme_npages(bytes, dev) + - sizeof(struct scatterlist) * nseg, gfp); - - if (iod) - iod_init(iod, bytes, nseg, priv); - - return iod; -} - -static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev, - gfp_t gfp) -{ - unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(rq) : - sizeof(struct nvme_dsm_range); - struct nvme_iod *iod; + struct nvme_iod *iod = blk_mq_rq_to_pdu(rq); + int nseg = rq->nr_phys_segments; + unsigned size; - if (rq->nr_phys_segments <= NVME_INT_PAGES && - size <= NVME_INT_BYTES(dev)) { - struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq); + if (rq->cmd_flags & REQ_DISCARD) + size = sizeof(struct nvme_dsm_range); + else + size = blk_rq_bytes(rq); - iod = cmd->iod; - iod_init(iod, size, rq->nr_phys_segments, - (unsigned long) rq | NVME_INT_MASK); - return iod; + if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) { + iod->sg = kmalloc(nvme_iod_alloc_size(dev, size, nseg), GFP_ATOMIC); + if (!iod->sg) + return BLK_MQ_RQ_QUEUE_BUSY; + } else { + iod->sg = iod->inline_sg; } - return __nvme_alloc_iod(rq->nr_phys_segments, size, dev, - (unsigned long) rq, gfp); + iod->aborted = 0; + iod->npages = -1; + iod->nents = 0; + iod->length = size; + return 0; } -static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) +static void nvme_free_iod(struct nvme_dev *dev, struct request *req) { - const int last_prp = dev->page_size / 8 - 1; + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + const int last_prp = dev->ctrl.page_size / 8 - 1; int i; - __le64 **list = iod_list(iod); + __le64 **list = iod_list(req); dma_addr_t prp_dma = iod->first_dma; if (iod->npages == 0) @@ -479,20 +378,8 @@ static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) prp_dma = next_prp_dma; } - if (iod_should_kfree(iod)) - kfree(iod); -} - -static int nvme_error_status(u16 status) -{ - switch (status & 0x7ff) { - case NVME_SC_SUCCESS: - return 0; - case NVME_SC_CAP_EXCEEDED: - return -ENOSPC; - default: - return -EIO; - } + if (iod->sg != iod->inline_sg) + kfree(iod->sg); } #ifdef CONFIG_BLK_DEV_INTEGRITY @@ -549,27 +436,6 @@ static void nvme_dif_remap(struct request *req, } kunmap_atomic(pmap); } - -static void nvme_init_integrity(struct nvme_ns *ns) -{ - struct blk_integrity integrity; - - switch (ns->pi_type) { - case NVME_NS_DPS_PI_TYPE3: - integrity.profile = &t10_pi_type3_crc; - break; - case NVME_NS_DPS_PI_TYPE1: - case NVME_NS_DPS_PI_TYPE2: - integrity.profile = &t10_pi_type1_crc; - break; - default: - integrity.profile = NULL; - break; - } - integrity.tuple_size = ns->ms; - blk_integrity_register(ns->disk, &integrity); - blk_queue_max_integrity_segments(ns->queue, 1); -} #else /* CONFIG_BLK_DEV_INTEGRITY */ static void nvme_dif_remap(struct request *req, void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi)) @@ -581,91 +447,27 @@ static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi) static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) { } -static void nvme_init_integrity(struct nvme_ns *ns) -{ -} #endif -static void req_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) -{ - struct nvme_iod *iod = ctx; - struct request *req = iod_get_private(iod); - struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); - u16 status = le16_to_cpup(&cqe->status) >> 1; - bool requeue = false; - int error = 0; - - if (unlikely(status)) { - if (!(status & NVME_SC_DNR || blk_noretry_request(req)) - && (jiffies - req->start_time) < req->timeout) { - unsigned long flags; - - requeue = true; - blk_mq_requeue_request(req); - spin_lock_irqsave(req->q->queue_lock, flags); - if (!blk_queue_stopped(req->q)) - blk_mq_kick_requeue_list(req->q); - spin_unlock_irqrestore(req->q->queue_lock, flags); - goto release_iod; - } - - if (req->cmd_type == REQ_TYPE_DRV_PRIV) { - if (cmd_rq->ctx == CMD_CTX_CANCELLED) - error = -EINTR; - else - error = status; - } else { - error = nvme_error_status(status); - } - } - - if (req->cmd_type == REQ_TYPE_DRV_PRIV) { - u32 result = le32_to_cpup(&cqe->result); - req->special = (void *)(uintptr_t)result; - } - - if (cmd_rq->aborted) - dev_warn(nvmeq->dev->dev, - "completing aborted command with status:%04x\n", - error); - -release_iod: - if (iod->nents) { - dma_unmap_sg(nvmeq->dev->dev, iod->sg, iod->nents, - rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - if (blk_integrity_rq(req)) { - if (!rq_data_dir(req)) - nvme_dif_remap(req, nvme_dif_complete); - dma_unmap_sg(nvmeq->dev->dev, iod->meta_sg, 1, - rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - } - } - nvme_free_iod(nvmeq->dev, iod); - - if (likely(!requeue)) - blk_mq_complete_request(req, error); -} - -/* length is in bytes. gfp flags indicates whether we may sleep. */ -static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, - int total_len, gfp_t gfp) +static bool nvme_setup_prps(struct nvme_dev *dev, struct request *req, + int total_len) { + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct dma_pool *pool; int length = total_len; struct scatterlist *sg = iod->sg; int dma_len = sg_dma_len(sg); u64 dma_addr = sg_dma_address(sg); - u32 page_size = dev->page_size; + u32 page_size = dev->ctrl.page_size; int offset = dma_addr & (page_size - 1); __le64 *prp_list; - __le64 **list = iod_list(iod); + __le64 **list = iod_list(req); dma_addr_t prp_dma; int nprps, i; length -= (page_size - offset); if (length <= 0) - return total_len; + return true; dma_len -= (page_size - offset); if (dma_len) { @@ -678,7 +480,7 @@ static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, if (length <= page_size) { iod->first_dma = dma_addr; - return total_len; + return true; } nprps = DIV_ROUND_UP(length, page_size); @@ -690,11 +492,11 @@ static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, iod->npages = 1; } - prp_list = dma_pool_alloc(pool, gfp, &prp_dma); + prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); if (!prp_list) { iod->first_dma = dma_addr; iod->npages = -1; - return (total_len - length) + page_size; + return false; } list[0] = prp_list; iod->first_dma = prp_dma; @@ -702,9 +504,9 @@ static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, for (;;) { if (i == page_size >> 3) { __le64 *old_prp_list = prp_list; - prp_list = dma_pool_alloc(pool, gfp, &prp_dma); + prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); if (!prp_list) - return total_len - length; + return false; list[iod->npages++] = prp_list; prp_list[0] = old_prp_list[i - 1]; old_prp_list[i - 1] = cpu_to_le64(prp_dma); @@ -724,115 +526,105 @@ static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, dma_len = sg_dma_len(sg); } - return total_len; + return true; } -static void nvme_submit_priv(struct nvme_queue *nvmeq, struct request *req, - struct nvme_iod *iod) +static int nvme_map_data(struct nvme_dev *dev, struct request *req, + struct nvme_command *cmnd) { - struct nvme_command cmnd; + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct request_queue *q = req->q; + enum dma_data_direction dma_dir = rq_data_dir(req) ? + DMA_TO_DEVICE : DMA_FROM_DEVICE; + int ret = BLK_MQ_RQ_QUEUE_ERROR; - memcpy(&cmnd, req->cmd, sizeof(cmnd)); - cmnd.rw.command_id = req->tag; - if (req->nr_phys_segments) { - cmnd.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); - cmnd.rw.prp2 = cpu_to_le64(iod->first_dma); - } + sg_init_table(iod->sg, req->nr_phys_segments); + iod->nents = blk_rq_map_sg(q, req, iod->sg); + if (!iod->nents) + goto out; - __nvme_submit_cmd(nvmeq, &cmnd); -} + ret = BLK_MQ_RQ_QUEUE_BUSY; + if (!dma_map_sg(dev->dev, iod->sg, iod->nents, dma_dir)) + goto out; -/* - * We reuse the small pool to allocate the 16-byte range here as it is not - * worth having a special pool for these or additional cases to handle freeing - * the iod. - */ -static void nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, - struct request *req, struct nvme_iod *iod) -{ - struct nvme_dsm_range *range = - (struct nvme_dsm_range *)iod_list(iod)[0]; - struct nvme_command cmnd; + if (!nvme_setup_prps(dev, req, blk_rq_bytes(req))) + goto out_unmap; - range->cattr = cpu_to_le32(0); - range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift); - range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); + ret = BLK_MQ_RQ_QUEUE_ERROR; + if (blk_integrity_rq(req)) { + if (blk_rq_count_integrity_sg(q, req->bio) != 1) + goto out_unmap; - memset(&cmnd, 0, sizeof(cmnd)); - cmnd.dsm.opcode = nvme_cmd_dsm; - cmnd.dsm.command_id = req->tag; - cmnd.dsm.nsid = cpu_to_le32(ns->ns_id); - cmnd.dsm.prp1 = cpu_to_le64(iod->first_dma); - cmnd.dsm.nr = 0; - cmnd.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); + sg_init_table(&iod->meta_sg, 1); + if (blk_rq_map_integrity_sg(q, req->bio, &iod->meta_sg) != 1) + goto out_unmap; - __nvme_submit_cmd(nvmeq, &cmnd); -} + if (rq_data_dir(req)) + nvme_dif_remap(req, nvme_dif_prep); -static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, - int cmdid) -{ - struct nvme_command cmnd; + if (!dma_map_sg(dev->dev, &iod->meta_sg, 1, dma_dir)) + goto out_unmap; + } - memset(&cmnd, 0, sizeof(cmnd)); - cmnd.common.opcode = nvme_cmd_flush; - cmnd.common.command_id = cmdid; - cmnd.common.nsid = cpu_to_le32(ns->ns_id); + cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); + cmnd->rw.prp2 = cpu_to_le64(iod->first_dma); + if (blk_integrity_rq(req)) + cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg)); + return BLK_MQ_RQ_QUEUE_OK; - __nvme_submit_cmd(nvmeq, &cmnd); +out_unmap: + dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); +out: + return ret; } -static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod, - struct nvme_ns *ns) +static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) { - struct request *req = iod_get_private(iod); - struct nvme_command cmnd; - u16 control = 0; - u32 dsmgmt = 0; - - if (req->cmd_flags & REQ_FUA) - control |= NVME_RW_FUA; - if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) - control |= NVME_RW_LR; - - if (req->cmd_flags & REQ_RAHEAD) - dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; - - memset(&cmnd, 0, sizeof(cmnd)); - cmnd.rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read); - cmnd.rw.command_id = req->tag; - cmnd.rw.nsid = cpu_to_le32(ns->ns_id); - cmnd.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); - cmnd.rw.prp2 = cpu_to_le64(iod->first_dma); - cmnd.rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); - cmnd.rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); - - if (ns->ms) { - switch (ns->pi_type) { - case NVME_NS_DPS_PI_TYPE3: - control |= NVME_RW_PRINFO_PRCHK_GUARD; - break; - case NVME_NS_DPS_PI_TYPE1: - case NVME_NS_DPS_PI_TYPE2: - control |= NVME_RW_PRINFO_PRCHK_GUARD | - NVME_RW_PRINFO_PRCHK_REF; - cmnd.rw.reftag = cpu_to_le32( - nvme_block_nr(ns, blk_rq_pos(req))); - break; + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + enum dma_data_direction dma_dir = rq_data_dir(req) ? + DMA_TO_DEVICE : DMA_FROM_DEVICE; + + if (iod->nents) { + dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); + if (blk_integrity_rq(req)) { + if (!rq_data_dir(req)) + nvme_dif_remap(req, nvme_dif_complete); + dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir); } - if (blk_integrity_rq(req)) - cmnd.rw.metadata = - cpu_to_le64(sg_dma_address(iod->meta_sg)); - else - control |= NVME_RW_PRINFO_PRACT; } - cmnd.rw.control = cpu_to_le16(control); - cmnd.rw.dsmgmt = cpu_to_le32(dsmgmt); + nvme_free_iod(dev, req); +} - __nvme_submit_cmd(nvmeq, &cmnd); +/* + * We reuse the small pool to allocate the 16-byte range here as it is not + * worth having a special pool for these or additional cases to handle freeing + * the iod. + */ +static int nvme_setup_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, + struct request *req, struct nvme_command *cmnd) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_dsm_range *range; - return 0; + range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC, + &iod->first_dma); + if (!range) + return BLK_MQ_RQ_QUEUE_BUSY; + iod_list(req)[0] = (__le64 *)range; + iod->npages = 0; + + range->cattr = cpu_to_le32(0); + range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift); + range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); + + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->dsm.opcode = nvme_cmd_dsm; + cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); + cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma); + cmnd->dsm.nr = 0; + cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); + return BLK_MQ_RQ_QUEUE_OK; } /* @@ -845,9 +637,8 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_queue *nvmeq = hctx->driver_data; struct nvme_dev *dev = nvmeq->dev; struct request *req = bd->rq; - struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); - struct nvme_iod *iod; - enum dma_data_direction dma_dir; + struct nvme_command cmnd; + int ret = BLK_MQ_RQ_QUEUE_OK; /* * If formated with metadata, require the block layer provide a buffer @@ -857,91 +648,72 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, if (ns && ns->ms && !blk_integrity_rq(req)) { if (!(ns->pi_type && ns->ms == 8) && req->cmd_type != REQ_TYPE_DRV_PRIV) { - blk_mq_complete_request(req, -EFAULT); + blk_mq_end_request(req, -EFAULT); return BLK_MQ_RQ_QUEUE_OK; } } - iod = nvme_alloc_iod(req, dev, GFP_ATOMIC); - if (!iod) - return BLK_MQ_RQ_QUEUE_BUSY; + ret = nvme_init_iod(req, dev); + if (ret) + return ret; if (req->cmd_flags & REQ_DISCARD) { - void *range; - /* - * We reuse the small pool to allocate the 16-byte range here - * as it is not worth having a special pool for these or - * additional cases to handle freeing the iod. - */ - range = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC, - &iod->first_dma); - if (!range) - goto retry_cmd; - iod_list(iod)[0] = (__le64 *)range; - iod->npages = 0; - } else if (req->nr_phys_segments) { - dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE; + ret = nvme_setup_discard(nvmeq, ns, req, &cmnd); + } else { + if (req->cmd_type == REQ_TYPE_DRV_PRIV) + memcpy(&cmnd, req->cmd, sizeof(cmnd)); + else if (req->cmd_flags & REQ_FLUSH) + nvme_setup_flush(ns, &cmnd); + else + nvme_setup_rw(ns, req, &cmnd); - sg_init_table(iod->sg, req->nr_phys_segments); - iod->nents = blk_rq_map_sg(req->q, req, iod->sg); - if (!iod->nents) - goto error_cmd; + if (req->nr_phys_segments) + ret = nvme_map_data(dev, req, &cmnd); + } - if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir)) - goto retry_cmd; + if (ret) + goto out; - if (blk_rq_bytes(req) != - nvme_setup_prps(dev, iod, blk_rq_bytes(req), GFP_ATOMIC)) { - dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); - goto retry_cmd; - } - if (blk_integrity_rq(req)) { - if (blk_rq_count_integrity_sg(req->q, req->bio) != 1) { - dma_unmap_sg(dev->dev, iod->sg, iod->nents, - dma_dir); - goto error_cmd; - } + cmnd.common.command_id = req->tag; + blk_mq_start_request(req); - sg_init_table(iod->meta_sg, 1); - if (blk_rq_map_integrity_sg( - req->q, req->bio, iod->meta_sg) != 1) { - dma_unmap_sg(dev->dev, iod->sg, iod->nents, - dma_dir); - goto error_cmd; - } + spin_lock_irq(&nvmeq->q_lock); + __nvme_submit_cmd(nvmeq, &cmnd); + nvme_process_cq(nvmeq); + spin_unlock_irq(&nvmeq->q_lock); + return BLK_MQ_RQ_QUEUE_OK; +out: + nvme_free_iod(dev, req); + return ret; +} - if (rq_data_dir(req)) - nvme_dif_remap(req, nvme_dif_prep); +static void nvme_complete_rq(struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_dev *dev = iod->nvmeq->dev; + int error = 0; - if (!dma_map_sg(nvmeq->q_dmadev, iod->meta_sg, 1, dma_dir)) { - dma_unmap_sg(dev->dev, iod->sg, iod->nents, - dma_dir); - goto error_cmd; - } + nvme_unmap_data(dev, req); + + if (unlikely(req->errors)) { + if (nvme_req_needs_retry(req, req->errors)) { + nvme_requeue_req(req); + return; } - } - nvme_set_info(cmd, iod, req_completion); - spin_lock_irq(&nvmeq->q_lock); - if (req->cmd_type == REQ_TYPE_DRV_PRIV) - nvme_submit_priv(nvmeq, req, iod); - else if (req->cmd_flags & REQ_DISCARD) - nvme_submit_discard(nvmeq, ns, req, iod); - else if (req->cmd_flags & REQ_FLUSH) - nvme_submit_flush(nvmeq, ns, req->tag); - else - nvme_submit_iod(nvmeq, iod, ns); + if (req->cmd_type == REQ_TYPE_DRV_PRIV) + error = req->errors; + else + error = nvme_error_status(req->errors); + } - nvme_process_cq(nvmeq); - spin_unlock_irq(&nvmeq->q_lock); - return BLK_MQ_RQ_QUEUE_OK; + if (unlikely(iod->aborted)) { + dev_warn(dev->dev, + "completing aborted command with status: %04x\n", + req->errors); + } - error_cmd: - nvme_free_iod(dev, iod); - return BLK_MQ_RQ_QUEUE_ERROR; - retry_cmd: - nvme_free_iod(dev, iod); - return BLK_MQ_RQ_QUEUE_BUSY; + blk_mq_end_request(req, error); } static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) @@ -952,20 +724,47 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) phase = nvmeq->cq_phase; for (;;) { - void *ctx; - nvme_completion_fn fn; struct nvme_completion cqe = nvmeq->cqes[head]; - if ((le16_to_cpu(cqe.status) & 1) != phase) + u16 status = le16_to_cpu(cqe.status); + struct request *req; + + if ((status & 1) != phase) break; nvmeq->sq_head = le16_to_cpu(cqe.sq_head); if (++head == nvmeq->q_depth) { head = 0; phase = !phase; } + if (tag && *tag == cqe.command_id) *tag = -1; - ctx = nvme_finish_cmd(nvmeq, cqe.command_id, &fn); - fn(nvmeq, ctx, &cqe); + + if (unlikely(cqe.command_id >= nvmeq->q_depth)) { + dev_warn(nvmeq->q_dmadev, + "invalid id %d completed on queue %d\n", + cqe.command_id, le16_to_cpu(cqe.sq_id)); + continue; + } + + /* + * AEN requests are special as they don't time out and can + * survive any kind of queue freeze and often don't respond to + * aborts. We don't even bother to allocate a struct request + * for them but rather special case them here. + */ + if (unlikely(nvmeq->qid == 0 && + cqe.command_id >= NVME_AQ_BLKMQ_DEPTH)) { + nvme_complete_async_event(nvmeq->dev, &cqe); + continue; + } + + req = blk_mq_tag_to_rq(*nvmeq->tags, cqe.command_id); + if (req->cmd_type == REQ_TYPE_DRV_PRIV) { + u32 result = le32_to_cpu(cqe.result); + req->special = (void *)(uintptr_t)result; + } + blk_mq_complete_request(req, status >> 1); + } /* If the controller ignores the cq head doorbell and continuously @@ -1028,112 +827,15 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) return 0; } -/* - * Returns 0 on success. If the result is negative, it's a Linux error code; - * if the result is positive, it's an NVM Express status code - */ -int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, - void *buffer, void __user *ubuffer, unsigned bufflen, - u32 *result, unsigned timeout) -{ - bool write = cmd->common.opcode & 1; - struct bio *bio = NULL; - struct request *req; - int ret; - - req = blk_mq_alloc_request(q, write, 0); - if (IS_ERR(req)) - return PTR_ERR(req); - - req->cmd_type = REQ_TYPE_DRV_PRIV; - req->cmd_flags |= REQ_FAILFAST_DRIVER; - req->__data_len = 0; - req->__sector = (sector_t) -1; - req->bio = req->biotail = NULL; - - req->timeout = timeout ? timeout : ADMIN_TIMEOUT; - - req->cmd = (unsigned char *)cmd; - req->cmd_len = sizeof(struct nvme_command); - req->special = (void *)0; - - if (buffer && bufflen) { - ret = blk_rq_map_kern(q, req, buffer, bufflen, - __GFP_DIRECT_RECLAIM); - if (ret) - goto out; - } else if (ubuffer && bufflen) { - ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, - __GFP_DIRECT_RECLAIM); - if (ret) - goto out; - bio = req->bio; - } - - blk_execute_rq(req->q, NULL, req, 0); - if (bio) - blk_rq_unmap_user(bio); - if (result) - *result = (u32)(uintptr_t)req->special; - ret = req->errors; - out: - blk_mq_free_request(req); - return ret; -} - -int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, - void *buffer, unsigned bufflen) -{ - return __nvme_submit_sync_cmd(q, cmd, buffer, NULL, bufflen, NULL, 0); -} - -static int nvme_submit_async_admin_req(struct nvme_dev *dev) +static void nvme_submit_async_event(struct nvme_dev *dev) { - struct nvme_queue *nvmeq = dev->queues[0]; struct nvme_command c; - struct nvme_cmd_info *cmd_info; - struct request *req; - - req = blk_mq_alloc_request(dev->admin_q, WRITE, - BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED); - if (IS_ERR(req)) - return PTR_ERR(req); - - req->cmd_flags |= REQ_NO_TIMEOUT; - cmd_info = blk_mq_rq_to_pdu(req); - nvme_set_info(cmd_info, NULL, async_req_completion); memset(&c, 0, sizeof(c)); c.common.opcode = nvme_admin_async_event; - c.common.command_id = req->tag; - - blk_mq_free_request(req); - __nvme_submit_cmd(nvmeq, &c); - return 0; -} - -static int nvme_submit_admin_async_cmd(struct nvme_dev *dev, - struct nvme_command *cmd, - struct async_cmd_info *cmdinfo, unsigned timeout) -{ - struct nvme_queue *nvmeq = dev->queues[0]; - struct request *req; - struct nvme_cmd_info *cmd_rq; - - req = blk_mq_alloc_request(dev->admin_q, WRITE, 0); - if (IS_ERR(req)) - return PTR_ERR(req); - - req->timeout = timeout; - cmd_rq = blk_mq_rq_to_pdu(req); - cmdinfo->req = req; - nvme_set_info(cmd_rq, cmdinfo, async_completion); - cmdinfo->status = -EINTR; + c.common.command_id = NVME_AQ_BLKMQ_DEPTH + --dev->ctrl.event_limit; - cmd->common.command_id = req->tag; - - nvme_submit_cmd(nvmeq, cmd); - return 0; + __nvme_submit_cmd(dev->queues[0], &c); } static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) @@ -1144,7 +846,7 @@ static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) c.delete_queue.opcode = opcode; c.delete_queue.qid = cpu_to_le16(id); - return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0); + return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); } static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, @@ -1165,7 +867,7 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, c.create_cq.cq_flags = cpu_to_le16(flags); c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector); - return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0); + return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); } static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, @@ -1186,7 +888,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, c.create_sq.sq_flags = cpu_to_le16(flags); c.create_sq.cqid = cpu_to_le16(qid); - return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0); + return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); } static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) @@ -1199,195 +901,111 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); } -int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id) -{ - struct nvme_command c = { }; - int error; - - /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ - c.identify.opcode = nvme_admin_identify; - c.identify.cns = cpu_to_le32(1); - - *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL); - if (!*id) - return -ENOMEM; - - error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, - sizeof(struct nvme_id_ctrl)); - if (error) - kfree(*id); - return error; -} - -int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid, - struct nvme_id_ns **id) -{ - struct nvme_command c = { }; - int error; - - /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ - c.identify.opcode = nvme_admin_identify, - c.identify.nsid = cpu_to_le32(nsid), - - *id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL); - if (!*id) - return -ENOMEM; - - error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, - sizeof(struct nvme_id_ns)); - if (error) - kfree(*id); - return error; -} - -int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, - dma_addr_t dma_addr, u32 *result) -{ - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.features.opcode = nvme_admin_get_features; - c.features.nsid = cpu_to_le32(nsid); - c.features.prp1 = cpu_to_le64(dma_addr); - c.features.fid = cpu_to_le32(fid); - - return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0, - result, 0); -} - -int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, - dma_addr_t dma_addr, u32 *result) -{ - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.features.opcode = nvme_admin_set_features; - c.features.prp1 = cpu_to_le64(dma_addr); - c.features.fid = cpu_to_le32(fid); - c.features.dword11 = cpu_to_le32(dword11); - - return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0, - result, 0); -} - -int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log) +static void abort_endio(struct request *req, int error) { - struct nvme_command c = { }; - int error; - - c.common.opcode = nvme_admin_get_log_page, - c.common.nsid = cpu_to_le32(0xFFFFFFFF), - c.common.cdw10[0] = cpu_to_le32( - (((sizeof(struct nvme_smart_log) / 4) - 1) << 16) | - NVME_LOG_SMART), + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = iod->nvmeq; + u32 result = (u32)(uintptr_t)req->special; + u16 status = req->errors; - *log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL); - if (!*log) - return -ENOMEM; + dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result); + atomic_inc(&nvmeq->dev->ctrl.abort_limit); - error = nvme_submit_sync_cmd(dev->admin_q, &c, *log, - sizeof(struct nvme_smart_log)); - if (error) - kfree(*log); - return error; + blk_mq_free_request(req); } -/** - * nvme_abort_req - Attempt aborting a request - * - * Schedule controller reset if the command was already aborted once before and - * still hasn't been returned to the driver, or if this is the admin queue. - */ -static void nvme_abort_req(struct request *req) +static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) { - struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); - struct nvme_queue *nvmeq = cmd_rq->nvmeq; + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = iod->nvmeq; struct nvme_dev *dev = nvmeq->dev; struct request *abort_req; - struct nvme_cmd_info *abort_cmd; struct nvme_command cmd; - if (!nvmeq->qid || cmd_rq->aborted) { - spin_lock(&dev_list_lock); - if (!__nvme_reset(dev)) { - dev_warn(dev->dev, - "I/O %d QID %d timeout, reset controller\n", - req->tag, nvmeq->qid); - } - spin_unlock(&dev_list_lock); - return; + /* + * Shutdown immediately if controller times out while starting. The + * reset work will see the pci device disabled when it gets the forced + * cancellation error. All outstanding requests are completed on + * shutdown, so we return BLK_EH_HANDLED. + */ + if (test_bit(NVME_CTRL_RESETTING, &dev->flags)) { + dev_warn(dev->dev, + "I/O %d QID %d timeout, disable controller\n", + req->tag, nvmeq->qid); + nvme_dev_disable(dev, false); + req->errors = NVME_SC_CANCELLED; + return BLK_EH_HANDLED; } - if (!dev->abort_limit) - return; + /* + * Shutdown the controller immediately and schedule a reset if the + * command was already aborted once before and still hasn't been + * returned to the driver, or if this is the admin queue. + */ + if (!nvmeq->qid || iod->aborted) { + dev_warn(dev->dev, + "I/O %d QID %d timeout, reset controller\n", + req->tag, nvmeq->qid); + nvme_dev_disable(dev, false); + queue_work(nvme_workq, &dev->reset_work); - abort_req = blk_mq_alloc_request(dev->admin_q, WRITE, - BLK_MQ_REQ_NOWAIT); - if (IS_ERR(abort_req)) - return; + /* + * Mark the request as handled, since the inline shutdown + * forces all outstanding requests to complete. + */ + req->errors = NVME_SC_CANCELLED; + return BLK_EH_HANDLED; + } - abort_cmd = blk_mq_rq_to_pdu(abort_req); - nvme_set_info(abort_cmd, abort_req, abort_completion); + iod->aborted = 1; + + if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) { + atomic_inc(&dev->ctrl.abort_limit); + return BLK_EH_RESET_TIMER; + } memset(&cmd, 0, sizeof(cmd)); cmd.abort.opcode = nvme_admin_abort_cmd; cmd.abort.cid = req->tag; cmd.abort.sqid = cpu_to_le16(nvmeq->qid); - cmd.abort.command_id = abort_req->tag; - --dev->abort_limit; - cmd_rq->aborted = 1; + dev_warn(nvmeq->q_dmadev, "I/O %d QID %d timeout, aborting\n", + req->tag, nvmeq->qid); - dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", req->tag, - nvmeq->qid); - nvme_submit_cmd(dev->queues[0], &cmd); + abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd, + BLK_MQ_REQ_NOWAIT); + if (IS_ERR(abort_req)) { + atomic_inc(&dev->ctrl.abort_limit); + return BLK_EH_RESET_TIMER; + } + + abort_req->timeout = ADMIN_TIMEOUT; + abort_req->end_io_data = NULL; + blk_execute_rq_nowait(abort_req->q, NULL, abort_req, 0, abort_endio); + + /* + * The aborted req will be completed on receiving the abort req. + * We enable the timer again. If hit twice, it'll cause a device reset, + * as the device then is in a faulty state. + */ + return BLK_EH_RESET_TIMER; } static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved) { struct nvme_queue *nvmeq = data; - void *ctx; - nvme_completion_fn fn; - struct nvme_cmd_info *cmd; - struct nvme_completion cqe; + int status; if (!blk_mq_request_started(req)) return; - cmd = blk_mq_rq_to_pdu(req); - - if (cmd->ctx == CMD_CTX_CANCELLED) - return; + dev_warn(nvmeq->q_dmadev, + "Cancelling I/O %d QID %d\n", req->tag, nvmeq->qid); + status = NVME_SC_ABORT_REQ; if (blk_queue_dying(req->q)) - cqe.status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1); - else - cqe.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1); - - - dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", - req->tag, nvmeq->qid); - ctx = cancel_cmd_info(cmd, &fn); - fn(nvmeq, ctx, &cqe); -} - -static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) -{ - struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); - struct nvme_queue *nvmeq = cmd->nvmeq; - - dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag, - nvmeq->qid); - spin_lock_irq(&nvmeq->q_lock); - nvme_abort_req(req); - spin_unlock_irq(&nvmeq->q_lock); - - /* - * The aborted req will be completed on receiving the abort req. - * We enable the timer again. If hit twice, it'll cause a device reset, - * as the device then is in a faulty state. - */ - return BLK_EH_RESET_TIMER; + status |= NVME_SC_DNR; + blk_mq_complete_request(req, status); } static void nvme_free_queue(struct nvme_queue *nvmeq) @@ -1430,8 +1048,8 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) nvmeq->cq_vector = -1; spin_unlock_irq(&nvmeq->q_lock); - if (!nvmeq->qid && nvmeq->dev->admin_q) - blk_mq_freeze_queue_start(nvmeq->dev->admin_q); + if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) + blk_mq_stop_hw_queues(nvmeq->dev->ctrl.admin_q); irq_set_affinity_hint(vector, NULL); free_irq(vector, nvmeq); @@ -1447,21 +1065,20 @@ static void nvme_clear_queue(struct nvme_queue *nvmeq) spin_unlock_irq(&nvmeq->q_lock); } -static void nvme_disable_queue(struct nvme_dev *dev, int qid) +static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) { - struct nvme_queue *nvmeq = dev->queues[qid]; + struct nvme_queue *nvmeq = dev->queues[0]; if (!nvmeq) return; if (nvme_suspend_queue(nvmeq)) return; - /* Don't tell the adapter to delete the admin queue. - * Don't tell a removed adapter to delete IO queues. */ - if (qid && readl(&dev->bar->csts) != -1) { - adapter_delete_sq(dev, qid); - adapter_delete_cq(dev, qid); - } + if (shutdown) + nvme_shutdown_ctrl(&dev->ctrl); + else + nvme_disable_ctrl(&dev->ctrl, lo_hi_readq( + dev->bar + NVME_REG_CAP)); spin_lock_irq(&nvmeq->q_lock); nvme_process_cq(nvmeq); @@ -1472,11 +1089,12 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, int entry_size) { int q_depth = dev->q_depth; - unsigned q_size_aligned = roundup(q_depth * entry_size, dev->page_size); + unsigned q_size_aligned = roundup(q_depth * entry_size, + dev->ctrl.page_size); if (q_size_aligned * nr_io_queues > dev->cmb_size) { u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues); - mem_per_q = round_down(mem_per_q, dev->page_size); + mem_per_q = round_down(mem_per_q, dev->ctrl.page_size); q_depth = div_u64(mem_per_q, entry_size); /* @@ -1495,8 +1113,8 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, int qid, int depth) { if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) { - unsigned offset = (qid - 1) * - roundup(SQ_SIZE(depth), dev->page_size); + unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth), + dev->ctrl.page_size); nvmeq->sq_dma_addr = dev->cmb_dma_addr + offset; nvmeq->sq_cmds_io = dev->cmb + offset; } else { @@ -1527,7 +1145,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, nvmeq->q_dmadev = dev->dev; nvmeq->dev = dev; snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d", - dev->instance, qid); + dev->ctrl.instance, qid); spin_lock_init(&nvmeq->q_lock); nvmeq->cq_head = 0; nvmeq->cq_phase = 1; @@ -1604,79 +1222,9 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) return result; } -static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled) -{ - unsigned long timeout; - u32 bit = enabled ? NVME_CSTS_RDY : 0; - - timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; - - while ((readl(&dev->bar->csts) & NVME_CSTS_RDY) != bit) { - msleep(100); - if (fatal_signal_pending(current)) - return -EINTR; - if (time_after(jiffies, timeout)) { - dev_err(dev->dev, - "Device not ready; aborting %s\n", enabled ? - "initialisation" : "reset"); - return -ENODEV; - } - } - - return 0; -} - -/* - * If the device has been passed off to us in an enabled state, just clear - * the enabled bit. The spec says we should set the 'shutdown notification - * bits', but doing so may cause the device to complete commands to the - * admin queue ... and we don't know what memory that might be pointing at! - */ -static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap) -{ - dev->ctrl_config &= ~NVME_CC_SHN_MASK; - dev->ctrl_config &= ~NVME_CC_ENABLE; - writel(dev->ctrl_config, &dev->bar->cc); - - return nvme_wait_ready(dev, cap, false); -} - -static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap) -{ - dev->ctrl_config &= ~NVME_CC_SHN_MASK; - dev->ctrl_config |= NVME_CC_ENABLE; - writel(dev->ctrl_config, &dev->bar->cc); - - return nvme_wait_ready(dev, cap, true); -} - -static int nvme_shutdown_ctrl(struct nvme_dev *dev) -{ - unsigned long timeout; - - dev->ctrl_config &= ~NVME_CC_SHN_MASK; - dev->ctrl_config |= NVME_CC_SHN_NORMAL; - - writel(dev->ctrl_config, &dev->bar->cc); - - timeout = SHUTDOWN_TIMEOUT + jiffies; - while ((readl(&dev->bar->csts) & NVME_CSTS_SHST_MASK) != - NVME_CSTS_SHST_CMPLT) { - msleep(100); - if (fatal_signal_pending(current)) - return -EINTR; - if (time_after(jiffies, timeout)) { - dev_err(dev->dev, - "Device shutdown incomplete; abort shutdown\n"); - return -ENODEV; - } - } - - return 0; -} - static struct blk_mq_ops nvme_mq_admin_ops = { .queue_rq = nvme_queue_rq, + .complete = nvme_complete_rq, .map_queue = blk_mq_map_queue, .init_hctx = nvme_admin_init_hctx, .exit_hctx = nvme_admin_exit_hctx, @@ -1686,6 +1234,7 @@ static struct blk_mq_ops nvme_mq_admin_ops = { static struct blk_mq_ops nvme_mq_ops = { .queue_rq = nvme_queue_rq, + .complete = nvme_complete_rq, .map_queue = blk_mq_map_queue, .init_hctx = nvme_init_hctx, .init_request = nvme_init_request, @@ -1695,19 +1244,23 @@ static struct blk_mq_ops nvme_mq_ops = { static void nvme_dev_remove_admin(struct nvme_dev *dev) { - if (dev->admin_q && !blk_queue_dying(dev->admin_q)) { - blk_cleanup_queue(dev->admin_q); + if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) { + blk_cleanup_queue(dev->ctrl.admin_q); blk_mq_free_tag_set(&dev->admin_tagset); } } static int nvme_alloc_admin_tags(struct nvme_dev *dev) { - if (!dev->admin_q) { + if (!dev->ctrl.admin_q) { dev->admin_tagset.ops = &nvme_mq_admin_ops; dev->admin_tagset.nr_hw_queues = 1; - dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1; - dev->admin_tagset.reserved_tags = 1; + + /* + * Subtract one to leave an empty queue entry for 'Full Queue' + * condition. See NVM-Express 1.2 specification, section 4.1.2. + */ + dev->admin_tagset.queue_depth = NVME_AQ_BLKMQ_DEPTH - 1; dev->admin_tagset.timeout = ADMIN_TIMEOUT; dev->admin_tagset.numa_node = dev_to_node(dev->dev); dev->admin_tagset.cmd_size = nvme_cmd_size(dev); @@ -1716,18 +1269,18 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) if (blk_mq_alloc_tag_set(&dev->admin_tagset)) return -ENOMEM; - dev->admin_q = blk_mq_init_queue(&dev->admin_tagset); - if (IS_ERR(dev->admin_q)) { + dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset); + if (IS_ERR(dev->ctrl.admin_q)) { blk_mq_free_tag_set(&dev->admin_tagset); return -ENOMEM; } - if (!blk_get_queue(dev->admin_q)) { + if (!blk_get_queue(dev->ctrl.admin_q)) { nvme_dev_remove_admin(dev); - dev->admin_q = NULL; + dev->ctrl.admin_q = NULL; return -ENODEV; } } else - blk_mq_unfreeze_queue(dev->admin_q); + blk_mq_start_stopped_hw_queues(dev->ctrl.admin_q, true); return 0; } @@ -1736,31 +1289,17 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) { int result; u32 aqa; - u64 cap = lo_hi_readq(&dev->bar->cap); + u64 cap = lo_hi_readq(dev->bar + NVME_REG_CAP); struct nvme_queue *nvmeq; - /* - * default to a 4K page size, with the intention to update this - * path in the future to accomodate architectures with differing - * kernel and IO page sizes. - */ - unsigned page_shift = 12; - unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12; - - if (page_shift < dev_page_min) { - dev_err(dev->dev, - "Minimum device page size (%u) too large for " - "host (%u)\n", 1 << dev_page_min, - 1 << page_shift); - return -ENODEV; - } - dev->subsystem = readl(&dev->bar->vs) >= NVME_VS(1, 1) ? + dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1) ? NVME_CAP_NSSRC(cap) : 0; - if (dev->subsystem && (readl(&dev->bar->csts) & NVME_CSTS_NSSRO)) - writel(NVME_CSTS_NSSRO, &dev->bar->csts); + if (dev->subsystem && + (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO)) + writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS); - result = nvme_disable_ctrl(dev, cap); + result = nvme_disable_ctrl(&dev->ctrl, cap); if (result < 0) return result; @@ -1774,18 +1313,11 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) aqa = nvmeq->q_depth - 1; aqa |= aqa << 16; - dev->page_size = 1 << page_shift; - - dev->ctrl_config = NVME_CC_CSS_NVM; - dev->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; - dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; - dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; - - writel(aqa, &dev->bar->aqa); - lo_hi_writeq(nvmeq->sq_dma_addr, &dev->bar->asq); - lo_hi_writeq(nvmeq->cq_dma_addr, &dev->bar->acq); + writel(aqa, dev->bar + NVME_REG_AQA); + lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ); + lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ); - result = nvme_enable_ctrl(dev, cap); + result = nvme_enable_ctrl(&dev->ctrl, cap); if (result) goto free_nvmeq; @@ -1803,406 +1335,6 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) return result; } -static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) -{ - struct nvme_dev *dev = ns->dev; - struct nvme_user_io io; - struct nvme_command c; - unsigned length, meta_len; - int status, write; - dma_addr_t meta_dma = 0; - void *meta = NULL; - void __user *metadata; - - if (copy_from_user(&io, uio, sizeof(io))) - return -EFAULT; - - switch (io.opcode) { - case nvme_cmd_write: - case nvme_cmd_read: - case nvme_cmd_compare: - break; - default: - return -EINVAL; - } - - length = (io.nblocks + 1) << ns->lba_shift; - meta_len = (io.nblocks + 1) * ns->ms; - metadata = (void __user *)(uintptr_t)io.metadata; - write = io.opcode & 1; - - if (ns->ext) { - length += meta_len; - meta_len = 0; - } - if (meta_len) { - if (((io.metadata & 3) || !io.metadata) && !ns->ext) - return -EINVAL; - - meta = dma_alloc_coherent(dev->dev, meta_len, - &meta_dma, GFP_KERNEL); - - if (!meta) { - status = -ENOMEM; - goto unmap; - } - if (write) { - if (copy_from_user(meta, metadata, meta_len)) { - status = -EFAULT; - goto unmap; - } - } - } - - memset(&c, 0, sizeof(c)); - c.rw.opcode = io.opcode; - c.rw.flags = io.flags; - c.rw.nsid = cpu_to_le32(ns->ns_id); - c.rw.slba = cpu_to_le64(io.slba); - c.rw.length = cpu_to_le16(io.nblocks); - c.rw.control = cpu_to_le16(io.control); - c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); - c.rw.reftag = cpu_to_le32(io.reftag); - c.rw.apptag = cpu_to_le16(io.apptag); - c.rw.appmask = cpu_to_le16(io.appmask); - c.rw.metadata = cpu_to_le64(meta_dma); - - status = __nvme_submit_sync_cmd(ns->queue, &c, NULL, - (void __user *)(uintptr_t)io.addr, length, NULL, 0); - unmap: - if (meta) { - if (status == NVME_SC_SUCCESS && !write) { - if (copy_to_user(metadata, meta, meta_len)) - status = -EFAULT; - } - dma_free_coherent(dev->dev, meta_len, meta, meta_dma); - } - return status; -} - -static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns, - struct nvme_passthru_cmd __user *ucmd) -{ - struct nvme_passthru_cmd cmd; - struct nvme_command c; - unsigned timeout = 0; - int status; - - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (copy_from_user(&cmd, ucmd, sizeof(cmd))) - return -EFAULT; - - memset(&c, 0, sizeof(c)); - c.common.opcode = cmd.opcode; - c.common.flags = cmd.flags; - c.common.nsid = cpu_to_le32(cmd.nsid); - c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); - c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); - c.common.cdw10[0] = cpu_to_le32(cmd.cdw10); - c.common.cdw10[1] = cpu_to_le32(cmd.cdw11); - c.common.cdw10[2] = cpu_to_le32(cmd.cdw12); - c.common.cdw10[3] = cpu_to_le32(cmd.cdw13); - c.common.cdw10[4] = cpu_to_le32(cmd.cdw14); - c.common.cdw10[5] = cpu_to_le32(cmd.cdw15); - - if (cmd.timeout_ms) - timeout = msecs_to_jiffies(cmd.timeout_ms); - - status = __nvme_submit_sync_cmd(ns ? ns->queue : dev->admin_q, &c, - NULL, (void __user *)(uintptr_t)cmd.addr, cmd.data_len, - &cmd.result, timeout); - if (status >= 0) { - if (put_user(cmd.result, &ucmd->result)) - return -EFAULT; - } - - return status; -} - -static int nvme_subsys_reset(struct nvme_dev *dev) -{ - if (!dev->subsystem) - return -ENOTTY; - - writel(0x4E564D65, &dev->bar->nssr); /* "NVMe" */ - return 0; -} - -static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, - unsigned long arg) -{ - struct nvme_ns *ns = bdev->bd_disk->private_data; - - switch (cmd) { - case NVME_IOCTL_ID: - force_successful_syscall_return(); - return ns->ns_id; - case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(ns->dev, NULL, (void __user *)arg); - case NVME_IOCTL_IO_CMD: - return nvme_user_cmd(ns->dev, ns, (void __user *)arg); - case NVME_IOCTL_SUBMIT_IO: - return nvme_submit_io(ns, (void __user *)arg); - case SG_GET_VERSION_NUM: - return nvme_sg_get_version_num((void __user *)arg); - case SG_IO: - return nvme_sg_io(ns, (void __user *)arg); - default: - return -ENOTTY; - } -} - -#ifdef CONFIG_COMPAT -static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) -{ - switch (cmd) { - case SG_IO: - return -ENOIOCTLCMD; - } - return nvme_ioctl(bdev, mode, cmd, arg); -} -#else -#define nvme_compat_ioctl NULL -#endif - -static void nvme_free_dev(struct kref *kref); -static void nvme_free_ns(struct kref *kref) -{ - struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref); - - if (ns->type == NVME_NS_LIGHTNVM) - nvme_nvm_unregister(ns->queue, ns->disk->disk_name); - - spin_lock(&dev_list_lock); - ns->disk->private_data = NULL; - spin_unlock(&dev_list_lock); - - kref_put(&ns->dev->kref, nvme_free_dev); - put_disk(ns->disk); - kfree(ns); -} - -static int nvme_open(struct block_device *bdev, fmode_t mode) -{ - int ret = 0; - struct nvme_ns *ns; - - spin_lock(&dev_list_lock); - ns = bdev->bd_disk->private_data; - if (!ns) - ret = -ENXIO; - else if (!kref_get_unless_zero(&ns->kref)) - ret = -ENXIO; - spin_unlock(&dev_list_lock); - - return ret; -} - -static void nvme_release(struct gendisk *disk, fmode_t mode) -{ - struct nvme_ns *ns = disk->private_data; - kref_put(&ns->kref, nvme_free_ns); -} - -static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo) -{ - /* some standard values */ - geo->heads = 1 << 6; - geo->sectors = 1 << 5; - geo->cylinders = get_capacity(bd->bd_disk) >> 11; - return 0; -} - -static void nvme_config_discard(struct nvme_ns *ns) -{ - u32 logical_block_size = queue_logical_block_size(ns->queue); - ns->queue->limits.discard_zeroes_data = 0; - ns->queue->limits.discard_alignment = logical_block_size; - ns->queue->limits.discard_granularity = logical_block_size; - blk_queue_max_discard_sectors(ns->queue, 0xffffffff); - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); -} - -static int nvme_revalidate_disk(struct gendisk *disk) -{ - struct nvme_ns *ns = disk->private_data; - struct nvme_dev *dev = ns->dev; - struct nvme_id_ns *id; - u8 lbaf, pi_type; - u16 old_ms; - unsigned short bs; - - if (nvme_identify_ns(dev, ns->ns_id, &id)) { - dev_warn(dev->dev, "%s: Identify failure nvme%dn%d\n", __func__, - dev->instance, ns->ns_id); - return -ENODEV; - } - if (id->ncap == 0) { - kfree(id); - return -ENODEV; - } - - if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) { - if (nvme_nvm_register(ns->queue, disk->disk_name)) { - dev_warn(dev->dev, - "%s: LightNVM init failure\n", __func__); - kfree(id); - return -ENODEV; - } - ns->type = NVME_NS_LIGHTNVM; - } - - old_ms = ns->ms; - lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; - ns->lba_shift = id->lbaf[lbaf].ds; - ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); - ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT); - - /* - * If identify namespace failed, use default 512 byte block size so - * block layer can use before failing read/write for 0 capacity. - */ - if (ns->lba_shift == 0) - ns->lba_shift = 9; - bs = 1 << ns->lba_shift; - - /* XXX: PI implementation requires metadata equal t10 pi tuple size */ - pi_type = ns->ms == sizeof(struct t10_pi_tuple) ? - id->dps & NVME_NS_DPS_PI_MASK : 0; - - blk_mq_freeze_queue(disk->queue); - if (blk_get_integrity(disk) && (ns->pi_type != pi_type || - ns->ms != old_ms || - bs != queue_logical_block_size(disk->queue) || - (ns->ms && ns->ext))) - blk_integrity_unregister(disk); - - ns->pi_type = pi_type; - blk_queue_logical_block_size(ns->queue, bs); - - if (ns->ms && !ns->ext) - nvme_init_integrity(ns); - - if ((ns->ms && !(ns->ms == 8 && ns->pi_type) && - !blk_get_integrity(disk)) || - ns->type == NVME_NS_LIGHTNVM) - set_capacity(disk, 0); - else - set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); - - if (dev->oncs & NVME_CTRL_ONCS_DSM) - nvme_config_discard(ns); - blk_mq_unfreeze_queue(disk->queue); - - kfree(id); - return 0; -} - -static char nvme_pr_type(enum pr_type type) -{ - switch (type) { - case PR_WRITE_EXCLUSIVE: - return 1; - case PR_EXCLUSIVE_ACCESS: - return 2; - case PR_WRITE_EXCLUSIVE_REG_ONLY: - return 3; - case PR_EXCLUSIVE_ACCESS_REG_ONLY: - return 4; - case PR_WRITE_EXCLUSIVE_ALL_REGS: - return 5; - case PR_EXCLUSIVE_ACCESS_ALL_REGS: - return 6; - default: - return 0; - } -}; - -static int nvme_pr_command(struct block_device *bdev, u32 cdw10, - u64 key, u64 sa_key, u8 op) -{ - struct nvme_ns *ns = bdev->bd_disk->private_data; - struct nvme_command c; - u8 data[16] = { 0, }; - - put_unaligned_le64(key, &data[0]); - put_unaligned_le64(sa_key, &data[8]); - - memset(&c, 0, sizeof(c)); - c.common.opcode = op; - c.common.nsid = cpu_to_le32(ns->ns_id); - c.common.cdw10[0] = cpu_to_le32(cdw10); - - return nvme_submit_sync_cmd(ns->queue, &c, data, 16); -} - -static int nvme_pr_register(struct block_device *bdev, u64 old, - u64 new, unsigned flags) -{ - u32 cdw10; - - if (flags & ~PR_FL_IGNORE_KEY) - return -EOPNOTSUPP; - - cdw10 = old ? 2 : 0; - cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0; - cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */ - return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register); -} - -static int nvme_pr_reserve(struct block_device *bdev, u64 key, - enum pr_type type, unsigned flags) -{ - u32 cdw10; - - if (flags & ~PR_FL_IGNORE_KEY) - return -EOPNOTSUPP; - - cdw10 = nvme_pr_type(type) << 8; - cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0); - return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire); -} - -static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new, - enum pr_type type, bool abort) -{ - u32 cdw10 = nvme_pr_type(type) << 8 | abort ? 2 : 1; - return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire); -} - -static int nvme_pr_clear(struct block_device *bdev, u64 key) -{ - u32 cdw10 = 1 | (key ? 1 << 3 : 0); - return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register); -} - -static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type) -{ - u32 cdw10 = nvme_pr_type(type) << 8 | key ? 1 << 3 : 0; - return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release); -} - -static const struct pr_ops nvme_pr_ops = { - .pr_register = nvme_pr_register, - .pr_reserve = nvme_pr_reserve, - .pr_release = nvme_pr_release, - .pr_preempt = nvme_pr_preempt, - .pr_clear = nvme_pr_clear, -}; - -static const struct block_device_operations nvme_fops = { - .owner = THIS_MODULE, - .ioctl = nvme_ioctl, - .compat_ioctl = nvme_compat_ioctl, - .open = nvme_open, - .release = nvme_release, - .getgeo = nvme_getgeo, - .revalidate_disk= nvme_revalidate_disk, - .pr_ops = &nvme_pr_ops, -}; - static int nvme_kthread(void *data) { struct nvme_dev *dev, *next; @@ -2212,14 +1344,20 @@ static int nvme_kthread(void *data) spin_lock(&dev_list_lock); list_for_each_entry_safe(dev, next, &dev_list, node) { int i; - u32 csts = readl(&dev->bar->csts); + u32 csts = readl(dev->bar + NVME_REG_CSTS); + + /* + * Skip controllers currently under reset. + */ + if (work_pending(&dev->reset_work) || work_busy(&dev->reset_work)) + continue; if ((dev->subsystem && (csts & NVME_CSTS_NSSRO)) || csts & NVME_CSTS_CFS) { - if (!__nvme_reset(dev)) { + if (queue_work(nvme_workq, &dev->reset_work)) { dev_warn(dev->dev, "Failed status: %x, reset controller\n", - readl(&dev->bar->csts)); + readl(dev->bar + NVME_REG_CSTS)); } continue; } @@ -2230,11 +1368,8 @@ static int nvme_kthread(void *data) spin_lock_irq(&nvmeq->q_lock); nvme_process_cq(nvmeq); - while ((i == 0) && (dev->event_limit > 0)) { - if (nvme_submit_async_admin_req(dev)) - break; - dev->event_limit--; - } + while (i == 0 && dev->ctrl.event_limit > 0) + nvme_submit_async_event(dev); spin_unlock_irq(&nvmeq->q_lock); } } @@ -2244,127 +1379,33 @@ static int nvme_kthread(void *data) return 0; } -static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid) -{ - struct nvme_ns *ns; - struct gendisk *disk; - int node = dev_to_node(dev->dev); - - ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); - if (!ns) - return; - - ns->queue = blk_mq_init_queue(&dev->tagset); - if (IS_ERR(ns->queue)) - goto out_free_ns; - queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); - queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); - ns->dev = dev; - ns->queue->queuedata = ns; - - disk = alloc_disk_node(0, node); - if (!disk) - goto out_free_queue; - - kref_init(&ns->kref); - ns->ns_id = nsid; - ns->disk = disk; - ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ - list_add_tail(&ns->list, &dev->namespaces); - - blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); - if (dev->max_hw_sectors) { - blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); - blk_queue_max_segments(ns->queue, - (dev->max_hw_sectors / (dev->page_size >> 9)) + 1); - } - if (dev->stripe_size) - blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9); - if (dev->vwc & NVME_CTRL_VWC_PRESENT) - blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA); - blk_queue_virt_boundary(ns->queue, dev->page_size - 1); - - disk->major = nvme_major; - disk->first_minor = 0; - disk->fops = &nvme_fops; - disk->private_data = ns; - disk->queue = ns->queue; - disk->driverfs_dev = dev->device; - disk->flags = GENHD_FL_EXT_DEVT; - sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid); - - /* - * Initialize capacity to 0 until we establish the namespace format and - * setup integrity extentions if necessary. The revalidate_disk after - * add_disk allows the driver to register with integrity if the format - * requires it. - */ - set_capacity(disk, 0); - if (nvme_revalidate_disk(ns->disk)) - goto out_free_disk; - - kref_get(&dev->kref); - if (ns->type != NVME_NS_LIGHTNVM) { - add_disk(ns->disk); - if (ns->ms) { - struct block_device *bd = bdget_disk(ns->disk, 0); - if (!bd) - return; - if (blkdev_get(bd, FMODE_READ, NULL)) { - bdput(bd); - return; - } - blkdev_reread_part(bd); - blkdev_put(bd, FMODE_READ); - } - } - return; - out_free_disk: - kfree(disk); - list_del(&ns->list); - out_free_queue: - blk_cleanup_queue(ns->queue); - out_free_ns: - kfree(ns); -} - -/* - * Create I/O queues. Failing to create an I/O queue is not an issue, - * we can continue with less than the desired amount of queues, and - * even a controller without I/O queues an still be used to issue - * admin commands. This might be useful to upgrade a buggy firmware - * for example. - */ -static void nvme_create_io_queues(struct nvme_dev *dev) +static int nvme_create_io_queues(struct nvme_dev *dev) { unsigned i; + int ret = 0; - for (i = dev->queue_count; i <= dev->max_qid; i++) - if (!nvme_alloc_queue(dev, i, dev->q_depth)) + for (i = dev->queue_count; i <= dev->max_qid; i++) { + if (!nvme_alloc_queue(dev, i, dev->q_depth)) { + ret = -ENOMEM; break; + } + } - for (i = dev->online_queues; i <= dev->queue_count - 1; i++) - if (nvme_create_queue(dev->queues[i], i)) { + for (i = dev->online_queues; i <= dev->queue_count - 1; i++) { + ret = nvme_create_queue(dev->queues[i], i); + if (ret) { nvme_free_queues(dev, i); break; } -} - -static int set_queue_count(struct nvme_dev *dev, int count) -{ - int status; - u32 result; - u32 q_count = (count - 1) | ((count - 1) << 16); - - status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0, - &result); - if (status < 0) - return status; - if (status > 0) { - dev_err(dev->dev, "Could not set queue count (%d)\n", status); - return 0; } - return min(result & 0xffff, result >> 16) + 1; + + /* + * Ignore failing Create SQ/CQ commands, we can continue with less + * than the desired aount of queues, and even a controller without + * I/O queues an still be used to issue admin commands. This might + * be useful to upgrade a buggy firmware for example. + */ + return ret >= 0 ? 0 : ret; } static void __iomem *nvme_map_cmb(struct nvme_dev *dev) @@ -2379,11 +1420,11 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev) if (!use_cmb_sqes) return NULL; - dev->cmbsz = readl(&dev->bar->cmbsz); + dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ); if (!(NVME_CMB_SZ(dev->cmbsz))) return NULL; - cmbloc = readl(&dev->bar->cmbloc); + cmbloc = readl(dev->bar + NVME_REG_CMBLOC); szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz)); size = szu * NVME_CMB_SZ(dev->cmbsz); @@ -2431,11 +1472,20 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) int result, i, vecs, nr_io_queues, size; nr_io_queues = num_possible_cpus(); - result = set_queue_count(dev, nr_io_queues); - if (result <= 0) + result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); + if (result < 0) return result; - if (result < nr_io_queues) - nr_io_queues = result; + + /* + * Degraded controllers might return an error when setting the queue + * count. We still want to be able to bring them online and offer + * access to the admin queue, as that might be only way to fix them up. + */ + if (result > 0) { + dev_err(dev->dev, "Could not set queue count (%d)\n", result); + nr_io_queues = 0; + result = 0; + } if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) { result = nvme_cmb_qdepth(dev, nr_io_queues, @@ -2457,7 +1507,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) return -ENOMEM; size = db_bar_size(dev, nr_io_queues); } while (1); - dev->dbs = ((void __iomem *)dev->bar) + 4096; + dev->dbs = dev->bar + 4096; adminq->q_db = dev->dbs; } @@ -2501,115 +1551,115 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) /* Free previously allocated queues that are no longer usable */ nvme_free_queues(dev, nr_io_queues + 1); - nvme_create_io_queues(dev); - - return 0; + return nvme_create_io_queues(dev); free_queues: nvme_free_queues(dev, 1); return result; } -static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) +static void nvme_set_irq_hints(struct nvme_dev *dev) { - struct nvme_ns *nsa = container_of(a, struct nvme_ns, list); - struct nvme_ns *nsb = container_of(b, struct nvme_ns, list); + struct nvme_queue *nvmeq; + int i; - return nsa->ns_id - nsb->ns_id; -} + for (i = 0; i < dev->online_queues; i++) { + nvmeq = dev->queues[i]; -static struct nvme_ns *nvme_find_ns(struct nvme_dev *dev, unsigned nsid) -{ - struct nvme_ns *ns; + if (!nvmeq->tags || !(*nvmeq->tags)) + continue; - list_for_each_entry(ns, &dev->namespaces, list) { - if (ns->ns_id == nsid) - return ns; - if (ns->ns_id > nsid) - break; + irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector, + blk_mq_tags_cpumask(*nvmeq->tags)); } - return NULL; } -static inline bool nvme_io_incapable(struct nvme_dev *dev) +static void nvme_dev_scan(struct work_struct *work) { - return (!dev->bar || readl(&dev->bar->csts) & NVME_CSTS_CFS || - dev->online_queues < 2); + struct nvme_dev *dev = container_of(work, struct nvme_dev, scan_work); + + if (!dev->tagset.tags) + return; + nvme_scan_namespaces(&dev->ctrl); + nvme_set_irq_hints(dev); } -static void nvme_ns_remove(struct nvme_ns *ns) +static void nvme_del_queue_end(struct request *req, int error) { - bool kill = nvme_io_incapable(ns->dev) && !blk_queue_dying(ns->queue); + struct nvme_queue *nvmeq = req->end_io_data; - if (kill) { - blk_set_queue_dying(ns->queue); - - /* - * The controller was shutdown first if we got here through - * device removal. The shutdown may requeue outstanding - * requests. These need to be aborted immediately so - * del_gendisk doesn't block indefinitely for their completion. - */ - blk_mq_abort_requeue_list(ns->queue); - } - if (ns->disk->flags & GENHD_FL_UP) - del_gendisk(ns->disk); - if (kill || !blk_queue_dying(ns->queue)) { - blk_mq_abort_requeue_list(ns->queue); - blk_cleanup_queue(ns->queue); - } - list_del_init(&ns->list); - kref_put(&ns->kref, nvme_free_ns); + blk_mq_free_request(req); + complete(&nvmeq->dev->ioq_wait); } -static void nvme_scan_namespaces(struct nvme_dev *dev, unsigned nn) +static void nvme_del_cq_end(struct request *req, int error) { - struct nvme_ns *ns, *next; - unsigned i; + struct nvme_queue *nvmeq = req->end_io_data; - for (i = 1; i <= nn; i++) { - ns = nvme_find_ns(dev, i); - if (ns) { - if (revalidate_disk(ns->disk)) - nvme_ns_remove(ns); - } else - nvme_alloc_ns(dev, i); - } - list_for_each_entry_safe(ns, next, &dev->namespaces, list) { - if (ns->ns_id > nn) - nvme_ns_remove(ns); + if (!error) { + unsigned long flags; + + spin_lock_irqsave(&nvmeq->q_lock, flags); + nvme_process_cq(nvmeq); + spin_unlock_irqrestore(&nvmeq->q_lock, flags); } - list_sort(NULL, &dev->namespaces, ns_cmp); + + nvme_del_queue_end(req, error); } -static void nvme_set_irq_hints(struct nvme_dev *dev) +static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) { - struct nvme_queue *nvmeq; - int i; + struct request_queue *q = nvmeq->dev->ctrl.admin_q; + struct request *req; + struct nvme_command cmd; - for (i = 0; i < dev->online_queues; i++) { - nvmeq = dev->queues[i]; + memset(&cmd, 0, sizeof(cmd)); + cmd.delete_queue.opcode = opcode; + cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid); - if (!nvmeq->tags || !(*nvmeq->tags)) - continue; + req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT); + if (IS_ERR(req)) + return PTR_ERR(req); - irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector, - blk_mq_tags_cpumask(*nvmeq->tags)); - } + req->timeout = ADMIN_TIMEOUT; + req->end_io_data = nvmeq; + + blk_execute_rq_nowait(q, NULL, req, false, + opcode == nvme_admin_delete_cq ? + nvme_del_cq_end : nvme_del_queue_end); + return 0; } -static void nvme_dev_scan(struct work_struct *work) +static void nvme_disable_io_queues(struct nvme_dev *dev) { - struct nvme_dev *dev = container_of(work, struct nvme_dev, scan_work); - struct nvme_id_ctrl *ctrl; + int pass; + unsigned long timeout; + u8 opcode = nvme_admin_delete_sq; - if (!dev->tagset.tags) - return; - if (nvme_identify_ctrl(dev, &ctrl)) - return; - nvme_scan_namespaces(dev, le32_to_cpup(&ctrl->nn)); - kfree(ctrl); - nvme_set_irq_hints(dev); + for (pass = 0; pass < 2; pass++) { + int sent = 0, i = dev->queue_count - 1; + + reinit_completion(&dev->ioq_wait); + retry: + timeout = ADMIN_TIMEOUT; + for (; i > 0; i--) { + struct nvme_queue *nvmeq = dev->queues[i]; + + if (!pass) + nvme_suspend_queue(nvmeq); + if (nvme_delete_queue(nvmeq, opcode)) + break; + ++sent; + } + while (sent--) { + timeout = wait_for_completion_io_timeout(&dev->ioq_wait, timeout); + if (timeout == 0) + return; + if (i) + goto retry; + } + opcode = nvme_admin_delete_cq; + } } /* @@ -2620,42 +1670,7 @@ static void nvme_dev_scan(struct work_struct *work) */ static int nvme_dev_add(struct nvme_dev *dev) { - struct pci_dev *pdev = to_pci_dev(dev->dev); - int res; - struct nvme_id_ctrl *ctrl; - int shift = NVME_CAP_MPSMIN(lo_hi_readq(&dev->bar->cap)) + 12; - - res = nvme_identify_ctrl(dev, &ctrl); - if (res) { - dev_err(dev->dev, "Identify Controller failed (%d)\n", res); - return -EIO; - } - - dev->oncs = le16_to_cpup(&ctrl->oncs); - dev->abort_limit = ctrl->acl + 1; - dev->vwc = ctrl->vwc; - memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); - memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); - memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); - if (ctrl->mdts) - dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9); - else - dev->max_hw_sectors = UINT_MAX; - if ((pdev->vendor == PCI_VENDOR_ID_INTEL) && - (pdev->device == 0x0953) && ctrl->vs[3]) { - unsigned int max_hw_sectors; - - dev->stripe_size = 1 << (ctrl->vs[3] + shift); - max_hw_sectors = dev->stripe_size >> (shift - 9); - if (dev->max_hw_sectors) { - dev->max_hw_sectors = min(max_hw_sectors, - dev->max_hw_sectors); - } else - dev->max_hw_sectors = max_hw_sectors; - } - kfree(ctrl); - - if (!dev->tagset.tags) { + if (!dev->ctrl.tagset) { dev->tagset.ops = &nvme_mq_ops; dev->tagset.nr_hw_queues = dev->online_queues - 1; dev->tagset.timeout = NVME_IO_TIMEOUT; @@ -2668,8 +1683,9 @@ static int nvme_dev_add(struct nvme_dev *dev) if (blk_mq_alloc_tag_set(&dev->tagset)) return 0; + dev->ctrl.tagset = &dev->tagset; } - schedule_work(&dev->scan_work); + queue_work(nvme_workq, &dev->scan_work); return 0; } @@ -2699,7 +1715,7 @@ static int nvme_dev_map(struct nvme_dev *dev) if (!dev->bar) goto disable; - if (readl(&dev->bar->csts) == -1) { + if (readl(dev->bar + NVME_REG_CSTS) == -1) { result = -ENODEV; goto unmap; } @@ -2714,10 +1730,11 @@ static int nvme_dev_map(struct nvme_dev *dev) goto unmap; } - cap = lo_hi_readq(&dev->bar->cap); + cap = lo_hi_readq(dev->bar + NVME_REG_CAP); + dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH); dev->db_stride = 1 << NVME_CAP_STRIDE(cap); - dev->dbs = ((void __iomem *)dev->bar) + 4096; + dev->dbs = dev->bar + 4096; /* * Temporary fix for the Apple controller found in the MacBook8,1 and @@ -2730,9 +1747,11 @@ static int nvme_dev_map(struct nvme_dev *dev) dev->q_depth); } - if (readl(&dev->bar->vs) >= NVME_VS(1, 2)) + if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2)) dev->cmb = nvme_map_cmb(dev); + pci_enable_pcie_error_reporting(pdev); + pci_save_state(pdev); return 0; unmap: @@ -2760,152 +1779,34 @@ static void nvme_dev_unmap(struct nvme_dev *dev) pci_release_regions(pdev); } - if (pci_is_enabled(pdev)) + if (pci_is_enabled(pdev)) { + pci_disable_pcie_error_reporting(pdev); pci_disable_device(pdev); -} - -struct nvme_delq_ctx { - struct task_struct *waiter; - struct kthread_worker *worker; - atomic_t refcount; -}; - -static void nvme_wait_dq(struct nvme_delq_ctx *dq, struct nvme_dev *dev) -{ - dq->waiter = current; - mb(); - - for (;;) { - set_current_state(TASK_KILLABLE); - if (!atomic_read(&dq->refcount)) - break; - if (!schedule_timeout(ADMIN_TIMEOUT) || - fatal_signal_pending(current)) { - /* - * Disable the controller first since we can't trust it - * at this point, but leave the admin queue enabled - * until all queue deletion requests are flushed. - * FIXME: This may take a while if there are more h/w - * queues than admin tags. - */ - set_current_state(TASK_RUNNING); - nvme_disable_ctrl(dev, lo_hi_readq(&dev->bar->cap)); - nvme_clear_queue(dev->queues[0]); - flush_kthread_worker(dq->worker); - nvme_disable_queue(dev, 0); - return; - } } - set_current_state(TASK_RUNNING); -} - -static void nvme_put_dq(struct nvme_delq_ctx *dq) -{ - atomic_dec(&dq->refcount); - if (dq->waiter) - wake_up_process(dq->waiter); -} - -static struct nvme_delq_ctx *nvme_get_dq(struct nvme_delq_ctx *dq) -{ - atomic_inc(&dq->refcount); - return dq; -} - -static void nvme_del_queue_end(struct nvme_queue *nvmeq) -{ - struct nvme_delq_ctx *dq = nvmeq->cmdinfo.ctx; - nvme_put_dq(dq); - - spin_lock_irq(&nvmeq->q_lock); - nvme_process_cq(nvmeq); - spin_unlock_irq(&nvmeq->q_lock); -} - -static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode, - kthread_work_func_t fn) -{ - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.delete_queue.opcode = opcode; - c.delete_queue.qid = cpu_to_le16(nvmeq->qid); - - init_kthread_work(&nvmeq->cmdinfo.work, fn); - return nvme_submit_admin_async_cmd(nvmeq->dev, &c, &nvmeq->cmdinfo, - ADMIN_TIMEOUT); -} - -static void nvme_del_cq_work_handler(struct kthread_work *work) -{ - struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, - cmdinfo.work); - nvme_del_queue_end(nvmeq); -} - -static int nvme_delete_cq(struct nvme_queue *nvmeq) -{ - return adapter_async_del_queue(nvmeq, nvme_admin_delete_cq, - nvme_del_cq_work_handler); -} - -static void nvme_del_sq_work_handler(struct kthread_work *work) -{ - struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, - cmdinfo.work); - int status = nvmeq->cmdinfo.status; - - if (!status) - status = nvme_delete_cq(nvmeq); - if (status) - nvme_del_queue_end(nvmeq); -} - -static int nvme_delete_sq(struct nvme_queue *nvmeq) -{ - return adapter_async_del_queue(nvmeq, nvme_admin_delete_sq, - nvme_del_sq_work_handler); } -static void nvme_del_queue_start(struct kthread_work *work) +static int nvme_dev_list_add(struct nvme_dev *dev) { - struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, - cmdinfo.work); - if (nvme_delete_sq(nvmeq)) - nvme_del_queue_end(nvmeq); -} + bool start_thread = false; -static void nvme_disable_io_queues(struct nvme_dev *dev) -{ - int i; - DEFINE_KTHREAD_WORKER_ONSTACK(worker); - struct nvme_delq_ctx dq; - struct task_struct *kworker_task = kthread_run(kthread_worker_fn, - &worker, "nvme%d", dev->instance); - - if (IS_ERR(kworker_task)) { - dev_err(dev->dev, - "Failed to create queue del task\n"); - for (i = dev->queue_count - 1; i > 0; i--) - nvme_disable_queue(dev, i); - return; + spin_lock(&dev_list_lock); + if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) { + start_thread = true; + nvme_thread = NULL; } + list_add(&dev->node, &dev_list); + spin_unlock(&dev_list_lock); - dq.waiter = NULL; - atomic_set(&dq.refcount, 0); - dq.worker = &worker; - for (i = dev->queue_count - 1; i > 0; i--) { - struct nvme_queue *nvmeq = dev->queues[i]; + if (start_thread) { + nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); + wake_up_all(&nvme_kthread_wait); + } else + wait_event_killable(nvme_kthread_wait, nvme_thread); - if (nvme_suspend_queue(nvmeq)) - continue; - nvmeq->cmdinfo.ctx = nvme_get_dq(&dq); - nvmeq->cmdinfo.worker = dq.worker; - init_kthread_work(&nvmeq->cmdinfo.work, nvme_del_queue_start); - queue_kthread_work(dq.worker, &nvmeq->cmdinfo.work); - } - nvme_wait_dq(&dq, dev); - kthread_stop(kworker_task); + if (IS_ERR_OR_NULL(nvme_thread)) + return nvme_thread ? PTR_ERR(nvme_thread) : -EINTR; + + return 0; } /* @@ -2928,44 +1829,17 @@ static void nvme_dev_list_remove(struct nvme_dev *dev) kthread_stop(tmp); } -static void nvme_freeze_queues(struct nvme_dev *dev) -{ - struct nvme_ns *ns; - - list_for_each_entry(ns, &dev->namespaces, list) { - blk_mq_freeze_queue_start(ns->queue); - - spin_lock_irq(ns->queue->queue_lock); - queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue); - spin_unlock_irq(ns->queue->queue_lock); - - blk_mq_cancel_requeue_work(ns->queue); - blk_mq_stop_hw_queues(ns->queue); - } -} - -static void nvme_unfreeze_queues(struct nvme_dev *dev) -{ - struct nvme_ns *ns; - - list_for_each_entry(ns, &dev->namespaces, list) { - queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue); - blk_mq_unfreeze_queue(ns->queue); - blk_mq_start_stopped_hw_queues(ns->queue, true); - blk_mq_kick_requeue_list(ns->queue); - } -} - -static void nvme_dev_shutdown(struct nvme_dev *dev) +static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) { int i; u32 csts = -1; nvme_dev_list_remove(dev); + mutex_lock(&dev->shutdown_lock); if (dev->bar) { - nvme_freeze_queues(dev); - csts = readl(&dev->bar->csts); + nvme_stop_queues(&dev->ctrl); + csts = readl(dev->bar + NVME_REG_CSTS); } if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) { for (i = dev->queue_count - 1; i >= 0; i--) { @@ -2974,30 +1848,13 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) } } else { nvme_disable_io_queues(dev); - nvme_shutdown_ctrl(dev); - nvme_disable_queue(dev, 0); + nvme_disable_admin_queue(dev, shutdown); } nvme_dev_unmap(dev); for (i = dev->queue_count - 1; i >= 0; i--) nvme_clear_queue(dev->queues[i]); -} - -static void nvme_dev_remove(struct nvme_dev *dev) -{ - struct nvme_ns *ns, *next; - - if (nvme_io_incapable(dev)) { - /* - * If the device is not capable of IO (surprise hot-removal, - * for example), we need to quiesce prior to deleting the - * namespaces. This will end outstanding requests and prevent - * attempts to sync dirty data. - */ - nvme_dev_shutdown(dev); - } - list_for_each_entry_safe(ns, next, &dev->namespaces, list) - nvme_ns_remove(ns); + mutex_unlock(&dev->shutdown_lock); } static int nvme_setup_prp_pools(struct nvme_dev *dev) @@ -3023,119 +1880,36 @@ static void nvme_release_prp_pools(struct nvme_dev *dev) dma_pool_destroy(dev->prp_small_pool); } -static DEFINE_IDA(nvme_instance_ida); - -static int nvme_set_instance(struct nvme_dev *dev) -{ - int instance, error; - - do { - if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL)) - return -ENODEV; - - spin_lock(&dev_list_lock); - error = ida_get_new(&nvme_instance_ida, &instance); - spin_unlock(&dev_list_lock); - } while (error == -EAGAIN); - - if (error) - return -ENODEV; - - dev->instance = instance; - return 0; -} - -static void nvme_release_instance(struct nvme_dev *dev) +static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) { - spin_lock(&dev_list_lock); - ida_remove(&nvme_instance_ida, dev->instance); - spin_unlock(&dev_list_lock); -} - -static void nvme_free_dev(struct kref *kref) -{ - struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref); + struct nvme_dev *dev = to_nvme_dev(ctrl); put_device(dev->dev); - put_device(dev->device); - nvme_release_instance(dev); if (dev->tagset.tags) blk_mq_free_tag_set(&dev->tagset); - if (dev->admin_q) - blk_put_queue(dev->admin_q); + if (dev->ctrl.admin_q) + blk_put_queue(dev->ctrl.admin_q); kfree(dev->queues); kfree(dev->entry); kfree(dev); } -static int nvme_dev_open(struct inode *inode, struct file *f) +static void nvme_reset_work(struct work_struct *work) { - struct nvme_dev *dev; - int instance = iminor(inode); - int ret = -ENODEV; - - spin_lock(&dev_list_lock); - list_for_each_entry(dev, &dev_list, node) { - if (dev->instance == instance) { - if (!dev->admin_q) { - ret = -EWOULDBLOCK; - break; - } - if (!kref_get_unless_zero(&dev->kref)) - break; - f->private_data = dev; - ret = 0; - break; - } - } - spin_unlock(&dev_list_lock); - - return ret; -} + struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work); + int result; -static int nvme_dev_release(struct inode *inode, struct file *f) -{ - struct nvme_dev *dev = f->private_data; - kref_put(&dev->kref, nvme_free_dev); - return 0; -} + if (WARN_ON(test_bit(NVME_CTRL_RESETTING, &dev->flags))) + goto out; -static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) -{ - struct nvme_dev *dev = f->private_data; - struct nvme_ns *ns; - - switch (cmd) { - case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(dev, NULL, (void __user *)arg); - case NVME_IOCTL_IO_CMD: - if (list_empty(&dev->namespaces)) - return -ENOTTY; - ns = list_first_entry(&dev->namespaces, struct nvme_ns, list); - return nvme_user_cmd(dev, ns, (void __user *)arg); - case NVME_IOCTL_RESET: - dev_warn(dev->dev, "resetting controller\n"); - return nvme_reset(dev); - case NVME_IOCTL_SUBSYS_RESET: - return nvme_subsys_reset(dev); - default: - return -ENOTTY; - } -} + /* + * If we're called to reset a live controller first shut it down before + * moving on. + */ + if (dev->bar) + nvme_dev_disable(dev, false); -static const struct file_operations nvme_dev_fops = { - .owner = THIS_MODULE, - .open = nvme_dev_open, - .release = nvme_dev_release, - .unlocked_ioctl = nvme_dev_ioctl, - .compat_ioctl = nvme_dev_ioctl, -}; - -static void nvme_probe_work(struct work_struct *work) -{ - struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work); - bool start_thread = false; - int result; + set_bit(NVME_CTRL_RESETTING, &dev->flags); result = nvme_dev_map(dev); if (result) @@ -3145,35 +1919,24 @@ static void nvme_probe_work(struct work_struct *work) if (result) goto unmap; - spin_lock(&dev_list_lock); - if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) { - start_thread = true; - nvme_thread = NULL; - } - list_add(&dev->node, &dev_list); - spin_unlock(&dev_list_lock); - - if (start_thread) { - nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); - wake_up_all(&nvme_kthread_wait); - } else - wait_event_killable(nvme_kthread_wait, nvme_thread); - - if (IS_ERR_OR_NULL(nvme_thread)) { - result = nvme_thread ? PTR_ERR(nvme_thread) : -EINTR; - goto disable; - } - nvme_init_queue(dev->queues[0], 0); result = nvme_alloc_admin_tags(dev); if (result) goto disable; + result = nvme_init_identify(&dev->ctrl); + if (result) + goto free_tags; + result = nvme_setup_io_queues(dev); if (result) goto free_tags; - dev->event_limit = 1; + dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS; + + result = nvme_dev_list_add(dev); + if (result) + goto remove; /* * Keep the controller around but remove all namespaces if we don't have @@ -3181,117 +1944,98 @@ static void nvme_probe_work(struct work_struct *work) */ if (dev->online_queues < 2) { dev_warn(dev->dev, "IO queues not created\n"); - nvme_dev_remove(dev); + nvme_remove_namespaces(&dev->ctrl); } else { - nvme_unfreeze_queues(dev); + nvme_start_queues(&dev->ctrl); nvme_dev_add(dev); } + clear_bit(NVME_CTRL_RESETTING, &dev->flags); return; + remove: + nvme_dev_list_remove(dev); free_tags: nvme_dev_remove_admin(dev); - blk_put_queue(dev->admin_q); - dev->admin_q = NULL; + blk_put_queue(dev->ctrl.admin_q); + dev->ctrl.admin_q = NULL; dev->queues[0]->tags = NULL; disable: - nvme_disable_queue(dev, 0); - nvme_dev_list_remove(dev); + nvme_disable_admin_queue(dev, false); unmap: nvme_dev_unmap(dev); out: - if (!work_busy(&dev->reset_work)) - nvme_dead_ctrl(dev); + nvme_remove_dead_ctrl(dev); } -static int nvme_remove_dead_ctrl(void *arg) +static void nvme_remove_dead_ctrl_work(struct work_struct *work) { - struct nvme_dev *dev = (struct nvme_dev *)arg; + struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work); struct pci_dev *pdev = to_pci_dev(dev->dev); if (pci_get_drvdata(pdev)) pci_stop_and_remove_bus_device_locked(pdev); - kref_put(&dev->kref, nvme_free_dev); - return 0; + nvme_put_ctrl(&dev->ctrl); } -static void nvme_dead_ctrl(struct nvme_dev *dev) +static void nvme_remove_dead_ctrl(struct nvme_dev *dev) { - dev_warn(dev->dev, "Device failed to resume\n"); - kref_get(&dev->kref); - if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d", - dev->instance))) { - dev_err(dev->dev, - "Failed to start controller remove task\n"); - kref_put(&dev->kref, nvme_free_dev); - } + dev_warn(dev->dev, "Removing after probe failure\n"); + kref_get(&dev->ctrl.kref); + if (!schedule_work(&dev->remove_work)) + nvme_put_ctrl(&dev->ctrl); } -static void nvme_reset_work(struct work_struct *ws) +static int nvme_reset(struct nvme_dev *dev) { - struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work); - bool in_probe = work_busy(&dev->probe_work); - - nvme_dev_shutdown(dev); + if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q)) + return -ENODEV; - /* Synchronize with device probe so that work will see failure status - * and exit gracefully without trying to schedule another reset */ - flush_work(&dev->probe_work); + if (!queue_work(nvme_workq, &dev->reset_work)) + return -EBUSY; - /* Fail this device if reset occured during probe to avoid - * infinite initialization loops. */ - if (in_probe) { - nvme_dead_ctrl(dev); - return; - } - /* Schedule device resume asynchronously so the reset work is available - * to cleanup errors that may occur during reinitialization */ - schedule_work(&dev->probe_work); + flush_work(&dev->reset_work); + return 0; } -static int __nvme_reset(struct nvme_dev *dev) +static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) { - if (work_pending(&dev->reset_work)) - return -EBUSY; - list_del_init(&dev->node); - queue_work(nvme_workq, &dev->reset_work); + *val = readl(to_nvme_dev(ctrl)->bar + off); return 0; } -static int nvme_reset(struct nvme_dev *dev) +static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val) { - int ret; - - if (!dev->admin_q || blk_queue_dying(dev->admin_q)) - return -ENODEV; - - spin_lock(&dev_list_lock); - ret = __nvme_reset(dev); - spin_unlock(&dev_list_lock); - - if (!ret) { - flush_work(&dev->reset_work); - flush_work(&dev->probe_work); - return 0; - } + writel(val, to_nvme_dev(ctrl)->bar + off); + return 0; +} - return ret; +static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val) +{ + *val = readq(to_nvme_dev(ctrl)->bar + off); + return 0; } -static ssize_t nvme_sysfs_reset(struct device *dev, - struct device_attribute *attr, const char *buf, - size_t count) +static bool nvme_pci_io_incapable(struct nvme_ctrl *ctrl) { - struct nvme_dev *ndev = dev_get_drvdata(dev); - int ret; + struct nvme_dev *dev = to_nvme_dev(ctrl); - ret = nvme_reset(ndev); - if (ret < 0) - return ret; + return !dev->bar || dev->online_queues < 2; +} - return count; +static int nvme_pci_reset_ctrl(struct nvme_ctrl *ctrl) +{ + return nvme_reset(to_nvme_dev(ctrl)); } -static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset); + +static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { + .reg_read32 = nvme_pci_reg_read32, + .reg_write32 = nvme_pci_reg_write32, + .reg_read64 = nvme_pci_reg_read64, + .io_incapable = nvme_pci_io_incapable, + .reset_ctrl = nvme_pci_reset_ctrl, + .free_ctrl = nvme_pci_free_ctrl, +}; static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) { @@ -3314,46 +2058,30 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (!dev->queues) goto free; - INIT_LIST_HEAD(&dev->namespaces); - INIT_WORK(&dev->reset_work, nvme_reset_work); dev->dev = get_device(&pdev->dev); pci_set_drvdata(pdev, dev); - result = nvme_set_instance(dev); - if (result) - goto put_pci; + + INIT_LIST_HEAD(&dev->node); + INIT_WORK(&dev->scan_work, nvme_dev_scan); + INIT_WORK(&dev->reset_work, nvme_reset_work); + INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); + mutex_init(&dev->shutdown_lock); + init_completion(&dev->ioq_wait); result = nvme_setup_prp_pools(dev); if (result) - goto release; - - kref_init(&dev->kref); - dev->device = device_create(nvme_class, &pdev->dev, - MKDEV(nvme_char_major, dev->instance), - dev, "nvme%d", dev->instance); - if (IS_ERR(dev->device)) { - result = PTR_ERR(dev->device); - goto release_pools; - } - get_device(dev->device); - dev_set_drvdata(dev->device, dev); + goto put_pci; - result = device_create_file(dev->device, &dev_attr_reset_controller); + result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops, + id->driver_data); if (result) - goto put_dev; + goto release_pools; - INIT_LIST_HEAD(&dev->node); - INIT_WORK(&dev->scan_work, nvme_dev_scan); - INIT_WORK(&dev->probe_work, nvme_probe_work); - schedule_work(&dev->probe_work); + queue_work(nvme_workq, &dev->reset_work); return 0; - put_dev: - device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance)); - put_device(dev->device); release_pools: nvme_release_prp_pools(dev); - release: - nvme_release_instance(dev); put_pci: put_device(dev->dev); free: @@ -3368,15 +2096,15 @@ static void nvme_reset_notify(struct pci_dev *pdev, bool prepare) struct nvme_dev *dev = pci_get_drvdata(pdev); if (prepare) - nvme_dev_shutdown(dev); + nvme_dev_disable(dev, false); else - schedule_work(&dev->probe_work); + queue_work(nvme_workq, &dev->reset_work); } static void nvme_shutdown(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); - nvme_dev_shutdown(dev); + nvme_dev_disable(dev, true); } static void nvme_remove(struct pci_dev *pdev) @@ -3388,34 +2116,25 @@ static void nvme_remove(struct pci_dev *pdev) spin_unlock(&dev_list_lock); pci_set_drvdata(pdev, NULL); - flush_work(&dev->probe_work); flush_work(&dev->reset_work); flush_work(&dev->scan_work); - device_remove_file(dev->device, &dev_attr_reset_controller); - nvme_dev_remove(dev); - nvme_dev_shutdown(dev); + nvme_remove_namespaces(&dev->ctrl); + nvme_uninit_ctrl(&dev->ctrl); + nvme_dev_disable(dev, true); nvme_dev_remove_admin(dev); - device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance)); nvme_free_queues(dev, 0); nvme_release_cmb(dev); nvme_release_prp_pools(dev); - kref_put(&dev->kref, nvme_free_dev); + nvme_put_ctrl(&dev->ctrl); } -/* These functions are yet to be implemented */ -#define nvme_error_detected NULL -#define nvme_dump_registers NULL -#define nvme_link_reset NULL -#define nvme_slot_reset NULL -#define nvme_error_resume NULL - #ifdef CONFIG_PM_SLEEP static int nvme_suspend(struct device *dev) { struct pci_dev *pdev = to_pci_dev(dev); struct nvme_dev *ndev = pci_get_drvdata(pdev); - nvme_dev_shutdown(ndev); + nvme_dev_disable(ndev, true); return 0; } @@ -3424,17 +2143,53 @@ static int nvme_resume(struct device *dev) struct pci_dev *pdev = to_pci_dev(dev); struct nvme_dev *ndev = pci_get_drvdata(pdev); - schedule_work(&ndev->probe_work); + queue_work(nvme_workq, &ndev->reset_work); return 0; } #endif static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume); +static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + + /* + * A frozen channel requires a reset. When detected, this method will + * shutdown the controller to quiesce. The controller will be restarted + * after the slot reset through driver's slot_reset callback. + */ + dev_warn(&pdev->dev, "error detected: state:%d\n", state); + switch (state) { + case pci_channel_io_normal: + return PCI_ERS_RESULT_CAN_RECOVER; + case pci_channel_io_frozen: + nvme_dev_disable(dev, false); + return PCI_ERS_RESULT_NEED_RESET; + case pci_channel_io_perm_failure: + return PCI_ERS_RESULT_DISCONNECT; + } + return PCI_ERS_RESULT_NEED_RESET; +} + +static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + + dev_info(&pdev->dev, "restart after slot reset\n"); + pci_restore_state(pdev); + queue_work(nvme_workq, &dev->reset_work); + return PCI_ERS_RESULT_RECOVERED; +} + +static void nvme_error_resume(struct pci_dev *pdev) +{ + pci_cleanup_aer_uncorrect_error_status(pdev); +} + static const struct pci_error_handlers nvme_err_handler = { .error_detected = nvme_error_detected, - .mmio_enabled = nvme_dump_registers, - .link_reset = nvme_link_reset, .slot_reset = nvme_slot_reset, .resume = nvme_error_resume, .reset_notify = nvme_reset_notify, @@ -3444,6 +2199,10 @@ static const struct pci_error_handlers nvme_err_handler = { #define PCI_CLASS_STORAGE_EXPRESS 0x010802 static const struct pci_device_id nvme_id_table[] = { + { PCI_VDEVICE(INTEL, 0x0953), + .driver_data = NVME_QUIRK_STRIPE_SIZE, }, + { PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */ + .driver_data = NVME_QUIRK_IDENTIFY_CNS, }, { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, { 0, } @@ -3468,40 +2227,21 @@ static int __init nvme_init(void) init_waitqueue_head(&nvme_kthread_wait); - nvme_workq = create_singlethread_workqueue("nvme"); + nvme_workq = alloc_workqueue("nvme", WQ_UNBOUND | WQ_MEM_RECLAIM, 0); if (!nvme_workq) return -ENOMEM; - result = register_blkdev(nvme_major, "nvme"); + result = nvme_core_init(); if (result < 0) goto kill_workq; - else if (result > 0) - nvme_major = result; - - result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme", - &nvme_dev_fops); - if (result < 0) - goto unregister_blkdev; - else if (result > 0) - nvme_char_major = result; - - nvme_class = class_create(THIS_MODULE, "nvme"); - if (IS_ERR(nvme_class)) { - result = PTR_ERR(nvme_class); - goto unregister_chrdev; - } result = pci_register_driver(&nvme_driver); if (result) - goto destroy_class; + goto core_exit; return 0; - destroy_class: - class_destroy(nvme_class); - unregister_chrdev: - __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); - unregister_blkdev: - unregister_blkdev(nvme_major, "nvme"); + core_exit: + nvme_core_exit(); kill_workq: destroy_workqueue(nvme_workq); return result; @@ -3510,10 +2250,8 @@ static int __init nvme_init(void) static void __exit nvme_exit(void) { pci_unregister_driver(&nvme_driver); - unregister_blkdev(nvme_major, "nvme"); + nvme_core_exit(); destroy_workqueue(nvme_workq); - class_destroy(nvme_class); - __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); BUG_ON(nvme_thread && !IS_ERR(nvme_thread)); _nvme_check_size(); } diff --git a/drivers/nvme/host/scsi.c b/drivers/nvme/host/scsi.c index c3d8d3887a31..e947e298a737 100644 --- a/drivers/nvme/host/scsi.c +++ b/drivers/nvme/host/scsi.c @@ -524,7 +524,7 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *inq_response, int alloc_len) { - struct nvme_dev *dev = ns->dev; + struct nvme_ctrl *ctrl = ns->ctrl; struct nvme_id_ns *id_ns; int res; int nvme_sc; @@ -532,10 +532,10 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns, u8 resp_data_format = 0x02; u8 protect; u8 cmdque = 0x01 << 1; - u8 fw_offset = sizeof(dev->firmware_rev); + u8 fw_offset = sizeof(ctrl->firmware_rev); /* nvme ns identify - use DPS value for PROTECT field */ - nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); + nvme_sc = nvme_identify_ns(ctrl, ns->ns_id, &id_ns); res = nvme_trans_status_code(hdr, nvme_sc); if (res) return res; @@ -553,12 +553,12 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns, inq_response[5] = protect; /* sccs=0 | acc=0 | tpgs=0 | pc3=0 */ inq_response[7] = cmdque; /* wbus16=0 | sync=0 | vs=0 */ strncpy(&inq_response[8], "NVMe ", 8); - strncpy(&inq_response[16], dev->model, 16); + strncpy(&inq_response[16], ctrl->model, 16); - while (dev->firmware_rev[fw_offset - 1] == ' ' && fw_offset > 4) + while (ctrl->firmware_rev[fw_offset - 1] == ' ' && fw_offset > 4) fw_offset--; fw_offset -= 4; - strncpy(&inq_response[32], dev->firmware_rev + fw_offset, 4); + strncpy(&inq_response[32], ctrl->firmware_rev + fw_offset, 4); xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); return nvme_trans_copy_to_user(hdr, inq_response, xfer_len); @@ -588,82 +588,113 @@ static int nvme_trans_unit_serial_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *inq_response, int alloc_len) { - struct nvme_dev *dev = ns->dev; int xfer_len; memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); inq_response[1] = INQ_UNIT_SERIAL_NUMBER_PAGE; /* Page Code */ inq_response[3] = INQ_SERIAL_NUMBER_LENGTH; /* Page Length */ - strncpy(&inq_response[4], dev->serial, INQ_SERIAL_NUMBER_LENGTH); + strncpy(&inq_response[4], ns->ctrl->serial, INQ_SERIAL_NUMBER_LENGTH); xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); return nvme_trans_copy_to_user(hdr, inq_response, xfer_len); } -static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *inq_response, int alloc_len) +static int nvme_fill_device_id_eui64(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *inq_response, int alloc_len) { - struct nvme_dev *dev = ns->dev; - int res; - int nvme_sc; - int xfer_len; - __be32 tmp_id = cpu_to_be32(ns->ns_id); + struct nvme_id_ns *id_ns; + int nvme_sc, res; + size_t len; + void *eui; - memset(inq_response, 0, alloc_len); - inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; /* Page Code */ - if (readl(&dev->bar->vs) >= NVME_VS(1, 1)) { - struct nvme_id_ns *id_ns; - void *eui; - int len; + nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + return res; - nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; + eui = id_ns->eui64; + len = sizeof(id_ns->eui64); - eui = id_ns->eui64; - len = sizeof(id_ns->eui64); - if (readl(&dev->bar->vs) >= NVME_VS(1, 2)) { - if (bitmap_empty(eui, len * 8)) { - eui = id_ns->nguid; - len = sizeof(id_ns->nguid); - } - } + if (ns->ctrl->vs >= NVME_VS(1, 2)) { if (bitmap_empty(eui, len * 8)) { - kfree(id_ns); - goto scsi_string; + eui = id_ns->nguid; + len = sizeof(id_ns->nguid); } + } - inq_response[3] = 4 + len; /* Page Length */ - /* Designation Descriptor start */ - inq_response[4] = 0x01; /* Proto ID=0h | Code set=1h */ - inq_response[5] = 0x02; /* PIV=0b | Asso=00b | Designator Type=2h */ - inq_response[6] = 0x00; /* Rsvd */ - inq_response[7] = len; /* Designator Length */ - memcpy(&inq_response[8], eui, len); - kfree(id_ns); - } else { - scsi_string: - if (alloc_len < 72) { - return nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - } - inq_response[3] = 0x48; /* Page Length */ - /* Designation Descriptor start */ - inq_response[4] = 0x03; /* Proto ID=0h | Code set=3h */ - inq_response[5] = 0x08; /* PIV=0b | Asso=00b | Designator Type=8h */ - inq_response[6] = 0x00; /* Rsvd */ - inq_response[7] = 0x44; /* Designator Length */ - - sprintf(&inq_response[8], "%04x", to_pci_dev(dev->dev)->vendor); - memcpy(&inq_response[12], dev->model, sizeof(dev->model)); - sprintf(&inq_response[52], "%04x", tmp_id); - memcpy(&inq_response[56], dev->serial, sizeof(dev->serial)); + if (bitmap_empty(eui, len * 8)) { + res = -EOPNOTSUPP; + goto out_free_id; } - xfer_len = alloc_len; - return nvme_trans_copy_to_user(hdr, inq_response, xfer_len); + + memset(inq_response, 0, alloc_len); + inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; + inq_response[3] = 4 + len; /* Page Length */ + + /* Designation Descriptor start */ + inq_response[4] = 0x01; /* Proto ID=0h | Code set=1h */ + inq_response[5] = 0x02; /* PIV=0b | Asso=00b | Designator Type=2h */ + inq_response[6] = 0x00; /* Rsvd */ + inq_response[7] = len; /* Designator Length */ + memcpy(&inq_response[8], eui, len); + + res = nvme_trans_copy_to_user(hdr, inq_response, alloc_len); +out_free_id: + kfree(id_ns); + return res; +} + +static int nvme_fill_device_id_scsi_string(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *inq_response, int alloc_len) +{ + struct nvme_ctrl *ctrl = ns->ctrl; + struct nvme_id_ctrl *id_ctrl; + int nvme_sc, res; + + if (alloc_len < 72) { + return nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + } + + nvme_sc = nvme_identify_ctrl(ctrl, &id_ctrl); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + return res; + + memset(inq_response, 0, alloc_len); + inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; + inq_response[3] = 0x48; /* Page Length */ + + /* Designation Descriptor start */ + inq_response[4] = 0x03; /* Proto ID=0h | Code set=3h */ + inq_response[5] = 0x08; /* PIV=0b | Asso=00b | Designator Type=8h */ + inq_response[6] = 0x00; /* Rsvd */ + inq_response[7] = 0x44; /* Designator Length */ + + sprintf(&inq_response[8], "%04x", le16_to_cpu(id_ctrl->vid)); + memcpy(&inq_response[12], ctrl->model, sizeof(ctrl->model)); + sprintf(&inq_response[52], "%04x", cpu_to_be32(ns->ns_id)); + memcpy(&inq_response[56], ctrl->serial, sizeof(ctrl->serial)); + + res = nvme_trans_copy_to_user(hdr, inq_response, alloc_len); + kfree(id_ctrl); + return res; +} + +static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *resp, int alloc_len) +{ + int res; + + if (ns->ctrl->vs >= NVME_VS(1, 1)) { + res = nvme_fill_device_id_eui64(ns, hdr, resp, alloc_len); + if (res != -EOPNOTSUPP) + return res; + } + + return nvme_fill_device_id_scsi_string(ns, hdr, resp, alloc_len); } static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, @@ -672,7 +703,7 @@ static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *inq_response; int res; int nvme_sc; - struct nvme_dev *dev = ns->dev; + struct nvme_ctrl *ctrl = ns->ctrl; struct nvme_id_ctrl *id_ctrl; struct nvme_id_ns *id_ns; int xfer_len; @@ -688,7 +719,7 @@ static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, if (inq_response == NULL) return -ENOMEM; - nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); + nvme_sc = nvme_identify_ns(ctrl, ns->ns_id, &id_ns); res = nvme_trans_status_code(hdr, nvme_sc); if (res) goto out_free_inq; @@ -704,7 +735,7 @@ static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, app_chk = protect << 1; ref_chk = protect; - nvme_sc = nvme_identify_ctrl(dev, &id_ctrl); + nvme_sc = nvme_identify_ctrl(ctrl, &id_ctrl); res = nvme_trans_status_code(hdr, nvme_sc); if (res) goto out_free_inq; @@ -815,7 +846,6 @@ static int nvme_trans_log_info_exceptions(struct nvme_ns *ns, int res; int xfer_len; u8 *log_response; - struct nvme_dev *dev = ns->dev; struct nvme_smart_log *smart_log; u8 temp_c; u16 temp_k; @@ -824,7 +854,7 @@ static int nvme_trans_log_info_exceptions(struct nvme_ns *ns, if (log_response == NULL) return -ENOMEM; - res = nvme_get_log_page(dev, &smart_log); + res = nvme_get_log_page(ns->ctrl, &smart_log); if (res < 0) goto out_free_response; @@ -862,7 +892,6 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr, int res; int xfer_len; u8 *log_response; - struct nvme_dev *dev = ns->dev; struct nvme_smart_log *smart_log; u32 feature_resp; u8 temp_c_cur, temp_c_thresh; @@ -872,7 +901,7 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr, if (log_response == NULL) return -ENOMEM; - res = nvme_get_log_page(dev, &smart_log); + res = nvme_get_log_page(ns->ctrl, &smart_log); if (res < 0) goto out_free_response; @@ -886,7 +915,7 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr, kfree(smart_log); /* Get Features for Temp Threshold */ - res = nvme_get_features(dev, NVME_FEAT_TEMP_THRESH, 0, 0, + res = nvme_get_features(ns->ctrl, NVME_FEAT_TEMP_THRESH, 0, 0, &feature_resp); if (res != NVME_SC_SUCCESS) temp_c_thresh = LOG_TEMP_UNKNOWN; @@ -948,7 +977,6 @@ static int nvme_trans_fill_blk_desc(struct nvme_ns *ns, struct sg_io_hdr *hdr, { int res; int nvme_sc; - struct nvme_dev *dev = ns->dev; struct nvme_id_ns *id_ns; u8 flbas; u32 lba_length; @@ -958,7 +986,7 @@ static int nvme_trans_fill_blk_desc(struct nvme_ns *ns, struct sg_io_hdr *hdr, else if (llbaa > 0 && len < MODE_PAGE_LLBAA_BLK_DES_LEN) return -EINVAL; - nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); + nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); res = nvme_trans_status_code(hdr, nvme_sc); if (res) return res; @@ -1014,14 +1042,13 @@ static int nvme_trans_fill_caching_page(struct nvme_ns *ns, { int res = 0; int nvme_sc; - struct nvme_dev *dev = ns->dev; u32 feature_resp; u8 vwc; if (len < MODE_PAGE_CACHING_LEN) return -EINVAL; - nvme_sc = nvme_get_features(dev, NVME_FEAT_VOLATILE_WC, 0, 0, + nvme_sc = nvme_get_features(ns->ctrl, NVME_FEAT_VOLATILE_WC, 0, 0, &feature_resp); res = nvme_trans_status_code(hdr, nvme_sc); if (res) @@ -1207,12 +1234,11 @@ static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr, { int res; int nvme_sc; - struct nvme_dev *dev = ns->dev; struct nvme_id_ctrl *id_ctrl; int lowest_pow_st; /* max npss = lowest power consumption */ unsigned ps_desired = 0; - nvme_sc = nvme_identify_ctrl(dev, &id_ctrl); + nvme_sc = nvme_identify_ctrl(ns->ctrl, &id_ctrl); res = nvme_trans_status_code(hdr, nvme_sc); if (res) return res; @@ -1256,7 +1282,7 @@ static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr, SCSI_ASCQ_CAUSE_NOT_REPORTABLE); break; } - nvme_sc = nvme_set_features(dev, NVME_FEAT_POWER_MGMT, ps_desired, 0, + nvme_sc = nvme_set_features(ns->ctrl, NVME_FEAT_POWER_MGMT, ps_desired, 0, NULL); return nvme_trans_status_code(hdr, nvme_sc); } @@ -1280,7 +1306,6 @@ static int nvme_trans_send_download_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr u8 buffer_id) { int nvme_sc; - struct nvme_dev *dev = ns->dev; struct nvme_command c; if (hdr->iovec_count > 0) { @@ -1297,7 +1322,7 @@ static int nvme_trans_send_download_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr c.dlfw.numd = cpu_to_le32((tot_len/BYTES_TO_DWORDS) - 1); c.dlfw.offset = cpu_to_le32(offset/BYTES_TO_DWORDS); - nvme_sc = __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, + nvme_sc = nvme_submit_user_cmd(ns->ctrl->admin_q, &c, hdr->dxferp, tot_len, NULL, 0); return nvme_trans_status_code(hdr, nvme_sc); } @@ -1364,14 +1389,13 @@ static int nvme_trans_modesel_get_mp(struct nvme_ns *ns, struct sg_io_hdr *hdr, { int res = 0; int nvme_sc; - struct nvme_dev *dev = ns->dev; unsigned dword11; switch (page_code) { case MODE_PAGE_CACHING: dword11 = ((mode_page[2] & CACHING_MODE_PAGE_WCE_MASK) ? 1 : 0); - nvme_sc = nvme_set_features(dev, NVME_FEAT_VOLATILE_WC, dword11, - 0, NULL); + nvme_sc = nvme_set_features(ns->ctrl, NVME_FEAT_VOLATILE_WC, + dword11, 0, NULL); res = nvme_trans_status_code(hdr, nvme_sc); break; case MODE_PAGE_CONTROL: @@ -1473,7 +1497,6 @@ static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns, { int res = 0; int nvme_sc; - struct nvme_dev *dev = ns->dev; u8 flbas; /* @@ -1486,7 +1509,7 @@ static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns, if (ns->mode_select_num_blocks == 0 || ns->mode_select_block_len == 0) { struct nvme_id_ns *id_ns; - nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); + nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); res = nvme_trans_status_code(hdr, nvme_sc); if (res) return res; @@ -1570,7 +1593,6 @@ static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, { int res; int nvme_sc; - struct nvme_dev *dev = ns->dev; struct nvme_id_ns *id_ns; u8 i; u8 flbas, nlbaf; @@ -1579,7 +1601,7 @@ static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, struct nvme_command c; /* Loop thru LBAF's in id_ns to match reqd lbaf, put in cdw10 */ - nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); + nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); res = nvme_trans_status_code(hdr, nvme_sc); if (res) return res; @@ -1611,7 +1633,7 @@ static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, c.format.nsid = cpu_to_le32(ns->ns_id); c.format.cdw10 = cpu_to_le32(cdw10); - nvme_sc = nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0); + nvme_sc = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, NULL, 0); res = nvme_trans_status_code(hdr, nvme_sc); kfree(id_ns); @@ -1704,7 +1726,7 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, nvme_sc = NVME_SC_LBA_RANGE; break; } - nvme_sc = __nvme_submit_sync_cmd(ns->queue, &c, NULL, + nvme_sc = nvme_submit_user_cmd(ns->queue, &c, next_mapping_addr, unit_len, NULL, 0); if (nvme_sc) break; @@ -2040,7 +2062,6 @@ static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr, u32 alloc_len; u32 resp_size; u32 xfer_len; - struct nvme_dev *dev = ns->dev; struct nvme_id_ns *id_ns; u8 *response; @@ -2052,7 +2073,7 @@ static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr, resp_size = READ_CAP_10_RESP_SIZE; } - nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); + nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); res = nvme_trans_status_code(hdr, nvme_sc); if (res) return res; @@ -2080,7 +2101,6 @@ static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr, int nvme_sc; u32 alloc_len, xfer_len, resp_size; u8 *response; - struct nvme_dev *dev = ns->dev; struct nvme_id_ctrl *id_ctrl; u32 ll_length, lun_id; u8 lun_id_offset = REPORT_LUNS_FIRST_LUN_OFFSET; @@ -2094,7 +2114,7 @@ static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr, case ALL_LUNS_RETURNED: case ALL_WELL_KNOWN_LUNS_RETURNED: case RESTRICTED_LUNS_RETURNED: - nvme_sc = nvme_identify_ctrl(dev, &id_ctrl); + nvme_sc = nvme_identify_ctrl(ns->ctrl, &id_ctrl); res = nvme_trans_status_code(hdr, nvme_sc); if (res) return res; @@ -2295,9 +2315,7 @@ static int nvme_trans_test_unit_ready(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *cmd) { - struct nvme_dev *dev = ns->dev; - - if (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) + if (nvme_ctrl_ready(ns->ctrl)) return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, NOT_READY, SCSI_ASC_LUN_NOT_READY, SCSI_ASCQ_CAUSE_NOT_REPORTABLE); diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index e77d15000caa..5a2899f9f50b 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -615,9 +615,9 @@ iblock_alloc_bip(struct se_cmd *cmd, struct bio *bio) } bip = bio_integrity_alloc(bio, GFP_NOIO, cmd->t_prot_nents); - if (!bip) { + if (IS_ERR(bip)) { pr_err("Unable to allocate bio_integrity_payload\n"); - return -ENOMEM; + return PTR_ERR(bip); } bip->bip_iter.bi_size = (cmd->data_length / dev->dev_attrib.block_size) * diff --git a/include/linux/aer.h b/include/linux/aer.h index 744b997d6a94..164049357e5c 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -7,6 +7,7 @@ #ifndef _AER_H_ #define _AER_H_ +#include <linux/errno.h> #include <linux/types.h> #define AER_NONFATAL 0 diff --git a/include/linux/bio.h b/include/linux/bio.h index b9b6e046b52e..5349e6816cbb 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -318,16 +318,6 @@ enum bip_flags { BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */ }; -#if defined(CONFIG_BLK_DEV_INTEGRITY) - -static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) -{ - if (bio->bi_rw & REQ_INTEGRITY) - return bio->bi_integrity; - - return NULL; -} - /* * bio integrity payload */ @@ -349,6 +339,16 @@ struct bio_integrity_payload { struct bio_vec bip_inline_vecs[0];/* embedded bvec array */ }; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + +static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) +{ + if (bio->bi_rw & REQ_INTEGRITY) + return bio->bi_integrity; + + return NULL; +} + static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) { struct bio_integrity_payload *bip = bio_integrity(bio); @@ -795,6 +795,18 @@ static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) return false; } +static inline void *bio_integrity_alloc(struct bio * bio, gfp_t gfp, + unsigned int nr) +{ + return ERR_PTR(-EINVAL); +} + +static inline int bio_integrity_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + return 0; +} + #endif /* CONFIG_BLK_DEV_INTEGRITY */ #endif /* CONFIG_BLOCK */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 0fb65843ec1e..86a38ea1823f 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -188,7 +188,6 @@ enum rq_flag_bits { __REQ_PM, /* runtime pm request */ __REQ_HASHED, /* on IO scheduler merge hash */ __REQ_MQ_INFLIGHT, /* track inflight for MQ */ - __REQ_NO_TIMEOUT, /* requests may never expire */ __REQ_NR_BITS, /* stops here */ }; @@ -242,7 +241,6 @@ enum rq_flag_bits { #define REQ_PM (1ULL << __REQ_PM) #define REQ_HASHED (1ULL << __REQ_HASHED) #define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) -#define REQ_NO_TIMEOUT (1ULL << __REQ_NO_TIMEOUT) typedef unsigned int blk_qc_t; #define BLK_QC_T_NONE -1U diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d372ea87ead5..29189aeace19 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -409,6 +409,7 @@ struct request_queue { unsigned int rq_timeout; struct timer_list timeout; + struct work_struct timeout_work; struct list_head timeout_list; struct list_head icq_list; diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 8723f2a99e15..d6b3c9943a2c 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -25,7 +25,6 @@ */ #ifndef DRBD_H #define DRBD_H -#include <linux/connector.h> #include <asm/types.h> #ifdef __KERNEL__ @@ -52,7 +51,7 @@ #endif extern const char *drbd_buildtag(void); -#define REL_VERSION "8.4.5" +#define REL_VERSION "8.4.6" #define API_VERSION 1 #define PRO_VERSION_MIN 86 #define PRO_VERSION_MAX 101 @@ -339,6 +338,8 @@ enum drbd_state_rv { #define MDF_AL_CLEAN (1 << 7) #define MDF_AL_DISABLED (1 << 8) +#define MAX_PEERS 32 + enum drbd_uuid_index { UI_CURRENT, UI_BITMAP, @@ -349,14 +350,35 @@ enum drbd_uuid_index { UI_EXTENDED_SIZE /* Everything. */ }; +#define HISTORY_UUIDS MAX_PEERS + enum drbd_timeout_flag { UT_DEFAULT = 0, UT_DEGRADED = 1, UT_PEER_OUTDATED = 2, }; +enum drbd_notification_type { + NOTIFY_EXISTS, + NOTIFY_CREATE, + NOTIFY_CHANGE, + NOTIFY_DESTROY, + NOTIFY_CALL, + NOTIFY_RESPONSE, + + NOTIFY_CONTINUES = 0x8000, + NOTIFY_FLAGS = NOTIFY_CONTINUES, +}; + #define UUID_JUST_CREATED ((__u64)4) +enum write_ordering_e { + WO_NONE, + WO_DRAIN_IO, + WO_BDEV_FLUSH, + WO_BIO_BARRIER +}; + /* magic numbers used in meta data and network packets */ #define DRBD_MAGIC 0x83740267 #define DRBD_MAGIC_BIG 0x835a diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h index 7b131ed8f9c6..2d0e5ad5de9d 100644 --- a/include/linux/drbd_genl.h +++ b/include/linux/drbd_genl.h @@ -250,6 +250,76 @@ GENL_struct(DRBD_NLA_DETACH_PARMS, 13, detach_parms, __flg_field(1, DRBD_GENLA_F_MANDATORY, force_detach) ) +GENL_struct(DRBD_NLA_RESOURCE_INFO, 15, resource_info, + __u32_field(1, 0, res_role) + __flg_field(2, 0, res_susp) + __flg_field(3, 0, res_susp_nod) + __flg_field(4, 0, res_susp_fen) + /* __flg_field(5, 0, res_weak) */ +) + +GENL_struct(DRBD_NLA_DEVICE_INFO, 16, device_info, + __u32_field(1, 0, dev_disk_state) +) + +GENL_struct(DRBD_NLA_CONNECTION_INFO, 17, connection_info, + __u32_field(1, 0, conn_connection_state) + __u32_field(2, 0, conn_role) +) + +GENL_struct(DRBD_NLA_PEER_DEVICE_INFO, 18, peer_device_info, + __u32_field(1, 0, peer_repl_state) + __u32_field(2, 0, peer_disk_state) + __u32_field(3, 0, peer_resync_susp_user) + __u32_field(4, 0, peer_resync_susp_peer) + __u32_field(5, 0, peer_resync_susp_dependency) +) + +GENL_struct(DRBD_NLA_RESOURCE_STATISTICS, 19, resource_statistics, + __u32_field(1, 0, res_stat_write_ordering) +) + +GENL_struct(DRBD_NLA_DEVICE_STATISTICS, 20, device_statistics, + __u64_field(1, 0, dev_size) /* (sectors) */ + __u64_field(2, 0, dev_read) /* (sectors) */ + __u64_field(3, 0, dev_write) /* (sectors) */ + __u64_field(4, 0, dev_al_writes) /* activity log writes (count) */ + __u64_field(5, 0, dev_bm_writes) /* bitmap writes (count) */ + __u32_field(6, 0, dev_upper_pending) /* application requests in progress */ + __u32_field(7, 0, dev_lower_pending) /* backing device requests in progress */ + __flg_field(8, 0, dev_upper_blocked) + __flg_field(9, 0, dev_lower_blocked) + __flg_field(10, 0, dev_al_suspended) /* activity log suspended */ + __u64_field(11, 0, dev_exposed_data_uuid) + __u64_field(12, 0, dev_current_uuid) + __u32_field(13, 0, dev_disk_flags) + __bin_field(14, 0, history_uuids, HISTORY_UUIDS * sizeof(__u64)) +) + +GENL_struct(DRBD_NLA_CONNECTION_STATISTICS, 21, connection_statistics, + __flg_field(1, 0, conn_congested) +) + +GENL_struct(DRBD_NLA_PEER_DEVICE_STATISTICS, 22, peer_device_statistics, + __u64_field(1, 0, peer_dev_received) /* sectors */ + __u64_field(2, 0, peer_dev_sent) /* sectors */ + __u32_field(3, 0, peer_dev_pending) /* number of requests */ + __u32_field(4, 0, peer_dev_unacked) /* number of requests */ + __u64_field(5, 0, peer_dev_out_of_sync) /* sectors */ + __u64_field(6, 0, peer_dev_resync_failed) /* sectors */ + __u64_field(7, 0, peer_dev_bitmap_uuid) + __u32_field(9, 0, peer_dev_flags) +) + +GENL_struct(DRBD_NLA_NOTIFICATION_HEADER, 23, drbd_notification_header, + __u32_field(1, DRBD_GENLA_F_MANDATORY, nh_type) +) + +GENL_struct(DRBD_NLA_HELPER, 24, drbd_helper_info, + __str_field(1, DRBD_GENLA_F_MANDATORY, helper_name, 32) + __u32_field(2, DRBD_GENLA_F_MANDATORY, helper_status) +) + /* * Notifications and commands (genlmsghdr->cmd) */ @@ -382,3 +452,82 @@ GENL_op(DRBD_ADM_GET_TIMEOUT_TYPE, 26, GENL_doit(drbd_adm_get_timeout_type), GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) GENL_op(DRBD_ADM_DOWN, 27, GENL_doit(drbd_adm_down), GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) + +GENL_op(DRBD_ADM_GET_RESOURCES, 30, + GENL_op_init( + .dumpit = drbd_adm_dump_resources, + ), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_RESOURCE_INFO, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_RESOURCE_STATISTICS, DRBD_GENLA_F_MANDATORY)) + +GENL_op(DRBD_ADM_GET_DEVICES, 31, + GENL_op_init( + .dumpit = drbd_adm_dump_devices, + .done = drbd_adm_dump_devices_done, + ), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_DEVICE_INFO, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_DEVICE_STATISTICS, DRBD_GENLA_F_MANDATORY)) + +GENL_op(DRBD_ADM_GET_CONNECTIONS, 32, + GENL_op_init( + .dumpit = drbd_adm_dump_connections, + .done = drbd_adm_dump_connections_done, + ), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_CONNECTION_INFO, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_CONNECTION_STATISTICS, DRBD_GENLA_F_MANDATORY)) + +GENL_op(DRBD_ADM_GET_PEER_DEVICES, 33, + GENL_op_init( + .dumpit = drbd_adm_dump_peer_devices, + .done = drbd_adm_dump_peer_devices_done, + ), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_PEER_DEVICE_INFO, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_PEER_DEVICE_STATISTICS, DRBD_GENLA_F_MANDATORY)) + +GENL_notification( + DRBD_RESOURCE_STATE, 34, events, + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_RESOURCE_INFO, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_RESOURCE_STATISTICS, DRBD_F_REQUIRED)) + +GENL_notification( + DRBD_DEVICE_STATE, 35, events, + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_DEVICE_INFO, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_DEVICE_STATISTICS, DRBD_F_REQUIRED)) + +GENL_notification( + DRBD_CONNECTION_STATE, 36, events, + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_CONNECTION_INFO, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_CONNECTION_STATISTICS, DRBD_F_REQUIRED)) + +GENL_notification( + DRBD_PEER_DEVICE_STATE, 37, events, + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_PEER_DEVICE_INFO, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_PEER_DEVICE_STATISTICS, DRBD_F_REQUIRED)) + +GENL_op( + DRBD_ADM_GET_INITIAL_STATE, 38, + GENL_op_init( + .dumpit = drbd_adm_get_initial_state, + ), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)) + +GENL_notification( + DRBD_HELPER, 40, events, + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_HELPER, DRBD_F_REQUIRED)) + +GENL_notification( + DRBD_INITIAL_STATE_DONE, 41, events, + GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED)) diff --git a/include/linux/idr.h b/include/linux/idr.h index 013fd9bc4cb6..083d61e92706 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -135,6 +135,20 @@ static inline void *idr_find(struct idr *idr, int id) #define idr_for_each_entry(idp, entry, id) \ for (id = 0; ((entry) = idr_get_next(idp, &(id))) != NULL; ++id) +/** + * idr_for_each_entry - continue iteration over an idr's elements of a given type + * @idp: idr handle + * @entry: the type * to use as cursor + * @id: id entry's key + * + * Continue to iterate over list of given type, continuing after + * the current position. + */ +#define idr_for_each_entry_continue(idp, entry, id) \ + for ((entry) = idr_get_next((idp), &(id)); \ + entry; \ + ++id, (entry) = idr_get_next((idp), &(id))) + /* * IDA - IDR based id allocator, use when translation from id to * pointer isn't necessary. diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 034117b3be5f..d6750111e48e 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -1,6 +1,8 @@ #ifndef NVM_H #define NVM_H +#include <linux/types.h> + enum { NVM_IO_OK = 0, NVM_IO_REQUEUE = 1, @@ -11,12 +13,74 @@ enum { NVM_IOTYPE_GC = 1, }; +#define NVM_BLK_BITS (16) +#define NVM_PG_BITS (16) +#define NVM_SEC_BITS (8) +#define NVM_PL_BITS (8) +#define NVM_LUN_BITS (8) +#define NVM_CH_BITS (8) + +struct ppa_addr { + /* Generic structure for all addresses */ + union { + struct { + u64 blk : NVM_BLK_BITS; + u64 pg : NVM_PG_BITS; + u64 sec : NVM_SEC_BITS; + u64 pl : NVM_PL_BITS; + u64 lun : NVM_LUN_BITS; + u64 ch : NVM_CH_BITS; + } g; + + u64 ppa; + }; +}; + +struct nvm_rq; +struct nvm_id; +struct nvm_dev; + +typedef int (nvm_l2p_update_fn)(u64, u32, __le64 *, void *); +typedef int (nvm_bb_update_fn)(struct ppa_addr, int, u8 *, void *); +typedef int (nvm_id_fn)(struct nvm_dev *, struct nvm_id *); +typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32, + nvm_l2p_update_fn *, void *); +typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, int, + nvm_bb_update_fn *, void *); +typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct nvm_rq *, int); +typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); +typedef int (nvm_erase_blk_fn)(struct nvm_dev *, struct nvm_rq *); +typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *); +typedef void (nvm_destroy_dma_pool_fn)(void *); +typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t, + dma_addr_t *); +typedef void (nvm_dev_dma_free_fn)(void *, void*, dma_addr_t); + +struct nvm_dev_ops { + nvm_id_fn *identity; + nvm_get_l2p_tbl_fn *get_l2p_tbl; + nvm_op_bb_tbl_fn *get_bb_tbl; + nvm_op_set_bb_fn *set_bb_tbl; + + nvm_submit_io_fn *submit_io; + nvm_erase_blk_fn *erase_block; + + nvm_create_dma_pool_fn *create_dma_pool; + nvm_destroy_dma_pool_fn *destroy_dma_pool; + nvm_dev_dma_alloc_fn *dev_dma_alloc; + nvm_dev_dma_free_fn *dev_dma_free; + + unsigned int max_phys_sect; +}; + + + #ifdef CONFIG_NVM #include <linux/blkdev.h> -#include <linux/types.h> #include <linux/file.h> #include <linux/dmapool.h> +#include <uapi/linux/lightnvm.h> enum { /* HW Responsibilities */ @@ -58,8 +122,29 @@ enum { /* Block Types */ NVM_BLK_T_FREE = 0x0, NVM_BLK_T_BAD = 0x1, - NVM_BLK_T_DEV = 0x2, - NVM_BLK_T_HOST = 0x4, + NVM_BLK_T_GRWN_BAD = 0x2, + NVM_BLK_T_DEV = 0x4, + NVM_BLK_T_HOST = 0x8, + + /* Memory capabilities */ + NVM_ID_CAP_SLC = 0x1, + NVM_ID_CAP_CMD_SUSPEND = 0x2, + NVM_ID_CAP_SCRAMBLE = 0x4, + NVM_ID_CAP_ENCRYPT = 0x8, + + /* Memory types */ + NVM_ID_FMTYPE_SLC = 0, + NVM_ID_FMTYPE_MLC = 1, +}; + +struct nvm_id_lp_mlc { + u16 num_pairs; + u8 pairs[886]; +}; + +struct nvm_id_lp_tbl { + __u8 id[8]; + struct nvm_id_lp_mlc mlc; }; struct nvm_id_group { @@ -82,6 +167,8 @@ struct nvm_id_group { u32 mpos; u32 mccap; u16 cpar; + + struct nvm_id_lp_tbl lptbl; }; struct nvm_addr_format { @@ -125,28 +212,8 @@ struct nvm_tgt_instance { #define NVM_VERSION_MINOR 0 #define NVM_VERSION_PATCH 0 -#define NVM_BLK_BITS (16) -#define NVM_PG_BITS (16) -#define NVM_SEC_BITS (8) -#define NVM_PL_BITS (8) -#define NVM_LUN_BITS (8) -#define NVM_CH_BITS (8) - -struct ppa_addr { - /* Generic structure for all addresses */ - union { - struct { - u64 blk : NVM_BLK_BITS; - u64 pg : NVM_PG_BITS; - u64 sec : NVM_SEC_BITS; - u64 pl : NVM_PL_BITS; - u64 lun : NVM_LUN_BITS; - u64 ch : NVM_CH_BITS; - } g; - - u64 ppa; - }; -}; +struct nvm_rq; +typedef void (nvm_end_io_fn)(struct nvm_rq *); struct nvm_rq { struct nvm_tgt_instance *ins; @@ -164,9 +231,14 @@ struct nvm_rq { void *metadata; dma_addr_t dma_metadata; + struct completion *wait; + nvm_end_io_fn *end_io; + uint8_t opcode; uint16_t nr_pages; uint16_t flags; + + int error; }; static inline struct nvm_rq *nvm_rq_from_pdu(void *pdu) @@ -181,51 +253,31 @@ static inline void *nvm_rq_to_pdu(struct nvm_rq *rqdata) struct nvm_block; -typedef int (nvm_l2p_update_fn)(u64, u32, __le64 *, void *); -typedef int (nvm_bb_update_fn)(struct ppa_addr, int, u8 *, void *); -typedef int (nvm_id_fn)(struct nvm_dev *, struct nvm_id *); -typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32, - nvm_l2p_update_fn *, void *); -typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, int, - nvm_bb_update_fn *, void *); -typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct nvm_rq *, int); -typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); -typedef int (nvm_erase_blk_fn)(struct nvm_dev *, struct nvm_rq *); -typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *); -typedef void (nvm_destroy_dma_pool_fn)(void *); -typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t, - dma_addr_t *); -typedef void (nvm_dev_dma_free_fn)(void *, void*, dma_addr_t); - -struct nvm_dev_ops { - nvm_id_fn *identity; - nvm_get_l2p_tbl_fn *get_l2p_tbl; - nvm_op_bb_tbl_fn *get_bb_tbl; - nvm_op_set_bb_fn *set_bb_tbl; - - nvm_submit_io_fn *submit_io; - nvm_erase_blk_fn *erase_block; - - nvm_create_dma_pool_fn *create_dma_pool; - nvm_destroy_dma_pool_fn *destroy_dma_pool; - nvm_dev_dma_alloc_fn *dev_dma_alloc; - nvm_dev_dma_free_fn *dev_dma_free; - - unsigned int max_phys_sect; -}; - struct nvm_lun { int id; int lun_id; int chnl_id; - unsigned int nr_inuse_blocks; /* Number of used blocks */ + /* It is up to the target to mark blocks as closed. If the target does + * not do it, all blocks are marked as open, and nr_open_blocks + * represents the number of blocks in use + */ + unsigned int nr_open_blocks; /* Number of used, writable blocks */ + unsigned int nr_closed_blocks; /* Number of used, read-only blocks */ unsigned int nr_free_blocks; /* Number of unused blocks */ unsigned int nr_bad_blocks; /* Number of bad blocks */ - struct nvm_block *blocks; spinlock_t lock; + + struct nvm_block *blocks; +}; + +enum { + NVM_BLK_ST_FREE = 0x1, /* Free block */ + NVM_BLK_ST_OPEN = 0x2, /* Open block - read-write */ + NVM_BLK_ST_CLOSED = 0x4, /* Closed block - read-only */ + NVM_BLK_ST_BAD = 0x8, /* Bad block */ }; struct nvm_block { @@ -234,7 +286,16 @@ struct nvm_block { unsigned long id; void *priv; - int type; + int state; +}; + +/* system block cpu representation */ +struct nvm_sb_info { + unsigned long seqnr; + unsigned long erase_cnt; + unsigned int version; + char mmtype[NVM_MMTYPE_LEN]; + struct ppa_addr fs_ppa; }; struct nvm_dev { @@ -247,6 +308,9 @@ struct nvm_dev { struct nvmm_type *mt; void *mp; + /* System blocks */ + struct nvm_sb_info sb; + /* Device information */ int nr_chnls; int nr_planes; @@ -256,6 +320,7 @@ struct nvm_dev { int blks_per_lun; int sec_size; int oob_size; + int mccap; struct nvm_addr_format ppaf; /* Calculated/Cached values. These do not reflect the actual usable @@ -268,6 +333,10 @@ struct nvm_dev { int sec_per_blk; int sec_per_lun; + /* lower page table */ + int lps_per_blk; + int *lptbl; + unsigned long total_pages; unsigned long total_blocks; int nr_luns; @@ -280,6 +349,8 @@ struct nvm_dev { /* Backend device */ struct request_queue *q; char name[DISK_NAME_LEN]; + + struct mutex mlock; }; static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev, @@ -345,9 +416,13 @@ static inline struct ppa_addr block_to_ppa(struct nvm_dev *dev, return ppa; } +static inline int ppa_to_slc(struct nvm_dev *dev, int slc_pg) +{ + return dev->lptbl[slc_pg]; +} + typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); typedef sector_t (nvm_tgt_capacity_fn)(void *); -typedef int (nvm_tgt_end_io_fn)(struct nvm_rq *, int); typedef void *(nvm_tgt_init_fn)(struct nvm_dev *, struct gendisk *, int, int); typedef void (nvm_tgt_exit_fn)(void *); @@ -358,7 +433,7 @@ struct nvm_tgt_type { /* target entry points */ nvm_tgt_make_rq_fn *make_rq; nvm_tgt_capacity_fn *capacity; - nvm_tgt_end_io_fn *end_io; + nvm_end_io_fn *end_io; /* module-specific init/teardown */ nvm_tgt_init_fn *init; @@ -383,7 +458,6 @@ typedef int (nvmm_open_blk_fn)(struct nvm_dev *, struct nvm_block *); typedef int (nvmm_close_blk_fn)(struct nvm_dev *, struct nvm_block *); typedef void (nvmm_flush_blk_fn)(struct nvm_dev *, struct nvm_block *); typedef int (nvmm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); -typedef int (nvmm_end_io_fn)(struct nvm_rq *, int); typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct nvm_block *, unsigned long); typedef struct nvm_lun *(nvmm_get_lun_fn)(struct nvm_dev *, int); @@ -397,6 +471,8 @@ struct nvmm_type { nvmm_unregister_fn *unregister_mgr; /* Block administration callbacks */ + nvmm_get_blk_fn *get_blk_unlocked; + nvmm_put_blk_fn *put_blk_unlocked; nvmm_get_blk_fn *get_blk; nvmm_put_blk_fn *put_blk; nvmm_open_blk_fn *open_blk; @@ -404,7 +480,6 @@ struct nvmm_type { nvmm_flush_blk_fn *flush_blk; nvmm_submit_io_fn *submit_io; - nvmm_end_io_fn *end_io; nvmm_erase_blk_fn *erase_blk; /* Configuration management */ @@ -418,6 +493,10 @@ struct nvmm_type { extern int nvm_register_mgr(struct nvmm_type *); extern void nvm_unregister_mgr(struct nvmm_type *); +extern struct nvm_block *nvm_get_blk_unlocked(struct nvm_dev *, + struct nvm_lun *, unsigned long); +extern void nvm_put_blk_unlocked(struct nvm_dev *, struct nvm_block *); + extern struct nvm_block *nvm_get_blk(struct nvm_dev *, struct nvm_lun *, unsigned long); extern void nvm_put_blk(struct nvm_dev *, struct nvm_block *); @@ -427,7 +506,36 @@ extern int nvm_register(struct request_queue *, char *, extern void nvm_unregister(char *); extern int nvm_submit_io(struct nvm_dev *, struct nvm_rq *); +extern void nvm_generic_to_addr_mode(struct nvm_dev *, struct nvm_rq *); +extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *); +extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *, + struct ppa_addr *, int); +extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *); +extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr *, int); extern int nvm_erase_blk(struct nvm_dev *, struct nvm_block *); +extern void nvm_end_io(struct nvm_rq *, int); +extern int nvm_submit_ppa(struct nvm_dev *, struct ppa_addr *, int, int, int, + void *, int); + +/* sysblk.c */ +#define NVM_SYSBLK_MAGIC 0x4E564D53 /* "NVMS" */ + +/* system block on disk representation */ +struct nvm_system_block { + __be32 magic; /* magic signature */ + __be32 seqnr; /* sequence number */ + __be32 erase_cnt; /* erase count */ + __be16 version; /* version number */ + u8 mmtype[NVM_MMTYPE_LEN]; /* media manager name */ + __be64 fs_ppa; /* PPA for media manager + * superblock */ +}; + +extern int nvm_get_sysblock(struct nvm_dev *, struct nvm_sb_info *); +extern int nvm_update_sysblock(struct nvm_dev *, struct nvm_sb_info *); +extern int nvm_init_sysblock(struct nvm_dev *, struct nvm_sb_info *); + +extern int nvm_dev_factory(struct nvm_dev *, int flags); #else /* CONFIG_NVM */ struct nvm_dev_ops; diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h index 46262284de47..04fc6e6c7ff0 100644 --- a/include/linux/lru_cache.h +++ b/include/linux/lru_cache.h @@ -264,7 +264,7 @@ extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e); extern void lc_committed(struct lru_cache *lc); struct seq_file; -extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc); +extern void lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc); extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext, void (*detail) (struct seq_file *, struct lc_element *)); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 3af5f454c04a..a55986f6fe38 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -17,20 +17,19 @@ #include <linux/types.h> -struct nvme_bar { - __u64 cap; /* Controller Capabilities */ - __u32 vs; /* Version */ - __u32 intms; /* Interrupt Mask Set */ - __u32 intmc; /* Interrupt Mask Clear */ - __u32 cc; /* Controller Configuration */ - __u32 rsvd1; /* Reserved */ - __u32 csts; /* Controller Status */ - __u32 nssr; /* Subsystem Reset */ - __u32 aqa; /* Admin Queue Attributes */ - __u64 asq; /* Admin SQ Base Address */ - __u64 acq; /* Admin CQ Base Address */ - __u32 cmbloc; /* Controller Memory Buffer Location */ - __u32 cmbsz; /* Controller Memory Buffer Size */ +enum { + NVME_REG_CAP = 0x0000, /* Controller Capabilities */ + NVME_REG_VS = 0x0008, /* Version */ + NVME_REG_INTMS = 0x000c, /* Interrupt Mask Set */ + NVME_REG_INTMC = 0x0010, /* Interrupt Mask Set */ + NVME_REG_CC = 0x0014, /* Controller Configuration */ + NVME_REG_CSTS = 0x001c, /* Controller Status */ + NVME_REG_NSSR = 0x0020, /* NVM Subsystem Reset */ + NVME_REG_AQA = 0x0024, /* Admin Queue Attributes */ + NVME_REG_ASQ = 0x0028, /* Admin SQ Base Address */ + NVME_REG_ACQ = 0x0030, /* Admin SQ Base Address */ + NVME_REG_CMBLOC = 0x0038, /* Controller Memory Buffer Location */ + NVME_REG_CMBSZ = 0x003c, /* Controller Memory Buffer Size */ }; #define NVME_CAP_MQES(cap) ((cap) & 0xffff) diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index c2e5d6cb34e3..ebd10e624598 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -307,7 +307,7 @@ header-y += nfs_mount.h header-y += nl80211.h header-y += n_r3964.h header-y += nubus.h -header-y += nvme.h +header-y += nvme_ioctl.h header-y += nvram.h header-y += omap3isp.h header-y += omapfb.h diff --git a/include/uapi/linux/lightnvm.h b/include/uapi/linux/lightnvm.h index 928f98997d8a..774a43128a7a 100644 --- a/include/uapi/linux/lightnvm.h +++ b/include/uapi/linux/lightnvm.h @@ -33,6 +33,7 @@ #define NVM_TTYPE_NAME_MAX 48 #define NVM_TTYPE_MAX 63 +#define NVM_MMTYPE_LEN 8 #define NVM_CTRL_FILE "/dev/lightnvm/control" @@ -100,6 +101,26 @@ struct nvm_ioctl_remove { __u32 flags; }; +struct nvm_ioctl_dev_init { + char dev[DISK_NAME_LEN]; /* open-channel SSD device */ + char mmtype[NVM_MMTYPE_LEN]; /* register to media manager */ + + __u32 flags; +}; + +enum { + NVM_FACTORY_ERASE_ONLY_USER = 1 << 0, /* erase only blocks used as + * host blks or grown blks */ + NVM_FACTORY_RESET_HOST_BLKS = 1 << 1, /* remove host blk marks */ + NVM_FACTORY_RESET_GRWN_BBLKS = 1 << 2, /* remove grown blk marks */ + NVM_FACTORY_NR_BITS = 1 << 3, /* stops here */ +}; + +struct nvm_ioctl_dev_factory { + char dev[DISK_NAME_LEN]; + + __u32 flags; +}; /* The ioctl type, 'L', 0x20 - 0x2F documented in ioctl-number.txt */ enum { @@ -110,6 +131,12 @@ enum { /* device level cmds */ NVM_DEV_CREATE_CMD, NVM_DEV_REMOVE_CMD, + + /* Init a device to support LightNVM media managers */ + NVM_DEV_INIT_CMD, + + /* Factory reset device */ + NVM_DEV_FACTORY_CMD, }; #define NVM_IOCTL 'L' /* 0x4c */ @@ -122,6 +149,10 @@ enum { struct nvm_ioctl_create) #define NVM_DEV_REMOVE _IOW(NVM_IOCTL, NVM_DEV_REMOVE_CMD, \ struct nvm_ioctl_remove) +#define NVM_DEV_INIT _IOW(NVM_IOCTL, NVM_DEV_INIT_CMD, \ + struct nvm_ioctl_dev_init) +#define NVM_DEV_FACTORY _IOW(NVM_IOCTL, NVM_DEV_FACTORY_CMD, \ + struct nvm_ioctl_dev_factory) #define NVM_VERSION_MAJOR 1 #define NVM_VERSION_MINOR 0 diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h index c33e1c489eb2..8b8cfadf7833 100644 --- a/include/xen/interface/io/blkif.h +++ b/include/xen/interface/io/blkif.h @@ -28,6 +28,54 @@ typedef uint16_t blkif_vdev_t; typedef uint64_t blkif_sector_t; /* + * Multiple hardware queues/rings: + * If supported, the backend will write the key "multi-queue-max-queues" to + * the directory for that vbd, and set its value to the maximum supported + * number of queues. + * Frontends that are aware of this feature and wish to use it can write the + * key "multi-queue-num-queues" with the number they wish to use, which must be + * greater than zero, and no more than the value reported by the backend in + * "multi-queue-max-queues". + * + * For frontends requesting just one queue, the usual event-channel and + * ring-ref keys are written as before, simplifying the backend processing + * to avoid distinguishing between a frontend that doesn't understand the + * multi-queue feature, and one that does, but requested only one queue. + * + * Frontends requesting two or more queues must not write the toplevel + * event-channel and ring-ref keys, instead writing those keys under sub-keys + * having the name "queue-N" where N is the integer ID of the queue/ring for + * which those keys belong. Queues are indexed from zero. + * For example, a frontend with two queues must write the following set of + * queue-related keys: + * + * /local/domain/1/device/vbd/0/multi-queue-num-queues = "2" + * /local/domain/1/device/vbd/0/queue-0 = "" + * /local/domain/1/device/vbd/0/queue-0/ring-ref = "<ring-ref#0>" + * /local/domain/1/device/vbd/0/queue-0/event-channel = "<evtchn#0>" + * /local/domain/1/device/vbd/0/queue-1 = "" + * /local/domain/1/device/vbd/0/queue-1/ring-ref = "<ring-ref#1>" + * /local/domain/1/device/vbd/0/queue-1/event-channel = "<evtchn#1>" + * + * It is also possible to use multiple queues/rings together with + * feature multi-page ring buffer. + * For example, a frontend requests two queues/rings and the size of each ring + * buffer is two pages must write the following set of related keys: + * + * /local/domain/1/device/vbd/0/multi-queue-num-queues = "2" + * /local/domain/1/device/vbd/0/ring-page-order = "1" + * /local/domain/1/device/vbd/0/queue-0 = "" + * /local/domain/1/device/vbd/0/queue-0/ring-ref0 = "<ring-ref#0>" + * /local/domain/1/device/vbd/0/queue-0/ring-ref1 = "<ring-ref#1>" + * /local/domain/1/device/vbd/0/queue-0/event-channel = "<evtchn#0>" + * /local/domain/1/device/vbd/0/queue-1 = "" + * /local/domain/1/device/vbd/0/queue-1/ring-ref0 = "<ring-ref#2>" + * /local/domain/1/device/vbd/0/queue-1/ring-ref1 = "<ring-ref#3>" + * /local/domain/1/device/vbd/0/queue-1/event-channel = "<evtchn#1>" + * + */ + +/* * REQUEST CODES. */ #define BLKIF_OP_READ 0 diff --git a/lib/lru_cache.c b/lib/lru_cache.c index 028f5d996eef..28ba40b99337 100644 --- a/lib/lru_cache.c +++ b/lib/lru_cache.c @@ -238,7 +238,7 @@ void lc_reset(struct lru_cache *lc) * @seq: the seq_file to print into * @lc: the lru cache to print statistics of */ -size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc) +void lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc) { /* NOTE: * total calls to lc_get are @@ -250,8 +250,6 @@ size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc) seq_printf(seq, "\t%s: used:%u/%u hits:%lu misses:%lu starving:%lu locked:%lu changed:%lu\n", lc->name, lc->used, lc->nr_elements, lc->hits, lc->misses, lc->starving, lc->locked, lc->changed); - - return 0; } static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr) |