author     Linus Torvalds <torvalds@linux-foundation.org>  2024-11-18 16:50:08 -0800
committer  Linus Torvalds <torvalds@linux-foundation.org>  2024-11-18 16:50:08 -0800
commit     77a0cfafa9af9c0d5b43534eb90d530c189edca1
tree       30e513ad7c9732213cede4c00bb1651e1a08b023 /block
parent     3d1b536c13f7cd966aa660d1730855b26d01c9ae
parent     88d47f629313730f26a3b00224d1e1a5e3b7bb79
Merge tag 'for-6.13/block-20241118' of git://git.kernel.dk/linux
Pull block updates from Jens Axboe:
- NVMe updates via Keith:
     - Use uring_cmd helper (Pavel)
     - Host Memory Buffer allocation enhancements (Christoph)
     - Target persistent reservation support (Guixin)
     - Persistent reservation tracing (Guixin)
     - NVMe 2.1 specification support (Keith)
     - Rotational Meta Support (Matias, Wang, Keith)
     - Volatile cache detection enhancement (Guixin)
- MD updates via Song:
     - Maintainers update
     - raid5 sync IO fix
     - Enhance handling of faulty and blocked devices
     - raid5-ppl atomic improvement
     - md-bitmap fix
- Support for manually defining embedded partition tables
- Zone append fixes and cleanups
- Stop sending the queued requests in the plug list to the driver's
  ->queue_rqs() handler in reverse order (a sketch of the new list type
  follows this summary)
- Zoned write plug cleanups
- Clean up disk stats tracking and add support for disk stats for
  passthrough IO
- Add preparatory support for file system atomic writes
- Add lockdep support for queue freezing. Already found a bunch of
issues, and some fixes for that are in here. More will be coming.
- Fix race between queue stopping/quiescing and IO queueing
- ublk recovery improvements
- Fix ublk mmap for 64k pages
- Various fixes and cleanups
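
For context on the plug-list ordering fix and the new rq_list type mentioned
above (and in the shortlog below), a minimal sketch of the head+tail list
idea, using a simplified stand-in for struct request rather than the real
definition in include/linux/blk-mq.h:

    /* Simplified stand-in; not the kernel's struct request. */
    struct request {
            struct request *rq_next;
    };

    struct rq_list {
            struct request *head;
            struct request *tail;
    };

    /* O(1) append that preserves FIFO submission order. */
    static void rq_list_add_tail(struct rq_list *rl, struct request *rq)
    {
            rq->rq_next = NULL;
            if (rl->tail)
                    rl->tail->rq_next = rq;
            else
                    rl->head = rq;
            rl->tail = rq;
    }

    /* Pop from the head: requests come back in insertion order. */
    static struct request *rq_list_pop(struct rq_list *rl)
    {
            struct request *rq = rl->head;

            if (rq) {
                    rl->head = rq->rq_next;
                    if (!rl->head)
                            rl->tail = NULL;
            }
            return rq;
    }

Appending at the tail keeps FIFO order, so ->queue_rqs() now sees requests in
submission order; the old single head pointer could only push at the head,
which reversed each batch.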
* tag 'for-6.13/block-20241118' of git://git.kernel.dk/linux: (118 commits)
MAINTAINERS: Update git tree for mdraid subsystem
block: make struct rq_list available for !CONFIG_BLOCK
block/genhd: use seq_put_decimal_ull for diskstats decimal values
block: don't reorder requests in blk_mq_add_to_batch
block: don't reorder requests in blk_add_rq_to_plug
block: add a rq_list type
block: remove rq_list_move
virtio_blk: reverse request order in virtio_queue_rqs
nvme-pci: reverse request order in nvme_queue_rqs
btrfs: validate queue limits
block: export blk_validate_limits
nvmet: add tracing of reservation commands
nvme: parse reservation commands's action and rtype to string
nvmet: report ns's vwc not present
md/raid5: Increase r5conf.cache_name size
block: remove the ioprio field from struct request
block: remove the write_hint field from struct request
nvme: check ns's volatile write cache not present
nvme: add rotational support
nvme: use command set independent id ns if available
...
Diffstat (limited to 'block')
 block/bio-integrity.c       |  13
 block/bio.c                 |  81
 block/blk-core.c            |  26
 block/blk-crypto-fallback.c |   2
 block/blk-integrity.c       |   4
 block/blk-ioc.c             |   9
 block/blk-merge.c           | 107
 block/blk-mq.c              | 307
 block/blk-mq.h              |  15
 block/blk-rq-qos.c          |   4
 block/blk-settings.c        |  40
 block/blk-sysfs.c           |  80
 block/blk-throttle.c        |  76
 block/blk-zoned.c           |  68
 block/blk.h                 |  52
 block/elevator.c            |  18
 block/elevator.h            |   4
 block/genhd.c               | 136
 block/partitions/Kconfig    |   9
 block/partitions/Makefile   |   1
 block/partitions/check.h    |   1
 block/partitions/cmdline.c  |   3
 block/partitions/core.c     |   8
 block/partitions/of.c       | 110
 block/sed-opal.c            |  26
 25 files changed, 760 insertions(+), 440 deletions(-)
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 88e3ad73c385..2a4bd6611692 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -199,7 +199,7 @@ EXPORT_SYMBOL(bio_integrity_add_page);
 
 static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec,
                                    int nr_vecs, unsigned int len,
-                                   unsigned int direction, u32 seed)
+                                   unsigned int direction)
 {
         bool write = direction == ITER_SOURCE;
         struct bio_integrity_payload *bip;
@@ -247,7 +247,6 @@ static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec,
         }
 
         bip->bip_flags |= BIP_COPY_USER;
-        bip->bip_iter.bi_sector = seed;
         bip->bip_vcnt = nr_vecs;
         return 0;
 free_bip:
@@ -258,7 +257,7 @@ free_buf:
 }
 
 static int bio_integrity_init_user(struct bio *bio, struct bio_vec *bvec,
-                                   int nr_vecs, unsigned int len, u32 seed)
+                                   int nr_vecs, unsigned int len)
 {
         struct bio_integrity_payload *bip;
 
@@ -267,7 +266,6 @@ static int bio_integrity_init_user(struct bio *bio, struct bio_vec *bvec,
                 return PTR_ERR(bip);
 
         memcpy(bip->bip_vec, bvec, nr_vecs * sizeof(*bvec));
-        bip->bip_iter.bi_sector = seed;
         bip->bip_iter.bi_size = len;
         bip->bip_vcnt = nr_vecs;
         return 0;
@@ -303,8 +301,7 @@ static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages,
         return nr_bvecs;
 }
 
-int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes,
-                           u32 seed)
+int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes)
 {
         struct request_queue *q = bdev_get_queue(bio->bi_bdev);
         unsigned int align = blk_lim_dma_alignment_and_pad(&q->limits);
@@ -350,9 +347,9 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes,
 
         if (copy)
                 ret = bio_integrity_copy_user(bio, bvec, nr_bvecs, bytes,
-                                              direction, seed);
+                                              direction);
         else
-                ret = bio_integrity_init_user(bio, bvec, nr_bvecs, bytes, seed);
+                ret = bio_integrity_init_user(bio, bvec, nr_bvecs, bytes);
         if (ret)
                 goto release_pages;
         if (bvec != stack_vec)
diff --git a/block/bio.c b/block/bio.c
index ac4d77c88932..699a78c85c75 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1065,39 +1065,6 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio,
 EXPORT_SYMBOL(bio_add_pc_page);
 
 /**
- * bio_add_zone_append_page - attempt to add page to zone-append bio
- * @bio: destination bio
- * @page: page to add
- * @len: vec entry length
- * @offset: vec entry offset
- *
- * Attempt to add a page to the bio_vec maplist of a bio that will be submitted
- * for a zone-append request. This can fail for a number of reasons, such as the
- * bio being full or the target block device is not a zoned block device or
- * other limitations of the target block device. The target block device must
- * allow bio's up to PAGE_SIZE, so it is always possible to add a single page
- * to an empty bio.
- *
- * Returns: number of bytes added to the bio, or 0 in case of a failure.
- */
-int bio_add_zone_append_page(struct bio *bio, struct page *page,
-                             unsigned int len, unsigned int offset)
-{
-        struct request_queue *q = bdev_get_queue(bio->bi_bdev);
-        bool same_page = false;
-
-        if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND))
-                return 0;
-
-        if (WARN_ON_ONCE(!bdev_is_zoned(bio->bi_bdev)))
-                return 0;
-
-        return bio_add_hw_page(q, bio, page, len, offset,
-                               queue_max_zone_append_sectors(q), &same_page);
-}
-EXPORT_SYMBOL_GPL(bio_add_zone_append_page);
-
-/**
  * __bio_add_page - add page(s) to a bio in a new segment
  * @bio: destination bio
  * @page: start page to add
@@ -1206,21 +1173,12 @@ EXPORT_SYMBOL_GPL(__bio_release_pages);
 
 void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
 {
-        size_t size = iov_iter_count(iter);
-
         WARN_ON_ONCE(bio->bi_max_vecs);
 
-        if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
-                struct request_queue *q = bdev_get_queue(bio->bi_bdev);
-                size_t max_sectors = queue_max_zone_append_sectors(q);
-
-                size = min(size, max_sectors << SECTOR_SHIFT);
-        }
-
         bio->bi_vcnt = iter->nr_segs;
         bio->bi_io_vec = (struct bio_vec *)iter->bvec;
         bio->bi_iter.bi_bvec_done = iter->iov_offset;
-        bio->bi_iter.bi_size = size;
+        bio->bi_iter.bi_size = iov_iter_count(iter);
         bio_set_flag(bio, BIO_CLONED);
 }
 
@@ -1245,20 +1203,6 @@ static int bio_iov_add_folio(struct bio *bio, struct folio *folio, size_t len,
         return 0;
 }
 
-static int bio_iov_add_zone_append_folio(struct bio *bio, struct folio *folio,
-                                         size_t len, size_t offset)
-{
-        struct request_queue *q = bdev_get_queue(bio->bi_bdev);
-        bool same_page = false;
-
-        if (bio_add_hw_folio(q, bio, folio, len, offset,
-                        queue_max_zone_append_sectors(q), &same_page) != len)
-                return -EINVAL;
-        if (same_page && bio_flagged(bio, BIO_PAGE_PINNED))
-                unpin_user_folio(folio, 1);
-        return 0;
-}
-
 static unsigned int get_contig_folio_len(unsigned int *num_pages,
                                          struct page **pages, unsigned int i,
                                          struct folio *folio, size_t left,
@@ -1365,14 +1309,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
                         len = get_contig_folio_len(&num_pages, pages, i,
                                                    folio, left, offset);
 
-                if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
-                        ret = bio_iov_add_zone_append_folio(bio, folio, len,
-                                        folio_offset);
-                        if (ret)
-                                break;
-                } else
-                        bio_iov_add_folio(bio, folio, len, folio_offset);
-
+                bio_iov_add_folio(bio, folio, len, folio_offset);
                 offset = 0;
         }
 
@@ -1728,16 +1665,22 @@ struct bio *bio_split(struct bio *bio, int sectors,
 {
         struct bio *split;
 
-        BUG_ON(sectors <= 0);
-        BUG_ON(sectors >= bio_sectors(bio));
+        if (WARN_ON_ONCE(sectors <= 0))
+                return ERR_PTR(-EINVAL);
+        if (WARN_ON_ONCE(sectors >= bio_sectors(bio)))
+                return ERR_PTR(-EINVAL);
 
         /* Zone append commands cannot be split */
         if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND))
-                return NULL;
+                return ERR_PTR(-EINVAL);
+
+        /* atomic writes cannot be split */
+        if (bio->bi_opf & REQ_ATOMIC)
+                return ERR_PTR(-EINVAL);
 
         split = bio_alloc_clone(bio->bi_bdev, bio, gfp, bs);
         if (!split)
-                return NULL;
+                return ERR_PTR(-ENOMEM);
 
         split->bi_iter.bi_size = sectors << 9;
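
A note on the error convention adopted by bio_split() above: failures are now
reported as ERR_PTR() codes rather than NULL, so callers test with
IS_ERR()/PTR_ERR(); the blk-crypto-fallback hunk below is one such
conversion. A hypothetical caller (submit_front_half is not a kernel
function) showing the idiom:

    static int submit_front_half(struct bio *bio, int sectors,
                                 struct bio_set *bs)
    {
            struct bio *split = bio_split(bio, sectors, GFP_NOIO, bs);

            if (IS_ERR(split))
                    return PTR_ERR(split);  /* -EINVAL or -ENOMEM */

            /* ... chain 'split' in front of 'bio' and submit it ... */
            return 0;
    }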
diff --git a/block/blk-core.c b/block/blk-core.c
index bc5e8c5eaac9..666efe8fa202 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -261,6 +261,8 @@ static void blk_free_queue(struct request_queue *q)
                 blk_mq_release(q);
 
         ida_free(&blk_queue_ida, q->id);
+        lockdep_unregister_key(&q->io_lock_cls_key);
+        lockdep_unregister_key(&q->q_lock_cls_key);
         call_rcu(&q->rcu_head, blk_free_queue_rcu);
 }
 
@@ -278,18 +280,20 @@ void blk_put_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_put_queue);
 
-void blk_queue_start_drain(struct request_queue *q)
+bool blk_queue_start_drain(struct request_queue *q)
 {
         /*
          * When queue DYING flag is set, we need to block new req
          * entering queue, so we call blk_freeze_queue_start() to
          * prevent I/O from crossing blk_queue_enter().
          */
-        blk_freeze_queue_start(q);
+        bool freeze = __blk_freeze_queue_start(q, current);
+
         if (queue_is_mq(q))
                 blk_mq_wake_waiters(q);
         /* Make blk_queue_enter() reexamine the DYING flag. */
         wake_up_all(&q->mq_freeze_wq);
+
+        return freeze;
 }
 
 /**
@@ -321,6 +325,8 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
                         return -ENODEV;
         }
 
+        rwsem_acquire_read(&q->q_lockdep_map, 0, 0, _RET_IP_);
+        rwsem_release(&q->q_lockdep_map, _RET_IP_);
         return 0;
 }
 
@@ -352,6 +358,8 @@ int __bio_queue_enter(struct request_queue *q, struct bio *bio)
                         goto dead;
         }
 
+        rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_);
+        rwsem_release(&q->io_lockdep_map, _RET_IP_);
         return 0;
 dead:
         bio_io_error(bio);
@@ -441,6 +449,12 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id)
                                 PERCPU_REF_INIT_ATOMIC, GFP_KERNEL);
         if (error)
                 goto fail_stats;
+        lockdep_register_key(&q->io_lock_cls_key);
+        lockdep_register_key(&q->q_lock_cls_key);
+        lockdep_init_map(&q->io_lockdep_map, "&q->q_usage_counter(io)",
+                         &q->io_lock_cls_key, 0);
+        lockdep_init_map(&q->q_lockdep_map, "&q->q_usage_counter(queue)",
+                         &q->q_lock_cls_key, 0);
 
         q->nr_requests = BLKDEV_DEFAULT_RQ;
 
@@ -593,7 +607,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q,
                 return BLK_STS_IOERR;
 
         /* Make sure the BIO is small enough and will not get split */
-        if (nr_sectors > queue_max_zone_append_sectors(q))
+        if (nr_sectors > q->limits.max_zone_append_sectors)
                 return BLK_STS_IOERR;
 
         bio->bi_opf |= REQ_NOMERGE;
@@ -1106,8 +1120,8 @@ void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios)
                 return;
 
         plug->cur_ktime = 0;
-        plug->mq_list = NULL;
-        plug->cached_rq = NULL;
+        rq_list_init(&plug->mq_list);
+        rq_list_init(&plug->cached_rqs);
         plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT);
         plug->rq_count = 0;
         plug->multiple_queues = false;
@@ -1203,7 +1217,7 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule)
          * queue for cached requests, we don't want a blocked task holding
          * up a queue freeze/quiesce event.
          */
-        if (unlikely(!rq_list_empty(plug->cached_rq)))
+        if (unlikely(!rq_list_empty(&plug->cached_rqs)))
                 blk_mq_free_plug_rqs(plug);
 
         plug->cur_ktime = 0;
diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
index b1e7415f8439..29a205482617 100644
--- a/block/blk-crypto-fallback.c
+++ b/block/blk-crypto-fallback.c
@@ -226,7 +226,7 @@ static bool blk_crypto_fallback_split_bio_if_needed(struct bio **bio_ptr)
 
                 split_bio = bio_split(bio, num_sectors, GFP_NOIO,
                                       &crypto_bio_split);
-                if (!split_bio) {
+                if (IS_ERR(split_bio)) {
                         bio->bi_status = BLK_STS_RESOURCE;
                         return false;
                 }
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 83b696ba0cac..b180cac61a9d 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -113,9 +113,9 @@ new_segment:
 EXPORT_SYMBOL(blk_rq_map_integrity_sg);
 
 int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf,
-                              ssize_t bytes, u32 seed)
+                              ssize_t bytes)
 {
-        int ret = bio_integrity_map_user(rq->bio, ubuf, bytes, seed);
+        int ret = bio_integrity_map_user(rq->bio, ubuf, bytes);
 
         if (ret)
                 return ret;
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 25dd4db11121..ce82770c72ab 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -32,13 +32,6 @@ static void get_io_context(struct io_context *ioc)
         atomic_long_inc(&ioc->refcount);
 }
 
-static void icq_free_icq_rcu(struct rcu_head *head)
-{
-        struct io_cq *icq = container_of(head, struct io_cq, __rcu_head);
-
-        kmem_cache_free(icq->__rcu_icq_cache, icq);
-}
-
 /*
  * Exit an icq. Called with ioc locked for blk-mq, and with both ioc
  * and queue locked for legacy.
@@ -102,7 +95,7 @@ static void ioc_destroy_icq(struct io_cq *icq)
          */
         icq->__rcu_icq_cache = et->icq_cache;
         icq->flags |= ICQ_DESTROYED;
-        call_rcu(&icq->__rcu_head, icq_free_icq_rcu);
+        kfree_rcu(icq, __rcu_head);
 }
 
 /*
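
The blk-ioc hunk above replaces a hand-written RCU callback with kfree_rcu(),
which defers the free until after a grace period without naming a callback.
A generic sketch of the two equivalent patterns; struct item is illustrative,
not kernel code:

    struct item {
            struct rcu_head rcu;
            int data;
    };

    /* Open-coded form, like the one the blk-ioc patch removes. */
    static void item_free_rcu(struct rcu_head *head)
    {
            kfree(container_of(head, struct item, rcu));
    }

    static void item_release_old(struct item *it)
    {
            call_rcu(&it->rcu, item_free_rcu);
    }

    /* Equivalent one-liner: defer kfree() until after a grace period. */
    static void item_release_new(struct item *it)
    {
            kfree_rcu(it, rcu);
    }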
diff --git a/block/blk-merge.c b/block/blk-merge.c
index ad763ec313b6..e0b28e9298c9 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -107,17 +107,18 @@ static unsigned int bio_allowed_max_sectors(const struct queue_limits *lim)
 
 static struct bio *bio_submit_split(struct bio *bio, int split_sectors)
 {
-        if (unlikely(split_sectors < 0)) {
-                bio->bi_status = errno_to_blk_status(split_sectors);
-                bio_endio(bio);
-                return NULL;
-        }
+        if (unlikely(split_sectors < 0))
+                goto error;
 
         if (split_sectors) {
                 struct bio *split;
 
                 split = bio_split(bio, split_sectors, GFP_NOIO,
                                 &bio->bi_bdev->bd_disk->bio_split);
+                if (IS_ERR(split)) {
+                        split_sectors = PTR_ERR(split);
+                        goto error;
+                }
                 split->bi_opf |= REQ_NOMERGE;
                 blkcg_bio_issue_init(split);
                 bio_chain(split, bio);
@@ -128,6 +129,10 @@ static struct bio *bio_submit_split(struct bio *bio, int split_sectors)
         }
 
         return bio;
+error:
+        bio->bi_status = errno_to_blk_status(split_sectors);
+        bio_endio(bio);
+        return NULL;
 }
 
 struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim,
@@ -166,17 +171,6 @@ struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim,
         return bio_submit_split(bio, split_sectors);
 }
 
-struct bio *bio_split_write_zeroes(struct bio *bio,
-                const struct queue_limits *lim, unsigned *nsegs)
-{
-        *nsegs = 0;
-        if (!lim->max_write_zeroes_sectors)
-                return bio;
-        if (bio_sectors(bio) <= lim->max_write_zeroes_sectors)
-                return bio;
-        return bio_submit_split(bio, lim->max_write_zeroes_sectors);
-}
-
 static inline unsigned int blk_boundary_sectors(const struct queue_limits *lim,
                                                 bool is_atomic)
 {
@@ -211,7 +205,9 @@ static inline unsigned get_max_io_size(struct bio *bio,
          * We ignore lim->max_sectors for atomic writes because it may less
          * than the actual bio size, which we cannot tolerate.
          */
-        if (is_atomic)
+        if (bio_op(bio) == REQ_OP_WRITE_ZEROES)
+                max_sectors = lim->max_write_zeroes_sectors;
+        else if (is_atomic)
                 max_sectors = lim->atomic_write_max_sectors;
         else
                 max_sectors = lim->max_sectors;
@@ -296,6 +292,14 @@ static bool bvec_split_segs(const struct queue_limits *lim,
         return len > 0 || bv->bv_len > max_len;
 }
 
+static unsigned int bio_split_alignment(struct bio *bio,
+                const struct queue_limits *lim)
+{
+        if (op_is_write(bio_op(bio)) && lim->zone_write_granularity)
+                return lim->zone_write_granularity;
+        return lim->logical_block_size;
+}
+
 /**
  * bio_split_rw_at - check if and where to split a read/write bio
  * @bio:  [in] bio to be split
@@ -358,7 +362,7 @@ split:
          * split size so that each bio is properly block size aligned, even if
          * we do not use the full hardware limits.
          */
-        bytes = ALIGN_DOWN(bytes, lim->logical_block_size);
+        bytes = ALIGN_DOWN(bytes, bio_split_alignment(bio, lim));
 
         /*
          * Bio splitting may cause subtle trouble such as hang when doing sync
@@ -388,16 +392,35 @@ struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
 struct bio *bio_split_zone_append(struct bio *bio,
                 const struct queue_limits *lim, unsigned *nr_segs)
 {
-        unsigned int max_sectors = queue_limits_max_zone_append_sectors(lim);
         int split_sectors;
 
         split_sectors = bio_split_rw_at(bio, lim, nr_segs,
-                        max_sectors << SECTOR_SHIFT);
+                        lim->max_zone_append_sectors << SECTOR_SHIFT);
         if (WARN_ON_ONCE(split_sectors > 0))
                 split_sectors = -EINVAL;
         return bio_submit_split(bio, split_sectors);
 }
 
+struct bio *bio_split_write_zeroes(struct bio *bio,
+                const struct queue_limits *lim, unsigned *nsegs)
+{
+        unsigned int max_sectors = get_max_io_size(bio, lim);
+
+        *nsegs = 0;
+
+        /*
+         * An unset limit should normally not happen, as bio submission is keyed
+         * off having a non-zero limit. But SCSI can clear the limit in the
+         * I/O completion handler, and we can race and see this. Splitting to a
+         * zero limit obviously doesn't make sense, so band-aid it here.
+         */
+        if (!max_sectors)
+                return bio;
+        if (bio_sectors(bio) <= max_sectors)
+                return bio;
+        return bio_submit_split(bio, max_sectors);
+}
+
 /**
  * bio_split_to_limits - split a bio to fit the queue limits
  * @bio: bio to be split
@@ -411,10 +434,9 @@ struct bio *bio_split_zone_append(struct bio *bio,
  */
 struct bio *bio_split_to_limits(struct bio *bio)
 {
-        const struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits;
         unsigned int nr_segs;
 
-        return __bio_split_to_limits(bio, lim, &nr_segs);
+        return __bio_split_to_limits(bio, bdev_limits(bio->bi_bdev), &nr_segs);
 }
 EXPORT_SYMBOL(bio_split_to_limits);
 
@@ -797,7 +819,7 @@ static inline void blk_update_mixed_merge(struct request *req,
 
 static void blk_account_io_merge_request(struct request *req)
 {
-        if (blk_do_io_stat(req)) {
+        if (req->rq_flags & RQF_IO_STAT) {
                 part_stat_lock();
                 part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
                 part_stat_local_dec(req->part,
@@ -845,12 +867,13 @@ static struct request *attempt_merge(struct request_queue *q,
         if (rq_data_dir(req) != rq_data_dir(next))
                 return NULL;
 
-        /* Don't merge requests with different write hints. */
-        if (req->write_hint != next->write_hint)
-                return NULL;
-
-        if (req->ioprio != next->ioprio)
-                return NULL;
+        if (req->bio && next->bio) {
+                /* Don't merge requests with different write hints. */
+                if (req->bio->bi_write_hint != next->bio->bi_write_hint)
+                        return NULL;
+                if (req->bio->bi_ioprio != next->bio->bi_ioprio)
+                        return NULL;
+        }
 
         if (!blk_atomic_write_mergeable_rqs(req, next))
                 return NULL;
@@ -979,12 +1002,13 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
         if (!bio_crypt_rq_ctx_compatible(rq, bio))
                 return false;
 
-        /* Don't merge requests with different write hints. */
-        if (rq->write_hint != bio->bi_write_hint)
-                return false;
-
-        if (rq->ioprio != bio_prio(bio))
-                return false;
+        if (rq->bio) {
+                /* Don't merge requests with different write hints. */
+                if (rq->bio->bi_write_hint != bio->bi_write_hint)
+                        return false;
+                if (rq->bio->bi_ioprio != bio->bi_ioprio)
+                        return false;
+        }
 
         if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false)
                 return false;
@@ -1005,12 +1029,11 @@ enum elv_merge blk_try_merge(struct request *rq, struct bio *bio)
 
 static void blk_account_io_merge_bio(struct request *req)
 {
-        if (!blk_do_io_stat(req))
-                return;
-
-        part_stat_lock();
-        part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
-        part_stat_unlock();
+        if (req->rq_flags & RQF_IO_STAT) {
+                part_stat_lock();
+                part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
+                part_stat_unlock();
+        }
 }
 
 enum bio_merge_status bio_attempt_back_merge(struct request *req,
@@ -1156,7 +1179,7 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
         struct blk_plug *plug = current->plug;
         struct request *rq;
 
-        if (!plug || rq_list_empty(plug->mq_list))
+        if (!plug || rq_list_empty(&plug->mq_list))
                 return false;
 
         rq_list_for_each(&plug->mq_list, rq) {
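
bio_split_alignment() above makes write splits on zoned devices land on the
zone write granularity rather than just the logical block size. A standalone
sketch of the rounding, with hypothetical limits:

    /* ALIGN_DOWN(x, a): round x down to a multiple of a (a power of two). */
    static unsigned int split_bytes(unsigned int bytes, int is_write,
                                    unsigned int zone_write_granularity,
                                    unsigned int logical_block_size)
    {
            unsigned int align = logical_block_size;

            if (is_write && zone_write_granularity)
                    align = zone_write_granularity;

            /* e.g. bytes = 61440 stays 61440 when aligned to 4096, but
             * becomes 49152 when aligned to a 16384-byte granularity. */
            return bytes & ~(align - 1);
    }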
diff --git a/block/blk-mq.c b/block/blk-mq.c
index cf626e061dd7..270cfd9fc6b0 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -92,7 +92,7 @@ static bool blk_mq_check_inflight(struct request *rq, void *priv)
 {
         struct mq_inflight *mi = priv;
 
-        if (rq->part && blk_do_io_stat(rq) &&
+        if (rq->rq_flags & RQF_IO_STAT &&
             (!bdev_is_partition(mi->part) || rq->part == mi->part) &&
             blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
                 mi->inflight[rq_data_dir(rq)]++;
@@ -120,9 +120,59 @@ void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
         inflight[1] = mi.inflight[1];
 }
 
-void blk_freeze_queue_start(struct request_queue *q)
+#ifdef CONFIG_LOCKDEP
+static bool blk_freeze_set_owner(struct request_queue *q,
+                                 struct task_struct *owner)
+{
+        if (!owner)
+                return false;
+
+        if (!q->mq_freeze_depth) {
+                q->mq_freeze_owner = owner;
+                q->mq_freeze_owner_depth = 1;
+                return true;
+        }
+
+        if (owner == q->mq_freeze_owner)
+                q->mq_freeze_owner_depth += 1;
+        return false;
+}
+
+/* verify the last unfreeze in owner context */
+static bool blk_unfreeze_check_owner(struct request_queue *q)
+{
+        if (!q->mq_freeze_owner)
+                return false;
+        if (q->mq_freeze_owner != current)
+                return false;
+        if (--q->mq_freeze_owner_depth == 0) {
+                q->mq_freeze_owner = NULL;
+                return true;
+        }
+        return false;
+}
+
+#else
+
+static bool blk_freeze_set_owner(struct request_queue *q,
+                                 struct task_struct *owner)
 {
+        return false;
+}
+
+static bool blk_unfreeze_check_owner(struct request_queue *q)
+{
+        return false;
+}
+#endif
+
+bool __blk_freeze_queue_start(struct request_queue *q,
+                              struct task_struct *owner)
+{
+        bool freeze;
+
         mutex_lock(&q->mq_freeze_lock);
+        freeze = blk_freeze_set_owner(q, owner);
         if (++q->mq_freeze_depth == 1) {
                 percpu_ref_kill(&q->q_usage_counter);
                 mutex_unlock(&q->mq_freeze_lock);
@@ -131,6 +181,14 @@ void blk_freeze_queue_start(struct request_queue *q)
         } else {
                 mutex_unlock(&q->mq_freeze_lock);
         }
+
+        return freeze;
+}
+
+void blk_freeze_queue_start(struct request_queue *q)
+{
+        if (__blk_freeze_queue_start(q, current))
+                blk_freeze_acquire_lock(q, false, false);
 }
 EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
 
@@ -149,35 +207,17 @@ int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
 }
 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);
 
-/*
- * Guarantee no request is in use, so we can change any data structure of
- * the queue afterward.
- */
-void blk_freeze_queue(struct request_queue *q)
+void blk_mq_freeze_queue(struct request_queue *q)
 {
-        /*
-         * In the !blk_mq case we are only calling this to kill the
-         * q_usage_counter, otherwise this increases the freeze depth
-         * and waits for it to return to zero.  For this reason there is
-         * no blk_unfreeze_queue(), and blk_freeze_queue() is not
-         * exported to drivers as the only user for unfreeze is blk_mq.
-         */
         blk_freeze_queue_start(q);
         blk_mq_freeze_queue_wait(q);
 }
-
-void blk_mq_freeze_queue(struct request_queue *q)
-{
-        /*
-         * ...just an alias to keep freeze and unfreeze actions balanced
-         * in the blk_mq_* namespace
-         */
-        blk_freeze_queue(q);
-}
 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
 
-void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
+bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
 {
+        bool unfreeze;
+
         mutex_lock(&q->mq_freeze_lock);
         if (force_atomic)
                 q->q_usage_counter.data->force_atomic = true;
@@ -187,16 +227,40 @@ void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
                 percpu_ref_resurrect(&q->q_usage_counter);
                 wake_up_all(&q->mq_freeze_wq);
         }
+        unfreeze = blk_unfreeze_check_owner(q);
         mutex_unlock(&q->mq_freeze_lock);
+
+        return unfreeze;
 }
 
 void blk_mq_unfreeze_queue(struct request_queue *q)
 {
-        __blk_mq_unfreeze_queue(q, false);
+        if (__blk_mq_unfreeze_queue(q, false))
+                blk_unfreeze_release_lock(q, false, false);
 }
 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
 
 /*
+ * non_owner variant of blk_freeze_queue_start
+ *
+ * Unlike blk_freeze_queue_start, the queue doesn't need to be unfrozen
+ * by the same task.  This is fragile and should not be used if at all
+ * possible.
+ */
+void blk_freeze_queue_start_non_owner(struct request_queue *q)
+{
+        __blk_freeze_queue_start(q, NULL);
+}
+EXPORT_SYMBOL_GPL(blk_freeze_queue_start_non_owner);
+
+/* non_owner variant of blk_mq_unfreeze_queue */
+void blk_mq_unfreeze_queue_non_owner(struct request_queue *q)
+{
+        __blk_mq_unfreeze_queue(q, false);
+}
+EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_non_owner);
+
+/*
  * FIXME: replace the scsi_internal_device_*block_nowait() calls in the
  * mpt3sas driver such that this function can be removed.
  */
@@ -283,8 +347,9 @@ void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
                 if (!blk_queue_skip_tagset_quiesce(q))
                         blk_mq_quiesce_queue_nowait(q);
         }
-        blk_mq_wait_quiesce_done(set);
         mutex_unlock(&set->tag_list_lock);
+
+        blk_mq_wait_quiesce_done(set);
 }
 EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset);
 
@@ -331,14 +396,9 @@ EXPORT_SYMBOL(blk_rq_init);
 /* Set start and alloc time when the allocated request is actually used */
 static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns)
 {
-        if (blk_mq_need_time_stamp(rq))
-                rq->start_time_ns = blk_time_get_ns();
-        else
-                rq->start_time_ns = 0;
-
 #ifdef CONFIG_BLK_RQ_ALLOC_TIME
         if (blk_queue_rq_alloc_time(rq->q))
-                rq->alloc_time_ns = alloc_time_ns ?: rq->start_time_ns;
+                rq->alloc_time_ns = alloc_time_ns;
         else
                 rq->alloc_time_ns = 0;
 #endif
@@ -359,8 +419,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 
         if (data->flags & BLK_MQ_REQ_PM)
                 data->rq_flags |= RQF_PM;
-        if (blk_queue_io_stat(q))
-                data->rq_flags |= RQF_IO_STAT;
         rq->rq_flags = data->rq_flags;
 
         if (data->rq_flags & RQF_SCHED_TAGS) {
@@ -420,7 +478,7 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data)
                 prefetch(tags->static_rqs[tag]);
                 tag_mask &= ~(1UL << i);
                 rq = blk_mq_rq_ctx_init(data, tags, tag);
-                rq_list_add(data->cached_rq, rq);
+                rq_list_add_head(data->cached_rqs, rq);
                 nr++;
         }
         if (!(data->rq_flags & RQF_SCHED_TAGS))
@@ -429,7 +487,7 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data)
         percpu_ref_get_many(&data->q->q_usage_counter, nr - 1);
         data->nr_tags -= nr;
 
-        return rq_list_pop(data->cached_rq);
+        return rq_list_pop(data->cached_rqs);
 }
 
 static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
@@ -526,7 +584,7 @@ static struct request *blk_mq_rq_cache_fill(struct request_queue *q,
                 .flags          = flags,
                 .cmd_flags      = opf,
                 .nr_tags        = plug->nr_ios,
-                .cached_rq      = &plug->cached_rq,
+                .cached_rqs     = &plug->cached_rqs,
         };
         struct request *rq;
 
@@ -551,14 +609,14 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
         if (!plug)
                 return NULL;
 
-        if (rq_list_empty(plug->cached_rq)) {
+        if (rq_list_empty(&plug->cached_rqs)) {
                 if (plug->nr_ios == 1)
                         return NULL;
                 rq = blk_mq_rq_cache_fill(q, plug, opf, flags);
                 if (!rq)
                         return NULL;
         } else {
-                rq = rq_list_peek(&plug->cached_rq);
+                rq = rq_list_peek(&plug->cached_rqs);
                 if (!rq || rq->q != q)
                         return NULL;
 
@@ -567,8 +625,8 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
                 if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
                         return NULL;
 
-                plug->cached_rq = rq_list_next(rq);
-                blk_mq_rq_time_init(rq, 0);
+                rq_list_pop(&plug->cached_rqs);
+                blk_mq_rq_time_init(rq, blk_time_get_ns());
         }
 
         rq->cmd_flags = opf;
@@ -744,7 +802,7 @@ void blk_mq_free_plug_rqs(struct blk_plug *plug)
 {
         struct request *rq;
 
-        while ((rq = rq_list_pop(&plug->cached_rq)) != NULL)
+        while ((rq = rq_list_pop(&plug->cached_rqs)) != NULL)
                 blk_mq_free_request(rq);
 }
 
@@ -764,7 +822,7 @@ EXPORT_SYMBOL(blk_dump_rq_flags);
 
 static void blk_account_io_completion(struct request *req, unsigned int bytes)
 {
-        if (req->part && blk_do_io_stat(req)) {
+        if (req->rq_flags & RQF_IO_STAT) {
                 const int sgrp = op_stat_group(req_op(req));
 
                 part_stat_lock();
@@ -784,7 +842,7 @@ static void blk_print_req_error(struct request *req, blk_status_t status)
                 blk_op_str(req_op(req)),
                 (__force u32)(req->cmd_flags & ~REQ_OP_MASK),
                 req->nr_phys_segments,
-                IOPRIO_PRIO_CLASS(req->ioprio));
+                IOPRIO_PRIO_CLASS(req_get_ioprio(req)));
 }
 
 /*
@@ -982,8 +1040,7 @@ static inline void blk_account_io_done(struct request *req, u64 now)
          * normal IO on queueing nor completion.  Accounting the
          * containing request is enough.
          */
-        if (blk_do_io_stat(req) && req->part &&
-            !(req->rq_flags & RQF_FLUSH_SEQ)) {
+        if ((req->rq_flags & (RQF_IO_STAT|RQF_FLUSH_SEQ)) == RQF_IO_STAT) {
                 const int sgrp = op_stat_group(req_op(req));
 
                 part_stat_lock();
@@ -996,28 +1053,63 @@ static inline void blk_account_io_done(struct request *req, u64 now)
         }
 }
 
+static inline bool blk_rq_passthrough_stats(struct request *req)
+{
+        struct bio *bio = req->bio;
+
+        if (!blk_queue_passthrough_stat(req->q))
+                return false;
+
+        /* Requests without a bio do not transfer data. */
+        if (!bio)
+                return false;
+
+        /*
+         * Stats are accumulated in the bdev, so must have one attached to a
+         * bio to track stats. Most drivers do not set the bdev for passthrough
+         * requests, but nvme is one that will set it.
+         */
+        if (!bio->bi_bdev)
+                return false;
+
+        /*
+         * We don't know what a passthrough command does, but we know the
+         * payload size and data direction. Ensuring the size is aligned to the
+         * block size filters out most commands with payloads that don't
+         * represent sector access.
+         */
+        if (blk_rq_bytes(req) & (bdev_logical_block_size(bio->bi_bdev) - 1))
+                return false;
+        return true;
+}
+
 static inline void blk_account_io_start(struct request *req)
 {
         trace_block_io_start(req);
 
-        if (blk_do_io_stat(req)) {
-                /*
-                 * All non-passthrough requests are created from a bio with one
-                 * exception: when a flush command that is part of a flush sequence
-                 * generated by the state machine in blk-flush.c is cloned onto the
-                 * lower device by dm-multipath we can get here without a bio.
-                 */
-                if (req->bio)
-                        req->part = req->bio->bi_bdev;
-                else
-                        req->part = req->q->disk->part0;
+        if (!blk_queue_io_stat(req->q))
+                return;
+        if (blk_rq_is_passthrough(req) && !blk_rq_passthrough_stats(req))
+                return;
 
-                part_stat_lock();
-                update_io_ticks(req->part, jiffies, false);
-                part_stat_local_inc(req->part,
-                                    in_flight[op_is_write(req_op(req))]);
-                part_stat_unlock();
-        }
+        req->rq_flags |= RQF_IO_STAT;
+        req->start_time_ns = blk_time_get_ns();
+
+        /*
+         * All non-passthrough requests are created from a bio with one
+         * exception: when a flush command that is part of a flush sequence
+         * generated by the state machine in blk-flush.c is cloned onto the
+         * lower device by dm-multipath we can get here without a bio.
+         */
+        if (req->bio)
+                req->part = req->bio->bi_bdev;
+        else
+                req->part = req->q->disk->part0;
+
+        part_stat_lock();
+        update_io_ticks(req->part, jiffies, false);
+        part_stat_local_inc(req->part, in_flight[op_is_write(req_op(req))]);
+        part_stat_unlock();
 }
 
 static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
@@ -1300,8 +1392,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
          */
         if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS))
                 plug->has_elevator = true;
-        rq->rq_next = NULL;
-        rq_list_add(&plug->mq_list, rq);
+        rq_list_add_tail(&plug->mq_list, rq);
         plug->rq_count++;
 }
 
@@ -1698,7 +1789,6 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 
         sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
 }
-EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
 
 struct dispatch_rq_data {
         struct blk_mq_hw_ctx *hctx;
@@ -2200,6 +2290,24 @@ void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
 }
 EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
 
+static inline bool blk_mq_hw_queue_need_run(struct blk_mq_hw_ctx *hctx)
+{
+        bool need_run;
+
+        /*
+         * When queue is quiesced, we may be switching io scheduler, or
+         * updating nr_hw_queues, or other things, and we can't run queue
+         * any more, even blk_mq_hctx_has_pending() can't be called safely.
+         *
+         * And queue will be rerun in blk_mq_unquiesce_queue() if it is
+         * quiesced.
+         */
+        __blk_mq_run_dispatch_ops(hctx->queue, false,
+                need_run = !blk_queue_quiesced(hctx->queue) &&
+                blk_mq_hctx_has_pending(hctx));
+        return need_run;
+}
+
 /**
  * blk_mq_run_hw_queue - Start to run a hardware queue.
  * @hctx: Pointer to the hardware queue to run.
@@ -2220,20 +2328,23 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 
         might_sleep_if(!async && hctx->flags & BLK_MQ_F_BLOCKING);
 
-        /*
-         * When queue is quiesced, we may be switching io scheduler, or
-         * updating nr_hw_queues, or other things, and we can't run queue
-         * any more, even __blk_mq_hctx_has_pending() can't be called safely.
-         *
-         * And queue will be rerun in blk_mq_unquiesce_queue() if it is
-         * quiesced.
-         */
-        __blk_mq_run_dispatch_ops(hctx->queue, false,
-                need_run = !blk_queue_quiesced(hctx->queue) &&
-                blk_mq_hctx_has_pending(hctx));
+        need_run = blk_mq_hw_queue_need_run(hctx);
+        if (!need_run) {
+                unsigned long flags;
 
-        if (!need_run)
-                return;
+                /*
+                 * Synchronize with blk_mq_unquiesce_queue(), because we check
+                 * if hw queue is quiesced locklessly above, we need the use
+                 * ->queue_lock to make sure we see the up-to-date status to
+                 * not miss rerunning the hw queue.
+                 */
+                spin_lock_irqsave(&hctx->queue->queue_lock, flags);
+                need_run = blk_mq_hw_queue_need_run(hctx);
+                spin_unlock_irqrestore(&hctx->queue->queue_lock, flags);
+
+                if (!need_run)
+                        return;
+        }
 
         if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) {
                 blk_mq_delay_run_hw_queue(hctx, 0);
@@ -2390,6 +2501,12 @@ void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
                 return;
 
         clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
+        /*
+         * Pairs with the smp_mb() in blk_mq_hctx_stopped() to order the
+         * clearing of BLK_MQ_S_STOPPED above and the checking of dispatch
+         * list in the subsequent routine.
+         */
+        smp_mb__after_atomic();
         blk_mq_run_hw_queue(hctx, async);
 }
 EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);
@@ -2542,7 +2659,6 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
                 rq->cmd_flags |= REQ_FAILFAST_MASK;
 
         rq->__sector = bio->bi_iter.bi_sector;
-        rq->write_hint = bio->bi_write_hint;
         blk_rq_bio_prep(rq, bio, nr_segs);
         if (bio_integrity(bio))
                 rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q,
@@ -2620,6 +2736,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 
         if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) {
                 blk_mq_insert_request(rq, 0);
+                blk_mq_run_hw_queue(hctx, false);
                 return;
         }
 
@@ -2650,6 +2767,7 @@ static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
 
         if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) {
                 blk_mq_insert_request(rq, 0);
+                blk_mq_run_hw_queue(hctx, false);
                 return BLK_STS_OK;
         }
 
@@ -2666,7 +2784,7 @@ static void blk_mq_plug_issue_direct(struct blk_plug *plug)
         blk_status_t ret = BLK_STS_OK;
 
         while ((rq = rq_list_pop(&plug->mq_list))) {
-                bool last = rq_list_empty(plug->mq_list);
+                bool last = rq_list_empty(&plug->mq_list);
 
                 if (hctx != rq->mq_hctx) {
                         if (hctx) {
@@ -2709,8 +2827,7 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
 {
         struct blk_mq_hw_ctx *this_hctx = NULL;
         struct blk_mq_ctx *this_ctx = NULL;
-        struct request *requeue_list = NULL;
-        struct request **requeue_lastp = &requeue_list;
+        struct rq_list requeue_list = {};
         unsigned int depth = 0;
         bool is_passthrough = false;
         LIST_HEAD(list);
@@ -2724,12 +2841,12 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
                         is_passthrough = blk_rq_is_passthrough(rq);
                 } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx ||
                            is_passthrough != blk_rq_is_passthrough(rq)) {
-                        rq_list_add_tail(&requeue_lastp, rq);
+                        rq_list_add_tail(&requeue_list, rq);
                         continue;
                 }
-                list_add(&rq->queuelist, &list);
+                list_add_tail(&rq->queuelist, &list);
                 depth++;
-        } while (!rq_list_empty(plug->mq_list));
+        } while (!rq_list_empty(&plug->mq_list));
 
         plug->mq_list = requeue_list;
         trace_block_unplug(this_hctx->queue, depth, !from_sched);
@@ -2784,19 +2901,19 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
                 if (q->mq_ops->queue_rqs) {
                         blk_mq_run_dispatch_ops(q,
                                 __blk_mq_flush_plug_list(q, plug));
-                        if (rq_list_empty(plug->mq_list))
+                        if (rq_list_empty(&plug->mq_list))
                                 return;
                 }
 
                 blk_mq_run_dispatch_ops(q,
                                 blk_mq_plug_issue_direct(plug));
-                if (rq_list_empty(plug->mq_list))
+                if (rq_list_empty(&plug->mq_list))
                         return;
         }
 
         do {
                 blk_mq_dispatch_plug_list(plug, from_schedule);
-        } while (!rq_list_empty(plug->mq_list));
+        } while (!rq_list_empty(&plug->mq_list));
 }
 
 static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
@@ -2861,7 +2978,7 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
         if (plug) {
                 data.nr_tags = plug->nr_ios;
                 plug->nr_ios = 1;
-                data.cached_rq = &plug->cached_rq;
+                data.cached_rqs = &plug->cached_rqs;
         }
 
         rq = __blk_mq_alloc_requests(&data);
@@ -2884,7 +3001,7 @@ static struct request *blk_mq_peek_cached_request(struct blk_plug *plug,
         if (!plug)
                 return NULL;
 
-        rq = rq_list_peek(&plug->cached_rq);
+        rq = rq_list_peek(&plug->cached_rqs);
         if (!rq || rq->q != q)
                 return NULL;
         if (type != rq->mq_hctx->type &&
@@ -2898,17 +3015,17 @@ static struct request *blk_mq_peek_cached_request(struct blk_plug *plug,
 static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
                 struct bio *bio)
 {
-        WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq);
+        if (rq_list_pop(&plug->cached_rqs) != rq)
+                WARN_ON_ONCE(1);
 
         /*
          * If any qos ->throttle() end up blocking, we will have flushed the
          * plug and hence killed the cached_rq list as well. Pop this entry
          * before we throttle.
          */
-        plug->cached_rq = rq_list_next(rq);
         rq_qos_throttle(rq->q, bio);
 
-        blk_mq_rq_time_init(rq, 0);
+        blk_mq_rq_time_init(rq, blk_time_get_ns());
         rq->cmd_flags = bio->bi_opf;
         INIT_LIST_HEAD(&rq->queuelist);
 }
@@ -3187,8 +3304,6 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
                 rq->special_vec = rq_src->special_vec;
         }
         rq->nr_phys_segments = rq_src->nr_phys_segments;
-        rq->ioprio = rq_src->ioprio;
-        rq->write_hint = rq_src->write_hint;
 
         if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0)
                 goto free_and_out;
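
The owner tracking above is what lets lockdep model q->q_usage_counter as a
lock: freezes and unfreezes are expected to balance within one task, with the
*_non_owner variants reserved for callers (such as ublk recovery) that cannot
satisfy that. A sketch of the usage this assumes; the body comment is
illustrative:

    static void update_queue_state(struct request_queue *q)
    {
            blk_mq_freeze_queue(q);   /* current becomes mq_freeze_owner */
            /* ... modify queue data with no I/O in flight ... */
            blk_mq_unfreeze_queue(q); /* last unfreeze by the owner releases it */
    }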
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 3bd43b10032f..89a20fffa4b1 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -155,7 +155,7 @@ struct blk_mq_alloc_data {
 
         /* allocate multiple requests/tags in one go */
         unsigned int nr_tags;
-        struct request **cached_rq;
+        struct rq_list *cached_rqs;
 
         /* input & output parameter */
         struct blk_mq_ctx *ctx;
@@ -230,6 +230,19 @@ static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data
 
 static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)
 {
+        /* Fast path: hardware queue is not stopped most of the time. */
+        if (likely(!test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
+                return false;
+
+        /*
+         * This barrier is used to order adding of dispatch list before and
+         * the test of BLK_MQ_S_STOPPED below. Pairs with the memory barrier
+         * in blk_mq_start_stopped_hw_queue() so that dispatch code could
+         * either see BLK_MQ_S_STOPPED is cleared or dispatch list is not
+         * empty to avoid missing dispatching requests.
+         */
+        smp_mb();
+
         return test_bit(BLK_MQ_S_STOPPED, &hctx->state);
 }
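
The blk-mq.h fast path above depends on a store/load barrier pairing with
blk_mq_start_stopped_hw_queue(). A condensed sketch of the two sides, not the
real code paths:

    static void submit_side(struct blk_mq_hw_ctx *hctx)
    {
            /* a request was just added to the dispatch list */
            smp_mb();       /* pairs with smp_mb__after_atomic() below */
            if (test_bit(BLK_MQ_S_STOPPED, &hctx->state))
                    return; /* the restart path will re-run the queue */
            /* otherwise dispatch now */
    }

    static void restart_side(struct blk_mq_hw_ctx *hctx)
    {
            clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
            smp_mb__after_atomic(); /* order the clear before the re-check */
            /* re-run the hw queue, which re-checks the dispatch list */
    }

At least one side must observe the other's store, so a request cannot sit on
the dispatch list while both sides conclude there is nothing to do.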
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index 058f92c4f9d5..eb9618cd68ad 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -218,7 +218,6 @@ static int rq_qos_wake_function(struct wait_queue_entry *curr,
                 return -1;
 
         data->got_token = true;
-        smp_wmb();
         wake_up_process(data->task);
         list_del_init_careful(&curr->entry);
         return 1;
@@ -274,10 +273,9 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
                          * which means we now have two. Put our local token
                          * and wake anyone else potentially waiting for one.
                          */
-                        smp_rmb();
                         if (data.got_token)
                                 cleanup_cb(rqw, private_data);
-                        break;
+                        return;
                 }
                 io_schedule();
                 has_sleeper = true;
diff --git a/block/blk-settings.c b/block/blk-settings.c
index a446654ddee5..f1d4dfdc37a7 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -50,7 +50,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
         lim->max_sectors = UINT_MAX;
         lim->max_dev_sectors = UINT_MAX;
         lim->max_write_zeroes_sectors = UINT_MAX;
-        lim->max_zone_append_sectors = UINT_MAX;
+        lim->max_hw_zone_append_sectors = UINT_MAX;
         lim->max_user_discard_sectors = UINT_MAX;
 }
 EXPORT_SYMBOL(blk_set_stacking_limits);
@@ -91,17 +91,16 @@ static int blk_validate_zoned_limits(struct queue_limits *lim)
         if (lim->zone_write_granularity < lim->logical_block_size)
                 lim->zone_write_granularity = lim->logical_block_size;
 
-        if (lim->max_zone_append_sectors) {
-                /*
-                 * The Zone Append size is limited by the maximum I/O size
-                 * and the zone size given that it can't span zones.
-                 */
-                lim->max_zone_append_sectors =
-                        min3(lim->max_hw_sectors,
-                             lim->max_zone_append_sectors,
-                             lim->chunk_sectors);
-        }
-
+        /*
+         * The Zone Append size is limited by the maximum I/O size and the zone
+         * size given that it can't span zones.
+         *
+         * If no max_hw_zone_append_sectors limit is provided, the block layer
+         * will emulated it, else we're also bound by the hardware limit.
+         */
+        lim->max_zone_append_sectors =
+                min_not_zero(lim->max_hw_zone_append_sectors,
+                        min(lim->chunk_sectors, lim->max_hw_sectors));
         return 0;
 }
 
@@ -223,7 +222,7 @@ unsupported:
  * Check that the limits in lim are valid, initialize defaults for unset
  * values, and cap values based on others where needed.
  */
-static int blk_validate_limits(struct queue_limits *lim)
+int blk_validate_limits(struct queue_limits *lim)
 {
         unsigned int max_hw_sectors;
         unsigned int logical_block_sectors;
@@ -366,6 +365,7 @@ static int blk_validate_limits(struct queue_limits *lim)
                 return err;
         return blk_validate_zoned_limits(lim);
 }
+EXPORT_SYMBOL_GPL(blk_validate_limits);
 
 /*
  * Set the default limits for a newly allocated queue.  @lim contains the
@@ -508,10 +508,10 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
         t->features |= (b->features & BLK_FEAT_INHERIT_MASK);
 
         /*
-         * BLK_FEAT_NOWAIT and BLK_FEAT_POLL need to be supported both by the
-         * stacking driver and all underlying devices. The stacking driver sets
-         * the flags before stacking the limits, and this will clear the flags
-         * if any of the underlying devices does not support it.
+         * Some feaures need to be supported both by the stacking driver and all
+         * underlying devices. The stacking driver sets these flags before
+         * stacking the limits, and this will clear the flags if any of the
+         * underlying devices does not support it.
          */
         if (!(b->features & BLK_FEAT_NOWAIT))
                 t->features &= ~BLK_FEAT_NOWAIT;
@@ -527,8 +527,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
         t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
         t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
                                         b->max_write_zeroes_sectors);
-        t->max_zone_append_sectors = min(queue_limits_max_zone_append_sectors(t),
-                                         queue_limits_max_zone_append_sectors(b));
+        t->max_hw_zone_append_sectors = min(t->max_hw_zone_append_sectors,
+                                            b->max_hw_zone_append_sectors);
 
         t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
                                             b->seg_boundary_mask);
@@ -661,7 +661,7 @@ EXPORT_SYMBOL(blk_stack_limits);
 void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev,
                 sector_t offset, const char *pfx)
 {
-        if (blk_stack_limits(t, &bdev_get_queue(bdev)->limits,
+        if (blk_stack_limits(t, bdev_limits(bdev),
                         get_start_sect(bdev) + offset))
                 pr_notice("%s: Warning: Device %pg is misaligned\n",
                         pfx, bdev);
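
In the limit handling above, a zero max_hw_zone_append_sectors means "no
hardware limit, the block layer emulates zone append", so the effective limit
is computed with min_not_zero() semantics. Roughly, as a sketch:

    /* Zero means "unset" and must not win the min(). */
    static unsigned int effective_zone_append(unsigned int hw_limit,
                                              unsigned int chunk_sectors,
                                              unsigned int max_hw_sectors)
    {
            unsigned int bound = chunk_sectors < max_hw_sectors ?
                                 chunk_sectors : max_hw_sectors;

            if (!hw_limit)
                    return bound;   /* emulated: zone/I/O size bound only */
            return hw_limit < bound ? hw_limit : bound;
    }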
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index e85941bec857..d80a202cd170 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -23,14 +23,14 @@
 struct queue_sysfs_entry {
         struct attribute attr;
         ssize_t (*show)(struct gendisk *disk, char *page);
-        int (*load_module)(struct gendisk *disk, const char *page, size_t count);
         ssize_t (*store)(struct gendisk *disk, const char *page, size_t count);
+        void (*load_module)(struct gendisk *disk, const char *page, size_t count);
 };
 
 static ssize_t
 queue_var_show(unsigned long var, char *page)
 {
-        return sprintf(page, "%lu\n", var);
+        return sysfs_emit(page, "%lu\n", var);
 }
 
 static ssize_t
@@ -121,7 +121,7 @@ QUEUE_SYSFS_LIMIT_SHOW(atomic_write_unit_max)
 #define QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(_field)                 \
 static ssize_t queue_##_field##_show(struct gendisk *disk, char *page)  \
 {                                                                       \
-        return sprintf(page, "%llu\n",                                  \
+        return sysfs_emit(page, "%llu\n",                               \
                 (unsigned long long)disk->queue->limits._field <<       \
                         SECTOR_SHIFT);                                  \
 }
@@ -131,6 +131,7 @@ QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_discard_sectors)
 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_write_zeroes_sectors)
 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_max_sectors)
 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_boundary_sectors)
+QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_zone_append_sectors)
 
 #define QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(_field)                    \
 static ssize_t queue_##_field##_show(struct gendisk *disk, char *page)  \
@@ -144,7 +145,7 @@ QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(max_hw_sectors)
 #define QUEUE_SYSFS_SHOW_CONST(_name, _val)                             \
 static ssize_t queue_##_name##_show(struct gendisk *disk, char *page)   \
 {                                                                       \
-        return sprintf(page, "%d\n", _val);                             \
+        return sysfs_emit(page, "%d\n", _val);                          \
 }
 
 /* deprecated fields */
@@ -178,18 +179,6 @@ static ssize_t queue_max_discard_sectors_store(struct gendisk *disk,
         return ret;
 }
 
-/*
- * For zone append queue_max_zone_append_sectors does not just return the
- * underlying queue limits, but actually contains a calculation. Because of
- * that we can't simply use QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES here.
- */
-static ssize_t queue_zone_append_max_show(struct gendisk *disk, char *page)
-{
-        return sprintf(page, "%llu\n",
-                (u64)queue_max_zone_append_sectors(disk->queue) <<
-                        SECTOR_SHIFT);
-}
-
 static ssize_t
 queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count)
 {
@@ -235,7 +224,7 @@ static ssize_t queue_feature_store(struct gendisk *disk, const char *page,
 #define QUEUE_SYSFS_FEATURE(_name, _feature)                            \
 static ssize_t queue_##_name##_show(struct gendisk *disk, char *page)   \
 {                                                                       \
-        return sprintf(page, "%u\n",                                    \
+        return sysfs_emit(page, "%u\n",                                 \
                 !!(disk->queue->limits.features & _feature));           \
 }                                                                       \
 static ssize_t queue_##_name##_store(struct gendisk *disk,              \
@@ -252,7 +241,7 @@ QUEUE_SYSFS_FEATURE(stable_writes, BLK_FEAT_STABLE_WRITES);
 #define QUEUE_SYSFS_FEATURE_SHOW(_name, _feature)                       \
 static ssize_t queue_##_name##_show(struct gendisk *disk, char *page)   \
 {                                                                       \
-        return sprintf(page, "%u\n",                                    \
+        return sysfs_emit(page, "%u\n",                                 \
                 !!(disk->queue->limits.features & _feature));           \
 }
 
@@ -263,8 +252,8 @@ QUEUE_SYSFS_FEATURE_SHOW(dax, BLK_FEAT_DAX);
 static ssize_t queue_zoned_show(struct gendisk *disk, char *page)
 {
         if (blk_queue_is_zoned(disk->queue))
-                return sprintf(page, "host-managed\n");
-        return sprintf(page, "none\n");
+                return sysfs_emit(page, "host-managed\n");
+        return sysfs_emit(page, "none\n");
 }
 
 static ssize_t queue_nr_zones_show(struct gendisk *disk, char *page)
@@ -272,6 +261,34 @@ static ssize_t queue_nr_zones_show(struct gendisk *disk, char *page)
         return queue_var_show(disk_nr_zones(disk), page);
 }
 
+static ssize_t queue_iostats_passthrough_show(struct gendisk *disk, char *page)
+{
+        return queue_var_show(blk_queue_passthrough_stat(disk->queue), page);
+}
+
+static ssize_t queue_iostats_passthrough_store(struct gendisk *disk,
+                                               const char *page, size_t count)
+{
+        struct queue_limits lim;
+        unsigned long ios;
+        ssize_t ret;
+
+        ret = queue_var_store(&ios, page, count);
+        if (ret < 0)
+                return ret;
+
+        lim = queue_limits_start_update(disk->queue);
+        if (ios)
+                lim.flags |= BLK_FLAG_IOSTATS_PASSTHROUGH;
+        else
+                lim.flags &= ~BLK_FLAG_IOSTATS_PASSTHROUGH;
+
+        ret = queue_limits_commit_update(disk->queue, &lim);
+        if (ret)
+                return ret;
+
+        return count;
+}
+
 static ssize_t queue_nomerges_show(struct gendisk *disk, char *page)
 {
         return queue_var_show((blk_queue_nomerges(disk->queue) << 1) |
@@ -349,7 +366,7 @@ static ssize_t queue_poll_store(struct gendisk *disk, const char *page,
 
 static ssize_t queue_io_timeout_show(struct gendisk *disk, char *page)
 {
-        return sprintf(page, "%u\n", jiffies_to_msecs(disk->queue->rq_timeout));
+        return sysfs_emit(page, "%u\n", jiffies_to_msecs(disk->queue->rq_timeout));
 }
 
 static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page,
@@ -370,8 +387,8 @@ static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page,
 static ssize_t queue_wc_show(struct gendisk *disk, char *page)
 {
         if (blk_queue_write_cache(disk->queue))
-                return sprintf(page, "write back\n");
-        return sprintf(page, "write through\n");
+                return sysfs_emit(page, "write back\n");
+        return sysfs_emit(page, "write through\n");
 }
 
 static ssize_t queue_wc_store(struct gendisk *disk, const char *page,
@@ -451,7 +468,7 @@ QUEUE_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes");
 
 QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes");
 QUEUE_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes");
-QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes");
+QUEUE_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes");
 QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity");
 
 QUEUE_RO_ENTRY(queue_zoned, "zoned");
@@ -460,6 +477,7 @@ QUEUE_RO_ENTRY(queue_max_open_zones, "max_open_zones");
 QUEUE_RO_ENTRY(queue_max_active_zones, "max_active_zones");
 
 QUEUE_RW_ENTRY(queue_nomerges, "nomerges");
+QUEUE_RW_ENTRY(queue_iostats_passthrough, "iostats_passthrough");
 QUEUE_RW_ENTRY(queue_rq_affinity, "rq_affinity");
 QUEUE_RW_ENTRY(queue_poll, "io_poll");
 QUEUE_RW_ENTRY(queue_poll_delay, "io_poll_delay");
@@ -501,9 +519,9 @@ static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page)
                 return -EINVAL;
 
         if (wbt_disabled(disk->queue))
-                return sprintf(page, "0\n");
+                return sysfs_emit(page, "0\n");
 
-        return sprintf(page, "%llu\n",
+        return sysfs_emit(page, "%llu\n",
                 div_u64(wbt_get_min_lat(disk->queue), 1000));
 }
 
@@ -578,7 +596,7 @@ static struct attribute *queue_attrs[] = {
         &queue_atomic_write_unit_max_entry.attr,
         &queue_write_same_max_entry.attr,
         &queue_max_write_zeroes_sectors_entry.attr,
-        &queue_zone_append_max_entry.attr,
+        &queue_max_zone_append_sectors_entry.attr,
         &queue_zone_write_granularity_entry.attr,
         &queue_rotational_entry.attr,
         &queue_zoned_entry.attr,
@@ -586,6 +604,7 @@ static struct attribute *queue_attrs[] = {
         &queue_max_open_zones_entry.attr,
         &queue_max_active_zones_entry.attr,
         &queue_nomerges_entry.attr,
+        &queue_iostats_passthrough_entry.attr,
         &queue_iostats_entry.attr,
         &queue_stable_writes_entry.attr,
         &queue_add_random_entry.attr,
@@ -684,11 +703,8 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
          * queue to ensure that the module file can be read when the request
          * queue is the one for the device storing the module file.
          */
-        if (entry->load_module) {
-                res = entry->load_module(disk, page, length);
-                if (res)
-                        return res;
-        }
+        if (entry->load_module)
+                entry->load_module(disk, page, length);
 
         blk_mq_freeze_queue(q);
         mutex_lock(&q->sysfs_lock);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 2c4192e12efa..82dbaefcfa3b 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1485,13 +1485,13 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
                         goto out_finish;
 
                 ret = -EINVAL;
-                if (!strcmp(tok, "rbps") && val > 1)
+                if (!strcmp(tok, "rbps"))
                         v[0] = val;
-                else if (!strcmp(tok, "wbps") && val > 1)
+                else if (!strcmp(tok, "wbps"))
                         v[1] = val;
-                else if (!strcmp(tok, "riops") && val > 1)
+                else if (!strcmp(tok, "riops"))
                         v[2] = min_t(u64, val, UINT_MAX);
-                else if (!strcmp(tok, "wiops") && val > 1)
+                else if (!strcmp(tok, "wiops"))
                         v[3] = min_t(u64, val, UINT_MAX);
                 else
                         goto out_finish;
@@ -1526,6 +1526,42 @@ static void throtl_shutdown_wq(struct request_queue *q)
         cancel_work_sync(&td->dispatch_work);
 }
 
+static void tg_flush_bios(struct throtl_grp *tg)
+{
+        struct throtl_service_queue *sq = &tg->service_queue;
+
+        if (tg->flags & THROTL_TG_CANCELING)
+                return;
+        /*
+         * Set the flag to make sure throtl_pending_timer_fn() won't
+         * stop until all throttled bios are dispatched.
+         */
+        tg->flags |= THROTL_TG_CANCELING;
+
+        /*
+         * Do not dispatch cgroup without THROTL_TG_PENDING or cgroup
+         * will be inserted to service queue without THROTL_TG_PENDING
+         * set in tg_update_disptime below. Then IO dispatched from
+         * child in tg_dispatch_one_bio will trigger double insertion
+         * and corrupt the tree.
+         */
+        if (!(tg->flags & THROTL_TG_PENDING))
+                return;
+
+        /*
+         * Update disptime after setting the above flag to make sure
+         * throtl_select_dispatch() won't exit without dispatching.
+         */
+        tg_update_disptime(tg);
+
+        throtl_schedule_pending_timer(sq, jiffies + 1);
+}
+
+static void throtl_pd_offline(struct blkg_policy_data *pd)
+{
+        tg_flush_bios(pd_to_tg(pd));
+}
+
 struct blkcg_policy blkcg_policy_throtl = {
         .dfl_cftypes            = throtl_files,
         .legacy_cftypes         = throtl_legacy_files,
@@ -1533,6 +1569,7 @@ struct blkcg_policy blkcg_policy_throtl = {
         .pd_alloc_fn            = throtl_pd_alloc,
         .pd_init_fn             = throtl_pd_init,
         .pd_online_fn           = throtl_pd_online,
+        .pd_offline_fn          = throtl_pd_offline,
         .pd_free_fn             = throtl_pd_free,
 };
 
@@ -1553,32 +1590,15 @@ void blk_throtl_cancel_bios(struct gendisk *disk)
          */
         rcu_read_lock();
         blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) {
-                struct throtl_grp *tg = blkg_to_tg(blkg);
-                struct throtl_service_queue *sq = &tg->service_queue;
-
-                /*
-                 * Set the flag to make sure throtl_pending_timer_fn() won't
-                 * stop until all throttled bios are dispatched.
-                 */
-                tg->flags |= THROTL_TG_CANCELING;
-
                 /*
-                 * Do not dispatch cgroup without THROTL_TG_PENDING or cgroup
-                 * will be inserted to service queue without THROTL_TG_PENDING
-                 * set in tg_update_disptime below. Then IO dispatched from
-                 * child in tg_dispatch_one_bio will trigger double insertion
-                 * and corrupt the tree.
+                 * disk_release will call pd_offline_fn to cancel bios.
+                 * However, disk_release can't be called if someone get
+                 * the refcount of device and issued bios which are
+                 * inflight after del_gendisk.
+                 * Cancel bios here to ensure no bios are inflight after
+                 * del_gendisk.
                  */
-                if (!(tg->flags & THROTL_TG_PENDING))
-                        continue;
-
-                /*
-                 * Update disptime after setting the above flag to make sure
-                 * throtl_select_dispatch() won't exit without dispatching.
-                 */
-                tg_update_disptime(tg);
-
-                throtl_schedule_pending_timer(sq, jiffies + 1);
+                tg_flush_bios(blkg_to_tg(blkg));
         }
         rcu_read_unlock();
         spin_unlock_irq(&q->queue_lock);
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index af19296fa50d..70211751df16 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -18,7 +18,7 @@
 #include <linux/vmalloc.h>
 #include <linux/sched/mm.h>
 #include <linux/spinlock.h>
-#include <linux/atomic.h>
+#include <linux/refcount.h>
 #include <linux/mempool.h>
 
 #include "blk.h"
@@ -64,7 +64,7 @@ static const char *const zone_cond_name[] = {
 struct blk_zone_wplug {
         struct hlist_node       node;
         struct list_head        link;
-        atomic_t                ref;
+        refcount_t              ref;
         spinlock_t              lock;
         unsigned int            flags;
         unsigned int            zone_no;
@@ -348,13 +348,6 @@ fail:
         return ret;
 }
 
-static inline bool disk_zone_is_conv(struct gendisk *disk, sector_t sector)
-{
-        if (!disk->conv_zones_bitmap)
-                return false;
-        return test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap);
-}
-
 static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone)
 {
         return zone->start + zone->len >= get_capacity(disk);
@@ -411,7 +404,7 @@ static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
 
         hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) {
                 if (zwplug->zone_no == zno &&
-                    atomic_inc_not_zero(&zwplug->ref)) {
+                    refcount_inc_not_zero(&zwplug->ref)) {
                         rcu_read_unlock();
                         return zwplug;
                 }
@@ -432,7 +425,7 @@ static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
 
 static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
 {
-        if (atomic_dec_and_test(&zwplug->ref)) {
+        if (refcount_dec_and_test(&zwplug->ref)) {
                 WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));
                 WARN_ON_ONCE(!list_empty(&zwplug->link));
                 WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED));
@@ -463,7 +456,7 @@ static inline bool disk_should_remove_zone_wplug(struct gendisk *disk,
          * taken when the plug was allocated and another reference taken by the
          * caller context).
          */
-        if (atomic_read(&zwplug->ref) > 2)
+        if (refcount_read(&zwplug->ref) > 2)
                 return false;
 
         /* We can remove zone write plugs for zones that are empty or full. */
@@ -533,7 +526,7 @@ again:
 
         INIT_HLIST_NODE(&zwplug->node);
         INIT_LIST_HEAD(&zwplug->link);
-        atomic_set(&zwplug->ref, 2);
+        refcount_set(&zwplug->ref, 2);
         spin_lock_init(&zwplug->lock);
         zwplug->flags = 0;
         zwplug->zone_no = zno;
@@ -624,7 +617,7 @@ static inline void disk_zone_wplug_set_error(struct gendisk *disk,
          * finished.
          */
         zwplug->flags |= BLK_ZONE_WPLUG_ERROR;
-        atomic_inc(&zwplug->ref);
+        refcount_inc(&zwplug->ref);
 
         spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
         list_add_tail(&zwplug->link, &disk->zone_wplugs_err_list);
@@ -709,7 +702,7 @@ static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio,
         struct blk_zone_wplug *zwplug;
 
         /* Conventional zones cannot be reset nor finished. */
-        if (disk_zone_is_conv(disk, sector)) {
+        if (!bdev_zone_is_seq(bio->bi_bdev, sector)) {
                 bio_io_error(bio);
                 return true;
         }
@@ -963,7 +956,7 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
         }
 
         /* Conventional zones do not need write plugging. */
-        if (disk_zone_is_conv(disk, sector)) {
+        if (!bdev_zone_is_seq(bio->bi_bdev, sector)) {
                 /* Zone append to conventional zones is not allowed. */
                 if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
                         bio_io_error(bio);
@@ -1099,7 +1092,7 @@ static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk,
          * reference we take here.
          */
         WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
-        atomic_inc(&zwplug->ref);
+        refcount_inc(&zwplug->ref);
         queue_work(disk->zone_wplugs_wq, &zwplug->bio_work);
 }
 
@@ -1444,7 +1437,7 @@ static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
                 while (!hlist_empty(&disk->zone_wplugs_hash[i])) {
                         zwplug = hlist_entry(disk->zone_wplugs_hash[i].first,
                                              struct blk_zone_wplug, node);
-                        atomic_inc(&zwplug->ref);
+                        refcount_inc(&zwplug->ref);
                         disk_remove_zone_wplug(disk, zwplug);
                         disk_put_zone_wplug(zwplug);
                 }
@@ -1455,6 +1448,24 @@ static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
         disk->zone_wplugs_hash_bits = 0;
 }
 
+static unsigned int disk_set_conv_zones_bitmap(struct gendisk *disk,
+                                               unsigned long *bitmap)
+{
+        unsigned int nr_conv_zones = 0;
+        unsigned long flags;
+
+        spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+        if (bitmap)
+                nr_conv_zones = bitmap_weight(bitmap, disk->nr_zones);
+        bitmap = rcu_replace_pointer(disk->conv_zones_bitmap, bitmap,
+                                     lockdep_is_held(&disk->zone_wplugs_lock));
+        spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+
+        kfree_rcu_mightsleep(bitmap);
+
+        return nr_conv_zones;
+}
+
 void disk_free_zone_resources(struct gendisk *disk)
 {
         if (!disk->zone_wplugs_pool)
@@ -1478,8 +1489,7 @@ void disk_free_zone_resources(struct gendisk *disk)
         mempool_destroy(disk->zone_wplugs_pool);
         disk->zone_wplugs_pool = NULL;
 
-        bitmap_free(disk->conv_zones_bitmap);
-        disk->conv_zones_bitmap = NULL;
+        disk_set_conv_zones_bitmap(disk, NULL);
         disk->zone_capacity = 0;
         disk->last_zone_capacity = 0;
         disk->nr_zones = 0;
@@ -1538,17 +1548,15 @@ static int disk_update_zone_resources(struct gendisk *disk,
                                       struct blk_revalidate_zone_args *args)
 {
         struct request_queue *q = disk->queue;
-        unsigned int nr_seq_zones, nr_conv_zones = 0;
+        unsigned int nr_seq_zones, nr_conv_zones;
         unsigned int pool_size;
         struct queue_limits lim;
 
         disk->nr_zones = args->nr_zones;
         disk->zone_capacity = args->zone_capacity;
         disk->last_zone_capacity = args->last_zone_capacity;
-        swap(disk->conv_zones_bitmap, args->conv_zones_bitmap);
-        if (disk->conv_zones_bitmap)
-                nr_conv_zones = bitmap_weight(disk->conv_zones_bitmap,
-                                              disk->nr_zones);
+        nr_conv_zones =
+                disk_set_conv_zones_bitmap(disk, args->conv_zones_bitmap);
         if (nr_conv_zones >= disk->nr_zones) {
                 pr_warn("%s: Invalid number of conventional zones %u / %u\n",
                         disk->disk_name, nr_conv_zones, disk->nr_zones);
@@ -1774,12 +1782,6 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
                 return -ENODEV;
         }
 
-        if (!queue_max_zone_append_sectors(q)) {
-                pr_warn("%s: Invalid 0 maximum zone append limit\n",
-                        disk->disk_name);
-                return -ENODEV;
-        }
-
         /*
          * Ensure that all memory allocations in this context are done as if
          * GFP_NOIO was specified.
@@ -1823,8 +1825,6 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
                 disk_free_zone_resources(disk);
         blk_mq_unfreeze_queue(q);
 
-        kfree(args.conv_zones_bitmap);
-
         return ret;
 }
 EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
@@ -1851,7 +1851,7 @@ int queue_zone_wplugs_show(void *data, struct seq_file *m)
                 spin_lock_irqsave(&zwplug->lock, flags);
                 zwp_zone_no = zwplug->zone_no;
                 zwp_flags = zwplug->flags;
-                zwp_ref = atomic_read(&zwplug->ref);
+                zwp_ref = refcount_read(&zwplug->ref);
                 zwp_wp_offset = zwplug->wp_offset;
                 zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
                 spin_unlock_irqrestore(&zwplug->lock, flags);
disk->zone_capacity = args->zone_capacity; disk->last_zone_capacity = args->last_zone_capacity; - swap(disk->conv_zones_bitmap, args->conv_zones_bitmap); - if (disk->conv_zones_bitmap) - nr_conv_zones = bitmap_weight(disk->conv_zones_bitmap, - disk->nr_zones); + nr_conv_zones = + disk_set_conv_zones_bitmap(disk, args->conv_zones_bitmap); if (nr_conv_zones >= disk->nr_zones) { pr_warn("%s: Invalid number of conventional zones %u / %u\n", disk->disk_name, nr_conv_zones, disk->nr_zones); @@ -1774,12 +1782,6 @@ int blk_revalidate_disk_zones(struct gendisk *disk) return -ENODEV; } - if (!queue_max_zone_append_sectors(q)) { - pr_warn("%s: Invalid 0 maximum zone append limit\n", - disk->disk_name); - return -ENODEV; - } - /* * Ensure that all memory allocations in this context are done as if * GFP_NOIO was specified. @@ -1823,8 +1825,6 @@ int blk_revalidate_disk_zones(struct gendisk *disk) disk_free_zone_resources(disk); blk_mq_unfreeze_queue(q); - kfree(args.conv_zones_bitmap); - return ret; } EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); @@ -1851,7 +1851,7 @@ int queue_zone_wplugs_show(void *data, struct seq_file *m) spin_lock_irqsave(&zwplug->lock, flags); zwp_zone_no = zwplug->zone_no; zwp_flags = zwplug->flags; - zwp_ref = atomic_read(&zwplug->ref); + zwp_ref = refcount_read(&zwplug->ref); zwp_wp_offset = zwplug->wp_offset; zwp_bio_list_size = bio_list_size(&zwplug->bio_list); spin_unlock_irqrestore(&zwplug->lock, flags); diff --git a/block/blk.h b/block/blk.h index c718e4291db0..2c26abf505b8 100644 --- a/block/blk.h +++ b/block/blk.h @@ -4,6 +4,7 @@ #include <linux/bio-integrity.h> #include <linux/blk-crypto.h> +#include <linux/lockdep.h> #include <linux/memblock.h> /* for max_pfn/max_low_pfn */ #include <linux/sched/sysctl.h> #include <linux/timekeeping.h> @@ -34,9 +35,10 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, gfp_t flags); void blk_free_flush_queue(struct blk_flush_queue *q); -void blk_freeze_queue(struct request_queue *q); -void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic); -void blk_queue_start_drain(struct request_queue *q); +bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic); +bool blk_queue_start_drain(struct request_queue *q); +bool __blk_freeze_queue_start(struct request_queue *q, + struct task_struct *owner); int __bio_queue_enter(struct request_queue *q, struct bio *bio); void submit_bio_noacct_nocheck(struct bio *bio); void bio_await_chain(struct bio *bio); @@ -69,8 +71,11 @@ static inline int bio_queue_enter(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - if (blk_try_enter_queue(q, false)) + if (blk_try_enter_queue(q, false)) { + rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_); + rwsem_release(&q->io_lockdep_map, _RET_IP_); return 0; + } return __bio_queue_enter(q, bio); } @@ -405,17 +410,6 @@ void blk_apply_bdi_limits(struct backing_dev_info *bdi, struct queue_limits *lim); int blk_dev_init(void); -/* - * Contribute to IO statistics IFF: - * - * a) it's attached to a gendisk, and - * b) the queue had IO stats enabled when this request was started - */ -static inline bool blk_do_io_stat(struct request *rq) -{ - return (rq->rq_flags & RQF_IO_STAT) && !blk_rq_is_passthrough(rq); -} - void update_io_ticks(struct block_device *part, unsigned long now, bool end); unsigned int part_in_flight(struct block_device *part); @@ -463,11 +457,6 @@ static inline bool bio_zone_write_plugging(struct bio *bio) { return bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING); } -static inline bool 
bio_is_zone_append(struct bio *bio) -{ - return bio_op(bio) == REQ_OP_ZONE_APPEND || - bio_flagged(bio, BIO_EMULATES_ZONE_APPEND); -} void blk_zone_write_plug_bio_merged(struct bio *bio); void blk_zone_write_plug_init_request(struct request *rq); static inline void blk_zone_update_request_bio(struct request *rq, @@ -516,10 +505,6 @@ static inline bool bio_zone_write_plugging(struct bio *bio) { return false; } -static inline bool bio_is_zone_append(struct bio *bio) -{ - return false; -} static inline void blk_zone_write_plug_bio_merged(struct bio *bio) { } @@ -558,6 +543,7 @@ void blk_free_ext_minor(unsigned int minor); #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 #define ADDPART_FLAG_WHOLEDISK 2 +#define ADDPART_FLAG_READONLY 4 int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, sector_t length); int bdev_del_partition(struct gendisk *disk, int partno); @@ -734,4 +720,22 @@ void blk_integrity_verify(struct bio *bio); void blk_integrity_prepare(struct request *rq); void blk_integrity_complete(struct request *rq, unsigned int nr_bytes); +static inline void blk_freeze_acquire_lock(struct request_queue *q, bool + disk_dead, bool queue_dying) +{ + if (!disk_dead) + rwsem_acquire(&q->io_lockdep_map, 0, 1, _RET_IP_); + if (!queue_dying) + rwsem_acquire(&q->q_lockdep_map, 0, 1, _RET_IP_); +} + +static inline void blk_unfreeze_release_lock(struct request_queue *q, bool + disk_dead, bool queue_dying) +{ + if (!queue_dying) + rwsem_release(&q->q_lockdep_map, _RET_IP_); + if (!disk_dead) + rwsem_release(&q->io_lockdep_map, _RET_IP_); +} + #endif /* BLK_INTERNAL_H */ diff --git a/block/elevator.c b/block/elevator.c index 9430cde13d1a..7c3ba80e5ff4 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -598,13 +598,19 @@ void elevator_init_mq(struct request_queue *q) * drain any dispatch activities originated from passthrough * requests, then no need to quiesce queue which may add long boot * latency, especially when lots of disks are involved. + * + * Disk isn't added yet, so verifying queue lock only manually. 
*/ - blk_mq_freeze_queue(q); + blk_freeze_queue_start_non_owner(q); + blk_freeze_acquire_lock(q, true, false); + blk_mq_freeze_queue_wait(q); + blk_mq_cancel_work_sync(q); err = blk_mq_init_sched(q, e); - blk_mq_unfreeze_queue(q); + blk_unfreeze_release_lock(q, true, false); + blk_mq_unfreeze_queue_non_owner(q); if (err) { pr_warn("\"%s\" elevator initialization failed, " @@ -704,15 +710,15 @@ static int elevator_change(struct request_queue *q, const char *elevator_name) return ret; } -int elv_iosched_load_module(struct gendisk *disk, const char *buf, - size_t count) +void elv_iosched_load_module(struct gendisk *disk, const char *buf, + size_t count) { char elevator_name[ELV_NAME_MAX]; struct elevator_type *found; const char *name; if (!elv_support_iosched(disk->queue)) - return -EOPNOTSUPP; + return; strscpy(elevator_name, buf, sizeof(elevator_name)); name = strstrip(elevator_name); @@ -723,8 +729,6 @@ int elv_iosched_load_module(struct gendisk *disk, const char *buf, if (!found) request_module("%s-iosched", name); - - return 0; } ssize_t elv_iosched_store(struct gendisk *disk, const char *buf, diff --git a/block/elevator.h b/block/elevator.h index 2a78544bf201..dbf357ef4fab 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -148,8 +148,8 @@ extern void elv_unregister(struct elevator_type *); * io scheduler sysfs switching */ ssize_t elv_iosched_show(struct gendisk *disk, char *page); -int elv_iosched_load_module(struct gendisk *disk, const char *page, - size_t count); +void elv_iosched_load_module(struct gendisk *disk, const char *page, + size_t count); ssize_t elv_iosched_store(struct gendisk *disk, const char *page, size_t count); extern bool elv_bio_merge_ok(struct request *, struct bio *); diff --git a/block/genhd.c b/block/genhd.c index 1c05dd4c6980..9130e163e191 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -383,16 +383,18 @@ int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode) } /** - * device_add_disk - add disk information to kernel list + * add_disk_fwnode - add disk information to kernel list with fwnode * @parent: parent device for the disk * @disk: per-device partitioning information * @groups: Additional per-device sysfs groups + * @fwnode: attached disk fwnode * * This function registers the partitioning information in @disk - * with the kernel. + * with the kernel. Also attach a fwnode to the disk device. */ -int __must_check device_add_disk(struct device *parent, struct gendisk *disk, - const struct attribute_group **groups) +int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups, + struct fwnode_handle *fwnode) { struct device *ddev = disk_to_dev(disk); @@ -452,6 +454,8 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, ddev->parent = parent; ddev->groups = groups; dev_set_name(ddev, "%s", disk->disk_name); + if (fwnode) + device_set_node(ddev, fwnode); if (!(disk->flags & GENHD_FL_HIDDEN)) ddev->devt = MKDEV(disk->major, disk->first_minor); ret = device_add(ddev); @@ -553,6 +557,22 @@ out_exit_elevator: elevator_exit(disk->queue); return ret; } +EXPORT_SYMBOL_GPL(add_disk_fwnode); + +/** + * device_add_disk - add disk information to kernel list + * @parent: parent device for the disk + * @disk: per-device partitioning information + * @groups: Additional per-device sysfs groups + * + * This function registers the partitioning information in @disk + * with the kernel. 
+ */ +int __must_check device_add_disk(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups) +{ + return add_disk_fwnode(parent, disk, groups, NULL); +} EXPORT_SYMBOL(device_add_disk); static void blk_report_disk_dead(struct gendisk *disk, bool surprise) @@ -581,13 +601,13 @@ static void blk_report_disk_dead(struct gendisk *disk, bool surprise) rcu_read_unlock(); } -static void __blk_mark_disk_dead(struct gendisk *disk) +static bool __blk_mark_disk_dead(struct gendisk *disk) { /* * Fail any new I/O. */ if (test_and_set_bit(GD_DEAD, &disk->state)) - return; + return false; if (test_bit(GD_OWNS_QUEUE, &disk->state)) blk_queue_flag_set(QUEUE_FLAG_DYING, disk->queue); @@ -600,7 +620,7 @@ static void __blk_mark_disk_dead(struct gendisk *disk) /* * Prevent new I/O from crossing bio_queue_enter(). */ - blk_queue_start_drain(disk->queue); + return blk_queue_start_drain(disk->queue); } /** @@ -641,6 +661,7 @@ void del_gendisk(struct gendisk *disk) struct request_queue *q = disk->queue; struct block_device *part; unsigned long idx; + bool start_drain, queue_dying; might_sleep(); @@ -668,7 +689,10 @@ void del_gendisk(struct gendisk *disk) * Drop all partitions now that the disk is marked dead. */ mutex_lock(&disk->open_mutex); - __blk_mark_disk_dead(disk); + start_drain = __blk_mark_disk_dead(disk); + queue_dying = blk_queue_dying(q); + if (start_drain) + blk_freeze_acquire_lock(q, true, queue_dying); xa_for_each_start(&disk->part_tbl, idx, part, 1) drop_partition(part); mutex_unlock(&disk->open_mutex); @@ -725,6 +749,9 @@ void del_gendisk(struct gendisk *disk) if (queue_is_mq(q)) blk_mq_exit_queue(q); } + + if (start_drain) + blk_unfreeze_release_lock(q, true, queue_dying); } EXPORT_SYMBOL(del_gendisk); @@ -756,7 +783,7 @@ static ssize_t disk_badblocks_show(struct device *dev, struct gendisk *disk = dev_to_disk(dev); if (!disk->bb) - return sprintf(page, "\n"); + return sysfs_emit(page, "\n"); return badblocks_show(disk->bb, page, 0); } @@ -904,7 +931,7 @@ static ssize_t disk_range_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", disk->minors); + return sysfs_emit(buf, "%d\n", disk->minors); } static ssize_t disk_ext_range_show(struct device *dev, @@ -912,7 +939,7 @@ static ssize_t disk_ext_range_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", + return sysfs_emit(buf, "%d\n", (disk->flags & GENHD_FL_NO_PART) ? 1 : DISK_MAX_PARTS); } @@ -921,7 +948,7 @@ static ssize_t disk_removable_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", + return sysfs_emit(buf, "%d\n", (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0)); } @@ -930,7 +957,7 @@ static ssize_t disk_hidden_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", + return sysfs_emit(buf, "%d\n", (disk->flags & GENHD_FL_HIDDEN ? 1 : 0)); } @@ -939,13 +966,13 @@ static ssize_t disk_ro_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", get_disk_ro(disk) ? 1 : 0); + return sysfs_emit(buf, "%d\n", get_disk_ro(disk) ? 
1 : 0); } ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%llu\n", bdev_nr_sectors(dev_to_bdev(dev))); + return sysfs_emit(buf, "%llu\n", bdev_nr_sectors(dev_to_bdev(dev))); } ssize_t part_stat_show(struct device *dev, @@ -962,7 +989,7 @@ ssize_t part_stat_show(struct device *dev, part_stat_unlock(); } part_stat_read_all(bdev, &stat); - return sprintf(buf, + return sysfs_emit(buf, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " "%8u %8u %8u " @@ -1004,14 +1031,14 @@ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, else part_in_flight_rw(bdev, inflight); - return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]); + return sysfs_emit(buf, "%8u %8u\n", inflight[0], inflight[1]); } static ssize_t disk_capability_show(struct device *dev, struct device_attribute *attr, char *buf) { dev_warn_once(dev, "the capability attribute has been deprecated.\n"); - return sprintf(buf, "0\n"); + return sysfs_emit(buf, "0\n"); } static ssize_t disk_alignment_offset_show(struct device *dev, @@ -1020,7 +1047,7 @@ static ssize_t disk_alignment_offset_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", bdev_alignment_offset(disk->part0)); + return sysfs_emit(buf, "%d\n", bdev_alignment_offset(disk->part0)); } static ssize_t disk_discard_alignment_show(struct device *dev, @@ -1029,7 +1056,7 @@ static ssize_t disk_discard_alignment_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", bdev_alignment_offset(disk->part0)); + return sysfs_emit(buf, "%d\n", bdev_alignment_offset(disk->part0)); } static ssize_t diskseq_show(struct device *dev, @@ -1037,13 +1064,13 @@ static ssize_t diskseq_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%llu\n", disk->diskseq); + return sysfs_emit(buf, "%llu\n", disk->diskseq); } static ssize_t partscan_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", disk_has_partscan(dev_to_disk(dev))); + return sysfs_emit(buf, "%u\n", disk_has_partscan(dev_to_disk(dev))); } static DEVICE_ATTR(range, 0444, disk_range_show, NULL); @@ -1065,7 +1092,7 @@ static DEVICE_ATTR(partscan, 0444, partscan_show, NULL); ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", + return sysfs_emit(buf, "%d\n", bdev_test_flag(dev_to_bdev(dev), BD_MAKE_IT_FAIL)); } @@ -1264,40 +1291,35 @@ static int diskstats_show(struct seq_file *seqf, void *v) part_stat_unlock(); } part_stat_read_all(hd, &stat); - seq_printf(seqf, "%4d %7d %pg " - "%lu %lu %lu %u " - "%lu %lu %lu %u " - "%u %u %u " - "%lu %lu %lu %u " - "%lu %u" - "\n", - MAJOR(hd->bd_dev), MINOR(hd->bd_dev), hd, - stat.ios[STAT_READ], - stat.merges[STAT_READ], - stat.sectors[STAT_READ], - (unsigned int)div_u64(stat.nsecs[STAT_READ], - NSEC_PER_MSEC), - stat.ios[STAT_WRITE], - stat.merges[STAT_WRITE], - stat.sectors[STAT_WRITE], - (unsigned int)div_u64(stat.nsecs[STAT_WRITE], - NSEC_PER_MSEC), - inflight, - jiffies_to_msecs(stat.io_ticks), - (unsigned int)div_u64(stat.nsecs[STAT_READ] + - stat.nsecs[STAT_WRITE] + - stat.nsecs[STAT_DISCARD] + - stat.nsecs[STAT_FLUSH], - NSEC_PER_MSEC), - stat.ios[STAT_DISCARD], - stat.merges[STAT_DISCARD], - stat.sectors[STAT_DISCARD], - (unsigned int)div_u64(stat.nsecs[STAT_DISCARD], - NSEC_PER_MSEC), - stat.ios[STAT_FLUSH], - (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], - NSEC_PER_MSEC) - ); + 
seq_put_decimal_ull_width(seqf, "", MAJOR(hd->bd_dev), 4); + seq_put_decimal_ull_width(seqf, " ", MINOR(hd->bd_dev), 7); + seq_printf(seqf, " %pg", hd); + seq_put_decimal_ull(seqf, " ", stat.ios[STAT_READ]); + seq_put_decimal_ull(seqf, " ", stat.merges[STAT_READ]); + seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_READ]); + seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_READ], + NSEC_PER_MSEC)); + seq_put_decimal_ull(seqf, " ", stat.ios[STAT_WRITE]); + seq_put_decimal_ull(seqf, " ", stat.merges[STAT_WRITE]); + seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_WRITE]); + seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_WRITE], + NSEC_PER_MSEC)); + seq_put_decimal_ull(seqf, " ", inflight); + seq_put_decimal_ull(seqf, " ", jiffies_to_msecs(stat.io_ticks)); + seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_READ] + + stat.nsecs[STAT_WRITE] + + stat.nsecs[STAT_DISCARD] + + stat.nsecs[STAT_FLUSH], + NSEC_PER_MSEC)); + seq_put_decimal_ull(seqf, " ", stat.ios[STAT_DISCARD]); + seq_put_decimal_ull(seqf, " ", stat.merges[STAT_DISCARD]); + seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_DISCARD]); + seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_DISCARD], + NSEC_PER_MSEC)); + seq_put_decimal_ull(seqf, " ", stat.ios[STAT_FLUSH]); + seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], + NSEC_PER_MSEC)); + seq_putc(seqf, '\n'); } rcu_read_unlock(); diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig index 7aff4eb81c60..ce17e41451af 100644 --- a/block/partitions/Kconfig +++ b/block/partitions/Kconfig @@ -270,4 +270,13 @@ config CMDLINE_PARTITION Say Y here if you want to read the partition table from bootargs. The format for the command line is just like mtdparts. +config OF_PARTITION + bool "Device Tree partition support" if PARTITION_ADVANCED + depends on OF + help + Say Y here if you want to enable support for partition table + defined in Device Tree. (mainly for eMMC) + The format for the device tree node is just like MTD fixed-partition + schema. 
+ endmenu diff --git a/block/partitions/Makefile b/block/partitions/Makefile index a7f05cdb02a8..25d424922c6e 100644 --- a/block/partitions/Makefile +++ b/block/partitions/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_CMDLINE_PARTITION) += cmdline.o obj-$(CONFIG_MAC_PARTITION) += mac.o obj-$(CONFIG_LDM_PARTITION) += ldm.o obj-$(CONFIG_MSDOS_PARTITION) += msdos.o +obj-$(CONFIG_OF_PARTITION) += of.o obj-$(CONFIG_OSF_PARTITION) += osf.o obj-$(CONFIG_SGI_PARTITION) += sgi.o obj-$(CONFIG_SUN_PARTITION) += sun.o diff --git a/block/partitions/check.h b/block/partitions/check.h index 8d70a880c372..e5c1c61eb353 100644 --- a/block/partitions/check.h +++ b/block/partitions/check.h @@ -62,6 +62,7 @@ int karma_partition(struct parsed_partitions *state); int ldm_partition(struct parsed_partitions *state); int mac_partition(struct parsed_partitions *state); int msdos_partition(struct parsed_partitions *state); +int of_partition(struct parsed_partitions *state); int osf_partition(struct parsed_partitions *state); int sgi_partition(struct parsed_partitions *state); int sun_partition(struct parsed_partitions *state); diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c index 152c85df92b2..da3e719d8e51 100644 --- a/block/partitions/cmdline.c +++ b/block/partitions/cmdline.c @@ -237,6 +237,9 @@ static int add_part(int slot, struct cmdline_subpart *subpart, put_partition(state, slot, subpart->from >> 9, subpart->size >> 9); + if (subpart->flags & PF_RDONLY) + state->parts[slot].flags |= ADDPART_FLAG_READONLY; + info = &state->parts[slot].info; strscpy(info->volname, subpart->name, sizeof(info->volname)); diff --git a/block/partitions/core.c b/block/partitions/core.c index 5bd7a603092e..815ed33caa1b 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -43,6 +43,9 @@ static int (*const check_part[])(struct parsed_partitions *) = { #ifdef CONFIG_CMDLINE_PARTITION cmdline_partition, #endif +#ifdef CONFIG_OF_PARTITION + of_partition, /* cmdline have priority to OF */ +#endif #ifdef CONFIG_EFI_PARTITION efi_partition, /* this must come before msdos */ #endif @@ -253,6 +256,8 @@ static int part_uevent(const struct device *dev, struct kobj_uevent_env *env) add_uevent_var(env, "PARTN=%u", bdev_partno(part)); if (part->bd_meta_info && part->bd_meta_info->volname[0]) add_uevent_var(env, "PARTNAME=%s", part->bd_meta_info->volname); + if (part->bd_meta_info && part->bd_meta_info->uuid[0]) + add_uevent_var(env, "PARTUUID=%s", part->bd_meta_info->uuid); return 0; } @@ -373,6 +378,9 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, goto out_del; } + if (flags & ADDPART_FLAG_READONLY) + bdev_set_flag(bdev, BD_READ_ONLY); + /* everything is up and running, commence */ err = xa_insert(&disk->part_tbl, partno, bdev, GFP_KERNEL); if (err) diff --git a/block/partitions/of.c b/block/partitions/of.c new file mode 100644 index 000000000000..4e760fdffb3f --- /dev/null +++ b/block/partitions/of.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/blkdev.h> +#include <linux/major.h> +#include <linux/of.h> +#include <linux/string.h> +#include "check.h" + +static int validate_of_partition(struct device_node *np, int slot) +{ + u64 offset, size; + int len; + + const __be32 *reg = of_get_property(np, "reg", &len); + int a_cells = of_n_addr_cells(np); + int s_cells = of_n_size_cells(np); + + /* Make sure reg len match the expected addr and size cells */ + if (len / sizeof(*reg) != a_cells + s_cells) + return -EINVAL; + + /* Validate offset conversion from bytes to 
sectors */ + offset = of_read_number(reg, a_cells); + if (offset % SECTOR_SIZE) + return -EINVAL; + + /* Validate size conversion from bytes to sectors */ + size = of_read_number(reg + a_cells, s_cells); + if (!size || size % SECTOR_SIZE) + return -EINVAL; + + return 0; +} + +static void add_of_partition(struct parsed_partitions *state, int slot, + struct device_node *np) +{ + struct partition_meta_info *info; + char tmp[sizeof(info->volname) + 4]; + const char *partname; + int len; + + const __be32 *reg = of_get_property(np, "reg", &len); + int a_cells = of_n_addr_cells(np); + int s_cells = of_n_size_cells(np); + + /* Convert bytes to sector size */ + u64 offset = of_read_number(reg, a_cells) / SECTOR_SIZE; + u64 size = of_read_number(reg + a_cells, s_cells) / SECTOR_SIZE; + + put_partition(state, slot, offset, size); + + if (of_property_read_bool(np, "read-only")) + state->parts[slot].flags |= ADDPART_FLAG_READONLY; + + /* + * Follow MTD label logic, search for label property, + * fallback to node name if not found. + */ + info = &state->parts[slot].info; + partname = of_get_property(np, "label", &len); + if (!partname) + partname = of_get_property(np, "name", &len); + strscpy(info->volname, partname, sizeof(info->volname)); + + snprintf(tmp, sizeof(tmp), "(%s)", info->volname); + strlcat(state->pp_buf, tmp, PAGE_SIZE); +} + +int of_partition(struct parsed_partitions *state) +{ + struct device *ddev = disk_to_dev(state->disk); + struct device_node *np; + int slot; + + struct device_node *partitions_np = of_node_get(ddev->of_node); + + if (!partitions_np || + !of_device_is_compatible(partitions_np, "fixed-partitions")) + return 0; + + slot = 1; + /* Validate partition offset and size */ + for_each_child_of_node(partitions_np, np) { + if (validate_of_partition(np, slot)) { + of_node_put(np); + of_node_put(partitions_np); + + return -1; + } + + slot++; + } + + slot = 1; + for_each_child_of_node(partitions_np, np) { + if (slot >= state->limit) { + of_node_put(np); + break; + } + + add_of_partition(state, slot, np); + + slot++; + } + + strlcat(state->pp_buf, "\n", PAGE_SIZE); + + return 1; +} diff --git a/block/sed-opal.c b/block/sed-opal.c index 598fd3e7fcc8..5a28f23f7f22 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -3037,6 +3037,29 @@ static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw) return ret; } +static int opal_set_new_sid_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw) +{ + int ret; + struct opal_key *newkey = &opal_pw->new_user_pw.opal_key; + struct opal_key *oldkey = &opal_pw->session.opal_key; + + const struct opal_step pw_steps[] = { + { start_SIDASP_opal_session, oldkey }, + { set_sid_cpin_pin, newkey }, + { end_opal_session, } + }; + + if (!dev) + return -ENODEV; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, pw_steps, ARRAY_SIZE(pw_steps)); + mutex_unlock(&dev->dev_lock); + + return ret; +} + static int opal_activate_user(struct opal_dev *dev, struct opal_session_info *opal_session) { @@ -3286,6 +3309,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_DISCOVERY: ret = opal_get_discv(dev, p); break; + case IOC_OPAL_SET_SID_PW: + ret = opal_set_new_sid_pw(dev, p); + break; default: break;
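
The sprintf() to sysfs_emit() conversions running through the blk-sysfs.c and genhd.c hunks follow the standing sysfs rule: a show() method writes into a single PAGE_SIZE buffer, and sysfs_emit() enforces that bound (and warns if the pointer is not actually page-aligned sysfs memory), where sprintf() simply trusts the caller. A minimal sketch of a converted attribute, with a made-up name and value:

	#include <linux/device.h>
	#include <linux/sysfs.h>

	static ssize_t example_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
	{
		/* sysfs_emit() truncates instead of overrunning the page */
		return sysfs_emit(buf, "%d\n", 42);
	}
	static DEVICE_ATTR_RO(example);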
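
In the blk-zoned.c hunks, the zone write plug reference count moves from atomic_t to refcount_t. The lookups are functionally unchanged (refcount_inc_not_zero() replaces atomic_inc_not_zero()), but refcount_t saturates and warns on increment-from-zero, overflow, and underflow instead of silently wrapping into a use-after-free. The RCU-lookup-plus-refcount shape, reduced to a sketch with hypothetical types rather than the kernel's actual structures:

	#include <linux/rculist.h>
	#include <linux/refcount.h>
	#include <linux/slab.h>

	struct plug {
		struct hlist_node node;
		struct rcu_head rcu;
		refcount_t ref;
	};

	static struct plug *plug_get(struct hlist_head *head)
	{
		struct plug *p;

		rcu_read_lock();
		hlist_for_each_entry_rcu(p, head, node) {
			/* fails once a concurrent put dropped the count to zero */
			if (refcount_inc_not_zero(&p->ref)) {
				rcu_read_unlock();
				return p;
			}
		}
		rcu_read_unlock();
		return NULL;
	}

	static void plug_put(struct plug *p)
	{
		if (refcount_dec_and_test(&p->ref))
			kfree_rcu(p, rcu);	/* free after a grace period */
	}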
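
disk_set_conv_zones_bitmap() is the textbook RCU publish/retire move: the writer swaps the conv_zones_bitmap pointer under a spinlock with rcu_replace_pointer(), then frees the old bitmap after a grace period, which is what lets the per-bio conventional-zone check (now behind bdev_zone_is_seq()) read the bitmap without taking the lock. The same pattern with invented field names, reader included:

	#include <linux/bitmap.h>
	#include <linux/rcupdate.h>
	#include <linux/spinlock.h>

	struct zinfo {
		spinlock_t lock;
		unsigned int nr_zones;
		unsigned long __rcu *conv_bitmap;
	};

	/* writer: publish @bitmap (may be NULL) and retire the old pointer */
	static unsigned int zinfo_set_bitmap(struct zinfo *z, unsigned long *bitmap)
	{
		unsigned int nr_conv = 0;
		unsigned long flags;

		spin_lock_irqsave(&z->lock, flags);
		if (bitmap)
			nr_conv = bitmap_weight(bitmap, z->nr_zones);
		bitmap = rcu_replace_pointer(z->conv_bitmap, bitmap,
					     lockdep_is_held(&z->lock));
		spin_unlock_irqrestore(&z->lock, flags);

		kfree_rcu_mightsleep(bitmap);	/* freeing NULL is harmless */
		return nr_conv;
	}

	/* reader: lockless bit test inside an RCU read-side section */
	static bool zinfo_zone_is_conv(struct zinfo *z, unsigned int zno)
	{
		unsigned long *bitmap;
		bool conv = false;

		rcu_read_lock();
		bitmap = rcu_dereference(z->conv_bitmap);
		if (bitmap)
			conv = test_bit(zno, bitmap);
		rcu_read_unlock();
		return conv;
	}

Note the single-argument kfree_rcu_mightsleep() waits for the grace period inline, so it is only usable from sleepable context, which the writer path here is.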
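
The blk.h additions (blk_freeze_acquire_lock(), and the rwsem_acquire_read()/rwsem_release() pair in bio_queue_enter()) teach lockdep to model queue freezing as a fictitious rwsem: freezing is a write acquire, and each I/O entry does a momentary read acquire/release. No real lock is taken; the annotations only feed lockdep's dependency graph, so a freeze-while-holding-a-lock vs. I/O-under-the-same-lock deadlock can be flagged without ever reproducing it. The raw mechanism, shown on a toy object with invented names:

	#include <linux/lockdep.h>

	struct frz {
		struct lockdep_map map;
		struct lock_class_key key;
	};

	static void frz_init(struct frz *f)
	{
		lockdep_register_key(&f->key);
		lockdep_init_map(&f->map, "frz_map", &f->key, 0);
	}

	static void frz_freeze(struct frz *f)
	{
		/* "write lock": freezing excludes all I/O */
		rwsem_acquire(&f->map, 0, 0, _RET_IP_);
	}

	static void frz_unfreeze(struct frz *f)
	{
		rwsem_release(&f->map, _RET_IP_);
	}

	static void frz_enter_io(struct frz *f)
	{
		/*
		 * "read lock": acquire and immediately release; that is
		 * enough for lockdep to record the ordering against
		 * whatever other locks the caller currently holds.
		 */
		rwsem_acquire_read(&f->map, 0, 0, _RET_IP_);
		rwsem_release(&f->map, _RET_IP_);
	}

With CONFIG_LOCKDEP disabled these annotations compile away to nothing, which is why the del_gendisk() and elevator_init_mq() hunks can sprinkle them on hot setup/teardown paths for free.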
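
add_disk_fwnode() in the genhd.c hunk is a strict superset of device_add_disk(): the same registration, plus device_set_node() on the disk's struct device before device_add(), so the disk ends up attached to its firmware node; the new block/partitions/of.c relies on exactly that ddev->of_node link. A hypothetical caller in a platform driver might look like:

	/* sketch: a platform driver probe with an already-prepared gendisk */
	err = add_disk_fwnode(&pdev->dev, disk, NULL, dev_fwnode(&pdev->dev));
	if (err)
		goto out_cleanup_disk;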
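
diskstats_show() drops its one large seq_printf() in favour of seq_put_decimal_ull() and seq_put_decimal_ull_width(), which write decimal numbers straight into the seq_file buffer without a pass through format-string parsing; for a file polled as often as /proc/diskstats, the vsnprintf overhead adds up. How the helpers compose, with invented values:

	#include <linux/seq_file.h>

	static int demo_show(struct seq_file *m, void *v)
	{
		seq_put_decimal_ull_width(m, "", 8, 4);	/* like "%4llu" */
		seq_put_decimal_ull(m, " ", 120);	/* delimiter, then value */
		seq_put_decimal_ull(m, " ", 2048);
		seq_putc(m, '\n');			/* yields "   8 120 2048\n" */
		return 0;
	}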
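
The new of.c splits "reg" decoding and checking across two passes over the child nodes. Condensed into one hypothetical helper (not a function from the patch), the arithmetic is: each entry is #address-cells + #size-cells worth of 32-bit cells, decoded big-endian by of_read_number(), and both byte values must be multiples of SECTOR_SIZE before dividing down to 512-byte sectors:

	#include <linux/blkdev.h>
	#include <linux/of.h>

	/* hypothetical: decode one fixed-partition "reg" entry into sectors */
	static int reg_to_sectors(struct device_node *np, u64 *start, u64 *nr)
	{
		int plen;
		const __be32 *reg = of_get_property(np, "reg", &plen);
		int a_cells = of_n_addr_cells(np);	/* inherited from the parent */
		int s_cells = of_n_size_cells(np);
		u64 off, size;

		/* reg must hold exactly one (address, size) pair */
		if (!reg || plen / sizeof(*reg) != a_cells + s_cells)
			return -EINVAL;

		off = of_read_number(reg, a_cells);
		size = of_read_number(reg + a_cells, s_cells);
		if (off % SECTOR_SIZE || !size || size % SECTOR_SIZE)
			return -EINVAL;

		*start = off / SECTOR_SIZE;
		*nr = size / SECTOR_SIZE;
		return 0;
	}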