diff options
Diffstat (limited to 'fs')
191 files changed, 7763 insertions, 4694 deletions
diff --git a/fs/afs/proc.c b/fs/afs/proc.c index bddc5120ed40..24a905b076fd 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -130,8 +130,8 @@ int afs_proc_init(void) if (!proc_afs) goto error_dir; - if (!proc_create("cells", 0, proc_afs, &afs_proc_cells_fops) || - !proc_create("rootcell", 0, proc_afs, &afs_proc_rootcell_fops)) + if (!proc_create("cells", 0644, proc_afs, &afs_proc_cells_fops) || + !proc_create("rootcell", 0644, proc_afs, &afs_proc_rootcell_fops)) goto error_tree; _leave(" = 0"); diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index fc60b31453ee..4f70f383132c 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -114,6 +114,14 @@ void bio_integrity_free(struct bio *bio) } EXPORT_SYMBOL(bio_integrity_free); +static inline unsigned int bip_integrity_vecs(struct bio_integrity_payload *bip) +{ + if (bip->bip_slab == BIO_POOL_NONE) + return BIP_INLINE_VECS; + + return bvec_nr_vecs(bip->bip_slab); +} + /** * bio_integrity_add_page - Attach integrity metadata * @bio: bio to update @@ -129,13 +137,12 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, struct bio_integrity_payload *bip = bio->bi_integrity; struct bio_vec *iv; - if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_slab)) { + if (bip->bip_vcnt >= bip_integrity_vecs(bip)) { printk(KERN_ERR "%s: bip_vec full\n", __func__); return 0; } - iv = bip_vec_idx(bip, bip->bip_vcnt); - BUG_ON(iv == NULL); + iv = bip->bip_vec + bip->bip_vcnt; iv->bv_page = page; iv->bv_len = len; @@ -203,6 +210,12 @@ static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi, return sectors; } +static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, + unsigned int sectors) +{ + return bio_integrity_hw_sectors(bi, sectors) * bi->tuple_size; +} + /** * bio_integrity_tag_size - Retrieve integrity tag space * @bio: bio to inspect @@ -215,13 +228,14 @@ unsigned int bio_integrity_tag_size(struct bio *bio) { struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - 
BUG_ON(bio->bi_size == 0); + BUG_ON(bio->bi_iter.bi_size == 0); - return bi->tag_size * (bio->bi_size / bi->sector_size); + return bi->tag_size * (bio->bi_iter.bi_size / bi->sector_size); } EXPORT_SYMBOL(bio_integrity_tag_size); -int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set) +static int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, + int set) { struct bio_integrity_payload *bip = bio->bi_integrity; struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); @@ -235,9 +249,9 @@ int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set) nr_sectors = bio_integrity_hw_sectors(bi, DIV_ROUND_UP(len, bi->tag_size)); - if (nr_sectors * bi->tuple_size > bip->bip_size) { - printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", - __func__, nr_sectors * bi->tuple_size, bip->bip_size); + if (nr_sectors * bi->tuple_size > bip->bip_iter.bi_size) { + printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", __func__, + nr_sectors * bi->tuple_size, bip->bip_iter.bi_size); return -1; } @@ -299,29 +313,30 @@ static void bio_integrity_generate(struct bio *bio) { struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); struct blk_integrity_exchg bix; - struct bio_vec *bv; - sector_t sector = bio->bi_sector; - unsigned int i, sectors, total; + struct bio_vec bv; + struct bvec_iter iter; + sector_t sector = bio->bi_iter.bi_sector; + unsigned int sectors, total; void *prot_buf = bio->bi_integrity->bip_buf; total = 0; bix.disk_name = bio->bi_bdev->bd_disk->disk_name; bix.sector_size = bi->sector_size; - bio_for_each_segment(bv, bio, i) { - void *kaddr = kmap_atomic(bv->bv_page); - bix.data_buf = kaddr + bv->bv_offset; - bix.data_size = bv->bv_len; + bio_for_each_segment(bv, bio, iter) { + void *kaddr = kmap_atomic(bv.bv_page); + bix.data_buf = kaddr + bv.bv_offset; + bix.data_size = bv.bv_len; bix.prot_buf = prot_buf; bix.sector = sector; bi->generate_fn(&bix); - sectors = bv->bv_len / bi->sector_size; + sectors 
= bv.bv_len / bi->sector_size; sector += sectors; prot_buf += sectors * bi->tuple_size; total += sectors * bi->tuple_size; - BUG_ON(total > bio->bi_integrity->bip_size); + BUG_ON(total > bio->bi_integrity->bip_iter.bi_size); kunmap_atomic(kaddr); } @@ -386,8 +401,8 @@ int bio_integrity_prep(struct bio *bio) bip->bip_owns_buf = 1; bip->bip_buf = buf; - bip->bip_size = len; - bip->bip_sector = bio->bi_sector; + bip->bip_iter.bi_size = len; + bip->bip_iter.bi_sector = bio->bi_iter.bi_sector; /* Map it */ offset = offset_in_page(buf); @@ -442,16 +457,17 @@ static int bio_integrity_verify(struct bio *bio) struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); struct blk_integrity_exchg bix; struct bio_vec *bv; - sector_t sector = bio->bi_integrity->bip_sector; - unsigned int i, sectors, total, ret; + sector_t sector = bio->bi_integrity->bip_iter.bi_sector; + unsigned int sectors, ret = 0; void *prot_buf = bio->bi_integrity->bip_buf; + int i; - ret = total = 0; bix.disk_name = bio->bi_bdev->bd_disk->disk_name; bix.sector_size = bi->sector_size; - bio_for_each_segment(bv, bio, i) { + bio_for_each_segment_all(bv, bio, i) { void *kaddr = kmap_atomic(bv->bv_page); + bix.data_buf = kaddr + bv->bv_offset; bix.data_size = bv->bv_len; bix.prot_buf = prot_buf; @@ -467,8 +483,6 @@ static int bio_integrity_verify(struct bio *bio) sectors = bv->bv_len / bi->sector_size; sector += sectors; prot_buf += sectors * bi->tuple_size; - total += sectors * bi->tuple_size; - BUG_ON(total > bio->bi_integrity->bip_size); kunmap_atomic(kaddr); } @@ -495,7 +509,7 @@ static void bio_integrity_verify_fn(struct work_struct *work) /* Restore original bio completion handler */ bio->bi_end_io = bip->bip_end_io; - bio_endio(bio, error); + bio_endio_nodec(bio, error); } /** @@ -533,56 +547,6 @@ void bio_integrity_endio(struct bio *bio, int error) EXPORT_SYMBOL(bio_integrity_endio); /** - * bio_integrity_mark_head - Advance bip_vec skip bytes - * @bip: Integrity vector to advance - * @skip: Number of 
bytes to advance it - */ -void bio_integrity_mark_head(struct bio_integrity_payload *bip, - unsigned int skip) -{ - struct bio_vec *iv; - unsigned int i; - - bip_for_each_vec(iv, bip, i) { - if (skip == 0) { - bip->bip_idx = i; - return; - } else if (skip >= iv->bv_len) { - skip -= iv->bv_len; - } else { /* skip < iv->bv_len) */ - iv->bv_offset += skip; - iv->bv_len -= skip; - bip->bip_idx = i; - return; - } - } -} - -/** - * bio_integrity_mark_tail - Truncate bip_vec to be len bytes long - * @bip: Integrity vector to truncate - * @len: New length of integrity vector - */ -void bio_integrity_mark_tail(struct bio_integrity_payload *bip, - unsigned int len) -{ - struct bio_vec *iv; - unsigned int i; - - bip_for_each_vec(iv, bip, i) { - if (len == 0) { - bip->bip_vcnt = i; - return; - } else if (len >= iv->bv_len) { - len -= iv->bv_len; - } else { /* len < iv->bv_len) */ - iv->bv_len = len; - len = 0; - } - } -} - -/** * bio_integrity_advance - Advance integrity vector * @bio: bio whose integrity vector to update * @bytes_done: number of data bytes that have been completed @@ -595,13 +559,9 @@ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) { struct bio_integrity_payload *bip = bio->bi_integrity; struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - unsigned int nr_sectors; - - BUG_ON(bip == NULL); - BUG_ON(bi == NULL); + unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9); - nr_sectors = bio_integrity_hw_sectors(bi, bytes_done >> 9); - bio_integrity_mark_head(bip, nr_sectors * bi->tuple_size); + bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes); } EXPORT_SYMBOL(bio_integrity_advance); @@ -621,64 +581,13 @@ void bio_integrity_trim(struct bio *bio, unsigned int offset, { struct bio_integrity_payload *bip = bio->bi_integrity; struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - unsigned int nr_sectors; - - BUG_ON(bip == NULL); - BUG_ON(bi == NULL); - BUG_ON(!bio_flagged(bio, BIO_CLONED)); - nr_sectors = 
bio_integrity_hw_sectors(bi, sectors); - bip->bip_sector = bip->bip_sector + offset; - bio_integrity_mark_head(bip, offset * bi->tuple_size); - bio_integrity_mark_tail(bip, sectors * bi->tuple_size); + bio_integrity_advance(bio, offset << 9); + bip->bip_iter.bi_size = bio_integrity_bytes(bi, sectors); } EXPORT_SYMBOL(bio_integrity_trim); /** - * bio_integrity_split - Split integrity metadata - * @bio: Protected bio - * @bp: Resulting bio_pair - * @sectors: Offset - * - * Description: Splits an integrity page into a bio_pair. - */ -void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors) -{ - struct blk_integrity *bi; - struct bio_integrity_payload *bip = bio->bi_integrity; - unsigned int nr_sectors; - - if (bio_integrity(bio) == 0) - return; - - bi = bdev_get_integrity(bio->bi_bdev); - BUG_ON(bi == NULL); - BUG_ON(bip->bip_vcnt != 1); - - nr_sectors = bio_integrity_hw_sectors(bi, sectors); - - bp->bio1.bi_integrity = &bp->bip1; - bp->bio2.bi_integrity = &bp->bip2; - - bp->iv1 = bip->bip_vec[bip->bip_idx]; - bp->iv2 = bip->bip_vec[bip->bip_idx]; - - bp->bip1.bip_vec = &bp->iv1; - bp->bip2.bip_vec = &bp->iv2; - - bp->iv1.bv_len = sectors * bi->tuple_size; - bp->iv2.bv_offset += sectors * bi->tuple_size; - bp->iv2.bv_len -= sectors * bi->tuple_size; - - bp->bip1.bip_sector = bio->bi_integrity->bip_sector; - bp->bip2.bip_sector = bio->bi_integrity->bip_sector + nr_sectors; - - bp->bip1.bip_vcnt = bp->bip2.bip_vcnt = 1; - bp->bip1.bip_idx = bp->bip2.bip_idx = 0; -} -EXPORT_SYMBOL(bio_integrity_split); - -/** * bio_integrity_clone - Callback for cloning bios with integrity metadata * @bio: New bio * @bio_src: Original bio @@ -702,9 +611,8 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, memcpy(bip->bip_vec, bip_src->bip_vec, bip_src->bip_vcnt * sizeof(struct bio_vec)); - bip->bip_sector = bip_src->bip_sector; bip->bip_vcnt = bip_src->bip_vcnt; - bip->bip_idx = bip_src->bip_idx; + bip->bip_iter = bip_src->bip_iter; return 0; } @@ -38,8 
+38,6 @@ */ #define BIO_INLINE_VECS 4 -static mempool_t *bio_split_pool __read_mostly; - /* * if you change this list, also change bvec_alloc or things will * break badly! cannot be bigger than what you can fit into an @@ -273,6 +271,7 @@ void bio_init(struct bio *bio) { memset(bio, 0, sizeof(*bio)); bio->bi_flags = 1 << BIO_UPTODATE; + atomic_set(&bio->bi_remaining, 1); atomic_set(&bio->bi_cnt, 1); } EXPORT_SYMBOL(bio_init); @@ -295,9 +294,35 @@ void bio_reset(struct bio *bio) memset(bio, 0, BIO_RESET_BYTES); bio->bi_flags = flags|(1 << BIO_UPTODATE); + atomic_set(&bio->bi_remaining, 1); } EXPORT_SYMBOL(bio_reset); +static void bio_chain_endio(struct bio *bio, int error) +{ + bio_endio(bio->bi_private, error); + bio_put(bio); +} + +/** + * bio_chain - chain bio completions + * + * The caller won't have a bi_end_io called when @bio completes - instead, + * @parent's bi_end_io won't be called until both @parent and @bio have + * completed; the chained bio will also be freed when it completes. + * + * The caller must not set bi_private or bi_end_io in @bio. 
+ */ +void bio_chain(struct bio *bio, struct bio *parent) +{ + BUG_ON(bio->bi_private || bio->bi_end_io); + + bio->bi_private = parent; + bio->bi_end_io = bio_chain_endio; + atomic_inc(&parent->bi_remaining); +} +EXPORT_SYMBOL(bio_chain); + static void bio_alloc_rescue(struct work_struct *work) { struct bio_set *bs = container_of(work, struct bio_set, rescue_work); @@ -473,13 +498,13 @@ EXPORT_SYMBOL(bio_alloc_bioset); void zero_fill_bio(struct bio *bio) { unsigned long flags; - struct bio_vec *bv; - int i; + struct bio_vec bv; + struct bvec_iter iter; - bio_for_each_segment(bv, bio, i) { - char *data = bvec_kmap_irq(bv, &flags); - memset(data, 0, bv->bv_len); - flush_dcache_page(bv->bv_page); + bio_for_each_segment(bv, bio, iter) { + char *data = bvec_kmap_irq(&bv, &flags); + memset(data, 0, bv.bv_len); + flush_dcache_page(bv.bv_page); bvec_kunmap_irq(data, &flags); } } @@ -515,51 +540,49 @@ inline int bio_phys_segments(struct request_queue *q, struct bio *bio) EXPORT_SYMBOL(bio_phys_segments); /** - * __bio_clone - clone a bio + * __bio_clone_fast - clone a bio that shares the original bio's biovec * @bio: destination bio * @bio_src: bio to clone * * Clone a &bio. Caller will own the returned bio, but not * the actual data it points to. Reference count of returned * bio will be one. + * + * Caller must ensure that @bio_src is not freed before @bio. 
*/ -void __bio_clone(struct bio *bio, struct bio *bio_src) +void __bio_clone_fast(struct bio *bio, struct bio *bio_src) { - memcpy(bio->bi_io_vec, bio_src->bi_io_vec, - bio_src->bi_max_vecs * sizeof(struct bio_vec)); + BUG_ON(bio->bi_pool && BIO_POOL_IDX(bio) != BIO_POOL_NONE); /* * most users will be overriding ->bi_bdev with a new target, * so we don't set nor calculate new physical/hw segment counts here */ - bio->bi_sector = bio_src->bi_sector; bio->bi_bdev = bio_src->bi_bdev; bio->bi_flags |= 1 << BIO_CLONED; bio->bi_rw = bio_src->bi_rw; - bio->bi_vcnt = bio_src->bi_vcnt; - bio->bi_size = bio_src->bi_size; - bio->bi_idx = bio_src->bi_idx; + bio->bi_iter = bio_src->bi_iter; + bio->bi_io_vec = bio_src->bi_io_vec; } -EXPORT_SYMBOL(__bio_clone); +EXPORT_SYMBOL(__bio_clone_fast); /** - * bio_clone_bioset - clone a bio + * bio_clone_fast - clone a bio that shares the original bio's biovec * @bio: bio to clone * @gfp_mask: allocation priority * @bs: bio_set to allocate from * - * Like __bio_clone, only also allocates the returned bio + * Like __bio_clone_fast, only also allocates the returned bio */ -struct bio *bio_clone_bioset(struct bio *bio, gfp_t gfp_mask, - struct bio_set *bs) +struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs) { struct bio *b; - b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, bs); + b = bio_alloc_bioset(gfp_mask, 0, bs); if (!b) return NULL; - __bio_clone(b, bio); + __bio_clone_fast(b, bio); if (bio_integrity(bio)) { int ret; @@ -574,6 +597,79 @@ struct bio *bio_clone_bioset(struct bio *bio, gfp_t gfp_mask, return b; } +EXPORT_SYMBOL(bio_clone_fast); + +/** + * bio_clone_bioset - clone a bio + * @bio_src: bio to clone + * @gfp_mask: allocation priority + * @bs: bio_set to allocate from + * + * Clone bio. Caller will own the returned bio, but not the actual data it + * points to. Reference count of returned bio will be one. 
+ */ +struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, + struct bio_set *bs) +{ + struct bvec_iter iter; + struct bio_vec bv; + struct bio *bio; + + /* + * Pre immutable biovecs, __bio_clone() used to just do a memcpy from + * bio_src->bi_io_vec to bio->bi_io_vec. + * + * We can't do that anymore, because: + * + * - The point of cloning the biovec is to produce a bio with a biovec + * the caller can modify: bi_idx and bi_bvec_done should be 0. + * + * - The original bio could've had more than BIO_MAX_PAGES biovecs; if + * we tried to clone the whole thing bio_alloc_bioset() would fail. + * But the clone should succeed as long as the number of biovecs we + * actually need to allocate is fewer than BIO_MAX_PAGES. + * + * - Lastly, bi_vcnt should not be looked at or relied upon by code + * that does not own the bio - reason being drivers don't use it for + * iterating over the biovec anymore, so expecting it to be kept up + * to date (i.e. for clones that share the parent biovec) is just + * asking for trouble and would force extra work on + * __bio_clone_fast() anyways. 
+ */ + + bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); + if (!bio) + return NULL; + + bio->bi_bdev = bio_src->bi_bdev; + bio->bi_rw = bio_src->bi_rw; + bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; + bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; + + if (bio->bi_rw & REQ_DISCARD) + goto integrity_clone; + + if (bio->bi_rw & REQ_WRITE_SAME) { + bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; + goto integrity_clone; + } + + bio_for_each_segment(bv, bio_src, iter) + bio->bi_io_vec[bio->bi_vcnt++] = bv; + +integrity_clone: + if (bio_integrity(bio_src)) { + int ret; + + ret = bio_integrity_clone(bio, bio_src, gfp_mask); + if (ret < 0) { + bio_put(bio); + return NULL; + } + } + + return bio; +} EXPORT_SYMBOL(bio_clone_bioset); /** @@ -612,7 +708,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page if (unlikely(bio_flagged(bio, BIO_CLONED))) return 0; - if (((bio->bi_size + len) >> 9) > max_sectors) + if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors) return 0; /* @@ -635,8 +731,9 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page simulate merging updated prev_bvec as new bvec. 
*/ .bi_bdev = bio->bi_bdev, - .bi_sector = bio->bi_sector, - .bi_size = bio->bi_size - prev_bv_len, + .bi_sector = bio->bi_iter.bi_sector, + .bi_size = bio->bi_iter.bi_size - + prev_bv_len, .bi_rw = bio->bi_rw, }; @@ -684,8 +781,8 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page if (q->merge_bvec_fn) { struct bvec_merge_data bvm = { .bi_bdev = bio->bi_bdev, - .bi_sector = bio->bi_sector, - .bi_size = bio->bi_size, + .bi_sector = bio->bi_iter.bi_sector, + .bi_size = bio->bi_iter.bi_size, .bi_rw = bio->bi_rw, }; @@ -708,7 +805,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page bio->bi_vcnt++; bio->bi_phys_segments++; done: - bio->bi_size += len; + bio->bi_iter.bi_size += len; return len; } @@ -807,28 +904,7 @@ void bio_advance(struct bio *bio, unsigned bytes) if (bio_integrity(bio)) bio_integrity_advance(bio, bytes); - bio->bi_sector += bytes >> 9; - bio->bi_size -= bytes; - - if (bio->bi_rw & BIO_NO_ADVANCE_ITER_MASK) - return; - - while (bytes) { - if (unlikely(bio->bi_idx >= bio->bi_vcnt)) { - WARN_ONCE(1, "bio idx %d >= vcnt %d\n", - bio->bi_idx, bio->bi_vcnt); - break; - } - - if (bytes >= bio_iovec(bio)->bv_len) { - bytes -= bio_iovec(bio)->bv_len; - bio->bi_idx++; - } else { - bio_iovec(bio)->bv_len -= bytes; - bio_iovec(bio)->bv_offset += bytes; - bytes = 0; - } - } + bio_advance_iter(bio, &bio->bi_iter, bytes); } EXPORT_SYMBOL(bio_advance); @@ -874,117 +950,80 @@ EXPORT_SYMBOL(bio_alloc_pages); */ void bio_copy_data(struct bio *dst, struct bio *src) { - struct bio_vec *src_bv, *dst_bv; - unsigned src_offset, dst_offset, bytes; + struct bvec_iter src_iter, dst_iter; + struct bio_vec src_bv, dst_bv; void *src_p, *dst_p; + unsigned bytes; - src_bv = bio_iovec(src); - dst_bv = bio_iovec(dst); - - src_offset = src_bv->bv_offset; - dst_offset = dst_bv->bv_offset; + src_iter = src->bi_iter; + dst_iter = dst->bi_iter; while (1) { - if (src_offset == src_bv->bv_offset + src_bv->bv_len) { - src_bv++; 
- if (src_bv == bio_iovec_idx(src, src->bi_vcnt)) { - src = src->bi_next; - if (!src) - break; - - src_bv = bio_iovec(src); - } + if (!src_iter.bi_size) { + src = src->bi_next; + if (!src) + break; - src_offset = src_bv->bv_offset; + src_iter = src->bi_iter; } - if (dst_offset == dst_bv->bv_offset + dst_bv->bv_len) { - dst_bv++; - if (dst_bv == bio_iovec_idx(dst, dst->bi_vcnt)) { - dst = dst->bi_next; - if (!dst) - break; - - dst_bv = bio_iovec(dst); - } + if (!dst_iter.bi_size) { + dst = dst->bi_next; + if (!dst) + break; - dst_offset = dst_bv->bv_offset; + dst_iter = dst->bi_iter; } - bytes = min(dst_bv->bv_offset + dst_bv->bv_len - dst_offset, - src_bv->bv_offset + src_bv->bv_len - src_offset); + src_bv = bio_iter_iovec(src, src_iter); + dst_bv = bio_iter_iovec(dst, dst_iter); - src_p = kmap_atomic(src_bv->bv_page); - dst_p = kmap_atomic(dst_bv->bv_page); + bytes = min(src_bv.bv_len, dst_bv.bv_len); - memcpy(dst_p + dst_offset, - src_p + src_offset, + src_p = kmap_atomic(src_bv.bv_page); + dst_p = kmap_atomic(dst_bv.bv_page); + + memcpy(dst_p + dst_bv.bv_offset, + src_p + src_bv.bv_offset, bytes); kunmap_atomic(dst_p); kunmap_atomic(src_p); - src_offset += bytes; - dst_offset += bytes; + bio_advance_iter(src, &src_iter, bytes); + bio_advance_iter(dst, &dst_iter, bytes); } } EXPORT_SYMBOL(bio_copy_data); struct bio_map_data { - struct bio_vec *iovecs; - struct sg_iovec *sgvecs; int nr_sgvecs; int is_our_pages; + struct sg_iovec sgvecs[]; }; static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, struct sg_iovec *iov, int iov_count, int is_our_pages) { - memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt); memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count); bmd->nr_sgvecs = iov_count; bmd->is_our_pages = is_our_pages; bio->bi_private = bmd; } -static void bio_free_map_data(struct bio_map_data *bmd) -{ - kfree(bmd->iovecs); - kfree(bmd->sgvecs); - kfree(bmd); -} - static struct bio_map_data *bio_alloc_map_data(int 
nr_segs, unsigned int iov_count, gfp_t gfp_mask) { - struct bio_map_data *bmd; - if (iov_count > UIO_MAXIOV) return NULL; - bmd = kmalloc(sizeof(*bmd), gfp_mask); - if (!bmd) - return NULL; - - bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, gfp_mask); - if (!bmd->iovecs) { - kfree(bmd); - return NULL; - } - - bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, gfp_mask); - if (bmd->sgvecs) - return bmd; - - kfree(bmd->iovecs); - kfree(bmd); - return NULL; + return kmalloc(sizeof(struct bio_map_data) + + sizeof(struct sg_iovec) * iov_count, gfp_mask); } -static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, - struct sg_iovec *iov, int iov_count, +static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count, int to_user, int from_user, int do_free_page) { int ret = 0, i; @@ -994,7 +1033,7 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, bio_for_each_segment_all(bvec, bio, i) { char *bv_addr = page_address(bvec->bv_page); - unsigned int bv_len = iovecs[i].bv_len; + unsigned int bv_len = bvec->bv_len; while (bv_len && iov_idx < iov_count) { unsigned int bytes; @@ -1054,14 +1093,14 @@ int bio_uncopy_user(struct bio *bio) * don't copy into a random user address space, just free. 
*/ if (current->mm) - ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, - bmd->nr_sgvecs, bio_data_dir(bio) == READ, + ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs, + bio_data_dir(bio) == READ, 0, bmd->is_our_pages); else if (bmd->is_our_pages) bio_for_each_segment_all(bvec, bio, i) __free_page(bvec->bv_page); } - bio_free_map_data(bmd); + kfree(bmd); bio_put(bio); return ret; } @@ -1175,7 +1214,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, */ if ((!write_to_vm && (!map_data || !map_data->null_mapped)) || (map_data && map_data->from_user)) { - ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 1, 0); + ret = __bio_copy_iov(bio, iov, iov_count, 0, 1, 0); if (ret) goto cleanup; } @@ -1189,7 +1228,7 @@ cleanup: bio_put(bio); out_bmd: - bio_free_map_data(bmd); + kfree(bmd); return ERR_PTR(ret); } @@ -1485,7 +1524,7 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, if (IS_ERR(bio)) return bio; - if (bio->bi_size == len) + if (bio->bi_iter.bi_size == len) return bio; /* @@ -1506,16 +1545,15 @@ static void bio_copy_kern_endio(struct bio *bio, int err) bio_for_each_segment_all(bvec, bio, i) { char *addr = page_address(bvec->bv_page); - int len = bmd->iovecs[i].bv_len; if (read) - memcpy(p, addr, len); + memcpy(p, addr, bvec->bv_len); __free_page(bvec->bv_page); - p += len; + p += bvec->bv_len; } - bio_free_map_data(bmd); + kfree(bmd); bio_put(bio); } @@ -1686,11 +1724,11 @@ void bio_check_pages_dirty(struct bio *bio) #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE void bio_flush_dcache_pages(struct bio *bi) { - int i; - struct bio_vec *bvec; + struct bio_vec bvec; + struct bvec_iter iter; - bio_for_each_segment(bvec, bi, i) - flush_dcache_page(bvec->bv_page); + bio_for_each_segment(bvec, bi, iter) + flush_dcache_page(bvec.bv_page); } EXPORT_SYMBOL(bio_flush_dcache_pages); #endif @@ -1711,96 +1749,86 @@ EXPORT_SYMBOL(bio_flush_dcache_pages); **/ void bio_endio(struct bio *bio, int error) { - if (error) - 
clear_bit(BIO_UPTODATE, &bio->bi_flags); - else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - error = -EIO; + while (bio) { + BUG_ON(atomic_read(&bio->bi_remaining) <= 0); - if (bio->bi_end_io) - bio->bi_end_io(bio, error); -} -EXPORT_SYMBOL(bio_endio); + if (error) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + error = -EIO; -void bio_pair_release(struct bio_pair *bp) -{ - if (atomic_dec_and_test(&bp->cnt)) { - struct bio *master = bp->bio1.bi_private; + if (!atomic_dec_and_test(&bio->bi_remaining)) + return; - bio_endio(master, bp->error); - mempool_free(bp, bp->bio2.bi_private); + /* + * Need to have a real endio function for chained bios, + * otherwise various corner cases will break (like stacking + * block devices that save/restore bi_end_io) - however, we want + * to avoid unbounded recursion and blowing the stack. Tail call + * optimization would handle this, but compiling with frame + * pointers also disables gcc's sibling call optimization. + */ + if (bio->bi_end_io == bio_chain_endio) { + struct bio *parent = bio->bi_private; + bio_put(bio); + bio = parent; + } else { + if (bio->bi_end_io) + bio->bi_end_io(bio, error); + bio = NULL; + } } } -EXPORT_SYMBOL(bio_pair_release); - -static void bio_pair_end_1(struct bio *bi, int err) -{ - struct bio_pair *bp = container_of(bi, struct bio_pair, bio1); - - if (err) - bp->error = err; - - bio_pair_release(bp); -} +EXPORT_SYMBOL(bio_endio); -static void bio_pair_end_2(struct bio *bi, int err) +/** + * bio_endio_nodec - end I/O on a bio, without decrementing bi_remaining + * @bio: bio + * @error: error, if any + * + * For code that has saved and restored bi_end_io; thing hard before using this + * function, probably you should've cloned the entire bio. 
+ **/ +void bio_endio_nodec(struct bio *bio, int error) { - struct bio_pair *bp = container_of(bi, struct bio_pair, bio2); - - if (err) - bp->error = err; - - bio_pair_release(bp); + atomic_inc(&bio->bi_remaining); + bio_endio(bio, error); } +EXPORT_SYMBOL(bio_endio_nodec); -/* - * split a bio - only worry about a bio with a single page in its iovec +/** + * bio_split - split a bio + * @bio: bio to split + * @sectors: number of sectors to split from the front of @bio + * @gfp: gfp mask + * @bs: bio set to allocate from + * + * Allocates and returns a new bio which represents @sectors from the start of + * @bio, and updates @bio to represent the remaining sectors. + * + * The newly allocated bio will point to @bio's bi_io_vec; it is the caller's + * responsibility to ensure that @bio is not freed before the split. */ -struct bio_pair *bio_split(struct bio *bi, int first_sectors) +struct bio *bio_split(struct bio *bio, int sectors, + gfp_t gfp, struct bio_set *bs) { - struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO); - - if (!bp) - return bp; - - trace_block_split(bdev_get_queue(bi->bi_bdev), bi, - bi->bi_sector + first_sectors); - - BUG_ON(bio_segments(bi) > 1); - atomic_set(&bp->cnt, 3); - bp->error = 0; - bp->bio1 = *bi; - bp->bio2 = *bi; - bp->bio2.bi_sector += first_sectors; - bp->bio2.bi_size -= first_sectors << 9; - bp->bio1.bi_size = first_sectors << 9; - - if (bi->bi_vcnt != 0) { - bp->bv1 = *bio_iovec(bi); - bp->bv2 = *bio_iovec(bi); - - if (bio_is_rw(bi)) { - bp->bv2.bv_offset += first_sectors << 9; - bp->bv2.bv_len -= first_sectors << 9; - bp->bv1.bv_len = first_sectors << 9; - } + struct bio *split = NULL; - bp->bio1.bi_io_vec = &bp->bv1; - bp->bio2.bi_io_vec = &bp->bv2; + BUG_ON(sectors <= 0); + BUG_ON(sectors >= bio_sectors(bio)); - bp->bio1.bi_max_vecs = 1; - bp->bio2.bi_max_vecs = 1; - } + split = bio_clone_fast(bio, gfp, bs); + if (!split) + return NULL; - bp->bio1.bi_end_io = bio_pair_end_1; - bp->bio2.bi_end_io = bio_pair_end_2; + 
split->bi_iter.bi_size = sectors << 9; - bp->bio1.bi_private = bi; - bp->bio2.bi_private = bio_split_pool; + if (bio_integrity(split)) + bio_integrity_trim(split, 0, sectors); - if (bio_integrity(bi)) - bio_integrity_split(bi, bp, first_sectors); + bio_advance(bio, split->bi_iter.bi_size); - return bp; + return split; } EXPORT_SYMBOL(bio_split); @@ -1814,80 +1842,20 @@ void bio_trim(struct bio *bio, int offset, int size) { /* 'bio' is a cloned bio which we need to trim to match * the given offset and size. - * This requires adjusting bi_sector, bi_size, and bi_io_vec */ - int i; - struct bio_vec *bvec; - int sofar = 0; size <<= 9; - if (offset == 0 && size == bio->bi_size) + if (offset == 0 && size == bio->bi_iter.bi_size) return; clear_bit(BIO_SEG_VALID, &bio->bi_flags); bio_advance(bio, offset << 9); - bio->bi_size = size; - - /* avoid any complications with bi_idx being non-zero*/ - if (bio->bi_idx) { - memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx, - (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec)); - bio->bi_vcnt -= bio->bi_idx; - bio->bi_idx = 0; - } - /* Make sure vcnt and last bv are not too big */ - bio_for_each_segment(bvec, bio, i) { - if (sofar + bvec->bv_len > size) - bvec->bv_len = size - sofar; - if (bvec->bv_len == 0) { - bio->bi_vcnt = i; - break; - } - sofar += bvec->bv_len; - } + bio->bi_iter.bi_size = size; } EXPORT_SYMBOL_GPL(bio_trim); -/** - * bio_sector_offset - Find hardware sector offset in bio - * @bio: bio to inspect - * @index: bio_vec index - * @offset: offset in bv_page - * - * Return the number of hardware sectors between beginning of bio - * and an end point indicated by a bio_vec index and an offset - * within that vector's page. 
- */ -sector_t bio_sector_offset(struct bio *bio, unsigned short index, - unsigned int offset) -{ - unsigned int sector_sz; - struct bio_vec *bv; - sector_t sectors; - int i; - - sector_sz = queue_logical_block_size(bio->bi_bdev->bd_disk->queue); - sectors = 0; - - if (index >= bio->bi_idx) - index = bio->bi_vcnt - 1; - - bio_for_each_segment_all(bv, bio, i) { - if (i == index) { - if (offset > bv->bv_offset) - sectors += (offset - bv->bv_offset) / sector_sz; - break; - } - - sectors += bv->bv_len / sector_sz; - } - - return sectors; -} -EXPORT_SYMBOL(bio_sector_offset); - /* * create memory pools for biovec's in a bio_set. * use the global biovec slabs created for general use. @@ -2065,11 +2033,6 @@ static int __init init_bio(void) if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE)) panic("bio: can't create integrity pool\n"); - bio_split_pool = mempool_create_kmalloc_pool(BIO_SPLIT_ENTRIES, - sizeof(struct bio_pair)); - if (!bio_split_pool) - panic("bio: can't create split pool\n"); - return 0; } subsys_initcall(init_bio); diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index aa976eced2d2..a66768ebc8d1 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -1,6 +1,7 @@ config BTRFS_FS tristate "Btrfs filesystem support" - select LIBCRC32C + select CRYPTO + select CRYPTO_CRC32C select ZLIB_INFLATE select ZLIB_DEFLATE select LZO_COMPRESS diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 1a44e42d602a..f341a98031d2 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -9,7 +9,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ - uuid-tree.o + uuid-tree.o props.o hash.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 
3775947429b2..aded3ef3d3d4 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -66,6 +66,16 @@ static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb, return 0; } +static void free_inode_elem_list(struct extent_inode_elem *eie) +{ + struct extent_inode_elem *eie_next; + + for (; eie; eie = eie_next) { + eie_next = eie->next; + kfree(eie); + } +} + static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte, u64 extent_item_pos, struct extent_inode_elem **eie) @@ -209,18 +219,19 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id, } static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, - struct ulist *parents, int level, - struct btrfs_key *key_for_search, u64 time_seq, - u64 wanted_disk_byte, - const u64 *extent_item_pos) + struct ulist *parents, struct __prelim_ref *ref, + int level, u64 time_seq, const u64 *extent_item_pos) { int ret = 0; int slot; struct extent_buffer *eb; struct btrfs_key key; + struct btrfs_key *key_for_search = &ref->key_for_search; struct btrfs_file_extent_item *fi; struct extent_inode_elem *eie = NULL, *old = NULL; u64 disk_byte; + u64 wanted_disk_byte = ref->wanted_disk_byte; + u64 count = 0; if (level != 0) { eb = path->nodes[level]; @@ -238,7 +249,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) ret = btrfs_next_old_leaf(root, path, time_seq); - while (!ret) { + while (!ret && count < ref->count) { eb = path->nodes[0]; slot = path->slots[0]; @@ -254,6 +265,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (disk_byte == wanted_disk_byte) { eie = NULL; old = NULL; + count++; if (extent_item_pos) { ret = check_extent_in_eb(&key, eb, fi, *extent_item_pos, @@ -273,6 +285,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, old = old->next; old->next = eie; } + eie = NULL; } next: ret = 
btrfs_next_old_item(root, path, time_seq); @@ -280,6 +293,8 @@ next: if (ret > 0) ret = 0; + else if (ret < 0) + free_inode_elem_list(eie); return ret; } @@ -299,23 +314,34 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, int ret = 0; int root_level; int level = ref->level; + int index; root_key.objectid = ref->root_id; root_key.type = BTRFS_ROOT_ITEM_KEY; root_key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + root = btrfs_read_fs_root_no_name(fs_info, &root_key); if (IS_ERR(root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); ret = PTR_ERR(root); goto out; } root_level = btrfs_old_root_level(root, time_seq); - if (root_level + 1 == level) + if (root_level + 1 == level) { + srcu_read_unlock(&fs_info->subvol_srcu, index); goto out; + } path->lowest_level = level; ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq); + + /* root node has been locked, we can release @subvol_srcu safely here */ + srcu_read_unlock(&fs_info->subvol_srcu, index); + pr_debug("search slot in root %llu (level %d, ref count %d) returned " "%d for key (%llu %u %llu)\n", ref->root_id, level, ref->count, ret, @@ -334,9 +360,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, eb = path->nodes[level]; } - ret = add_all_parents(root, path, parents, level, &ref->key_for_search, - time_seq, ref->wanted_disk_byte, - extent_item_pos); + ret = add_all_parents(root, path, parents, ref, level, time_seq, + extent_item_pos); out: path->lowest_level = 0; btrfs_release_path(path); @@ -376,10 +401,16 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, continue; err = __resolve_indirect_ref(fs_info, path, time_seq, ref, parents, extent_item_pos); - if (err == -ENOMEM) - goto out; - if (err) + /* + * we can only tolerate ENOENT,otherwise,we should catch error + * and return directly. 
+ */ + if (err == -ENOENT) { continue; + } else if (err) { + ret = err; + goto out; + } /* we put the first parent into the ref at hand */ ULIST_ITER_INIT(&uiter); @@ -538,14 +569,13 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, if (extent_op && extent_op->update_key) btrfs_disk_key_to_cpu(&op_key, &extent_op->key); - while ((n = rb_prev(n))) { + spin_lock(&head->lock); + n = rb_first(&head->ref_root); + while (n) { struct btrfs_delayed_ref_node *node; node = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); - if (node->bytenr != head->node.bytenr) - break; - WARN_ON(node->is_head); - + n = rb_next(n); if (node->seq > seq) continue; @@ -612,10 +642,10 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, WARN_ON(1); } if (ret) - return ret; + break; } - - return 0; + spin_unlock(&head->lock); + return ret; } /* @@ -828,6 +858,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct list_head prefs_delayed; struct list_head prefs; struct __prelim_ref *ref; + struct extent_inode_elem *eie = NULL; INIT_LIST_HEAD(&prefs); INIT_LIST_HEAD(&prefs_delayed); @@ -882,15 +913,15 @@ again: btrfs_put_delayed_ref(&head->node); goto again; } + spin_unlock(&delayed_refs->lock); ret = __add_delayed_refs(head, time_seq, &prefs_delayed); mutex_unlock(&head->mutex); - if (ret) { - spin_unlock(&delayed_refs->lock); + if (ret) goto out; - } + } else { + spin_unlock(&delayed_refs->lock); } - spin_unlock(&delayed_refs->lock); } if (path->slots[0]) { @@ -941,7 +972,6 @@ again: goto out; } if (ref->count && ref->parent) { - struct extent_inode_elem *eie = NULL; if (extent_item_pos && !ref->inode_list) { u32 bsz; struct extent_buffer *eb; @@ -976,6 +1006,7 @@ again: eie = eie->next; eie->next = ref->inode_list; } + eie = NULL; } list_del(&ref->list); kmem_cache_free(btrfs_prelim_ref_cache, ref); @@ -994,7 +1025,8 @@ out: list_del(&ref->list); kmem_cache_free(btrfs_prelim_ref_cache, ref); } - + if (ret < 0) + 
free_inode_elem_list(eie); return ret; } @@ -1002,7 +1034,6 @@ static void free_leaf_list(struct ulist *blocks) { struct ulist_node *node = NULL; struct extent_inode_elem *eie; - struct extent_inode_elem *eie_next; struct ulist_iterator uiter; ULIST_ITER_INIT(&uiter); @@ -1010,10 +1041,7 @@ static void free_leaf_list(struct ulist *blocks) if (!node->aux) continue; eie = (struct extent_inode_elem *)(uintptr_t)node->aux; - for (; eie; eie = eie_next) { - eie_next = eie->next; - kfree(eie); - } + free_inode_elem_list(eie); node->aux = 0; } @@ -1101,44 +1129,13 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, if (!node) break; bytenr = node->val; + cond_resched(); } ulist_free(tmp); return 0; } - -static int __inode_info(u64 inum, u64 ioff, u8 key_type, - struct btrfs_root *fs_root, struct btrfs_path *path, - struct btrfs_key *found_key) -{ - int ret; - struct btrfs_key key; - struct extent_buffer *eb; - - key.type = key_type; - key.objectid = inum; - key.offset = ioff; - - ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); - if (ret < 0) - return ret; - - eb = path->nodes[0]; - if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(fs_root, path); - if (ret) - return ret; - eb = path->nodes[0]; - } - - btrfs_item_key_to_cpu(eb, found_key, path->slots[0]); - if (found_key->type != key.type || found_key->objectid != key.objectid) - return 1; - - return 0; -} - /* * this makes the path point to (inum INODE_ITEM ioff) */ @@ -1146,16 +1143,16 @@ int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, struct btrfs_path *path) { struct btrfs_key key; - return __inode_info(inum, ioff, BTRFS_INODE_ITEM_KEY, fs_root, path, - &key); + return btrfs_find_item(fs_root, path, inum, ioff, + BTRFS_INODE_ITEM_KEY, &key); } static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, struct btrfs_path *path, struct btrfs_key *found_key) { - return __inode_info(inum, ioff, BTRFS_INODE_REF_KEY, fs_root, path, - 
found_key); + return btrfs_find_item(fs_root, path, inum, ioff, + BTRFS_INODE_REF_KEY, found_key); } int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, @@ -1335,20 +1332,45 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); if (ret < 0) return ret; - ret = btrfs_previous_item(fs_info->extent_root, path, - 0, BTRFS_EXTENT_ITEM_KEY); - if (ret < 0) - return ret; - btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); + while (1) { + u32 nritems; + if (path->slots[0] == 0) { + btrfs_set_path_blocking(path); + ret = btrfs_prev_leaf(fs_info->extent_root, path); + if (ret != 0) { + if (ret > 0) { + pr_debug("logical %llu is not within " + "any extent\n", logical); + ret = -ENOENT; + } + return ret; + } + } else { + path->slots[0]--; + } + nritems = btrfs_header_nritems(path->nodes[0]); + if (nritems == 0) { + pr_debug("logical %llu is not within any extent\n", + logical); + return -ENOENT; + } + if (path->slots[0] == nritems) + path->slots[0]--; + + btrfs_item_key_to_cpu(path->nodes[0], found_key, + path->slots[0]); + if (found_key->type == BTRFS_EXTENT_ITEM_KEY || + found_key->type == BTRFS_METADATA_ITEM_KEY) + break; + } + if (found_key->type == BTRFS_METADATA_ITEM_KEY) size = fs_info->extent_root->leafsize; else if (found_key->type == BTRFS_EXTENT_ITEM_KEY) size = found_key->offset; - if ((found_key->type != BTRFS_EXTENT_ITEM_KEY && - found_key->type != BTRFS_METADATA_ITEM_KEY) || - found_key->objectid > logical || + if (found_key->objectid > logical || found_key->objectid + size <= logical) { pr_debug("logical %llu is not within any extent\n", logical); return -ENOENT; @@ -1601,7 +1623,6 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, struct btrfs_key found_key; while (!ret) { - path->leave_spinning = 1; ret = inode_ref_info(inum, parent ? 
parent+1 : 0, fs_root, path, &found_key); if (ret < 0) @@ -1614,9 +1635,12 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, parent = found_key.offset; slot = path->slots[0]; - eb = path->nodes[0]; - /* make sure we can use eb after releasing the path */ - atomic_inc(&eb->refs); + eb = btrfs_clone_extent_buffer(path->nodes[0]); + if (!eb) { + ret = -ENOMEM; + break; + } + extent_buffer_get(eb); btrfs_tree_read_lock(eb); btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); btrfs_release_path(path); @@ -1674,17 +1698,20 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, ++found; slot = path->slots[0]; - eb = path->nodes[0]; - /* make sure we can use eb after releasing the path */ - atomic_inc(&eb->refs); + eb = btrfs_clone_extent_buffer(path->nodes[0]); + if (!eb) { + ret = -ENOMEM; + break; + } + extent_buffer_get(eb); btrfs_tree_read_lock(eb); btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); btrfs_release_path(path); leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + item_size = btrfs_item_size_nr(leaf, slot); + ptr = btrfs_item_ptr_offset(leaf, slot); cur_offset = 0; while (cur_offset < item_size) { diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index ac0b39db27d1..8fed2125689e 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -43,6 +43,7 @@ #define BTRFS_INODE_COPY_EVERYTHING 8 #define BTRFS_INODE_IN_DELALLOC_LIST 9 #define BTRFS_INODE_READDIO_NEED_LOCK 10 +#define BTRFS_INODE_HAS_PROPS 11 /* in memory btrfs inode */ struct btrfs_inode { @@ -135,6 +136,9 @@ struct btrfs_inode { */ u64 index_cnt; + /* Cache the directory index number to speed the dir/file remove */ + u64 dir_index; + /* the fsync log has some corner cases that mean we have to check * directories to see if any unlinks have been done before * the directory was logged. 
See tree-log.c for all the diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 131d82800b3a..0e8388e72d8d 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -92,11 +92,11 @@ #include <linux/slab.h> #include <linux/buffer_head.h> #include <linux/mutex.h> -#include <linux/crc32c.h> #include <linux/genhd.h> #include <linux/blkdev.h> #include "ctree.h" #include "disk-io.h" +#include "hash.h" #include "transaction.h" #include "extent_io.h" #include "volumes.h" @@ -1456,10 +1456,14 @@ static int btrfsic_handle_extent_data( btrfsic_read_from_block_data(block_ctx, &file_extent_item, file_extent_item_offset, sizeof(struct btrfs_file_extent_item)); - next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item) + - btrfs_stack_file_extent_offset(&file_extent_item); - generation = btrfs_stack_file_extent_generation(&file_extent_item); - num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item); + next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item); + if (btrfs_stack_file_extent_compression(&file_extent_item) == + BTRFS_COMPRESS_NONE) { + next_bytenr += btrfs_stack_file_extent_offset(&file_extent_item); + num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item); + } else { + num_bytes = btrfs_stack_file_extent_disk_num_bytes(&file_extent_item); + } generation = btrfs_stack_file_extent_generation(&file_extent_item); if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) @@ -1695,7 +1699,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, return -1; } bio->bi_bdev = block_ctx->dev->bdev; - bio->bi_sector = dev_bytenr >> 9; + bio->bi_iter.bi_sector = dev_bytenr >> 9; for (j = i; j < num_pages; j++) { ret = bio_add_page(bio, block_ctx->pagev[j], @@ -1819,7 +1823,7 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state, size_t sublen = i ? 
PAGE_CACHE_SIZE : (PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE); - crc = crc32c(crc, data, sublen); + crc = btrfs_crc32c(crc, data, sublen); } btrfs_csum_final(crc, csum); if (memcmp(csum, h->csum, state->csum_size)) @@ -3013,7 +3017,7 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio) int bio_is_patched; char **mapped_datav; - dev_bytenr = 512 * bio->bi_sector; + dev_bytenr = 512 * bio->bi_iter.bi_sector; bio_is_patched = 0; if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) @@ -3021,8 +3025,8 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio) "submit_bio(rw=0x%x, bi_vcnt=%u," " bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", rw, bio->bi_vcnt, - (unsigned long long)bio->bi_sector, dev_bytenr, - bio->bi_bdev); + (unsigned long long)bio->bi_iter.bi_sector, + dev_bytenr, bio->bi_bdev); mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt, GFP_NOFS); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 1499b27b4186..b01fb6c527e3 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -128,11 +128,10 @@ static int check_compressed_csum(struct inode *inode, kunmap_atomic(kaddr); if (csum != *cb_sum) { - printk(KERN_INFO "btrfs csum failed ino %llu " - "extent %llu csum %u " - "wanted %u mirror %d\n", - btrfs_ino(inode), disk_start, csum, *cb_sum, - cb->mirror_num); + btrfs_info(BTRFS_I(inode)->root->fs_info, + "csum failed ino %llu extent %llu csum %u wanted %u mirror %d", + btrfs_ino(inode), disk_start, csum, *cb_sum, + cb->mirror_num); ret = -EIO; goto fail; } @@ -172,7 +171,8 @@ static void end_compressed_bio_read(struct bio *bio, int err) goto out; inode = cb->inode; - ret = check_compressed_csum(inode, cb, (u64)bio->bi_sector << 9); + ret = check_compressed_csum(inode, cb, + (u64)bio->bi_iter.bi_sector << 9); if (ret) goto csum_failed; @@ -201,18 +201,16 @@ csum_failed: if (cb->errors) { bio_io_error(cb->orig_bio); } else { - int bio_index = 0; - struct bio_vec *bvec = cb->orig_bio->bi_io_vec; + int i; + 
struct bio_vec *bvec; /* * we have verified the checksum already, set page * checked so the end_io handlers know about it */ - while (bio_index < cb->orig_bio->bi_vcnt) { + bio_for_each_segment_all(bvec, cb->orig_bio, i) SetPageChecked(bvec->bv_page); - bvec++; - bio_index++; - } + bio_endio(cb->orig_bio, 0); } @@ -372,7 +370,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) { page = compressed_pages[pg_index]; page->mapping = inode->i_mapping; - if (bio->bi_size) + if (bio->bi_iter.bi_size) ret = io_tree->ops->merge_bio_hook(WRITE, page, 0, PAGE_CACHE_SIZE, bio, 0); @@ -412,7 +410,8 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); } if (bytes_left < PAGE_CACHE_SIZE) { - printk("bytes left %lu compress len %lu nr %lu\n", + btrfs_info(BTRFS_I(inode)->root->fs_info, + "bytes left %lu compress len %lu nr %lu", bytes_left, cb->compressed_len, cb->nr_pages); } bytes_left -= PAGE_CACHE_SIZE; @@ -506,7 +505,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (!em || last_offset < em->start || (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || - (em->block_start >> 9) != cb->orig_bio->bi_sector) { + (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) { free_extent_map(em); unlock_extent(tree, last_offset, end); unlock_page(page); @@ -552,7 +551,7 @@ next: * in it. We don't actually do IO on those pages but allocate new ones * to hold the compressed pages on disk. 
* - * bio->bi_sector points to the compressed extent on disk + * bio->bi_iter.bi_sector points to the compressed extent on disk * bio->bi_io_vec points to all of the inode pages * bio->bi_vcnt is a count of pages * @@ -573,7 +572,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, struct page *page; struct block_device *bdev; struct bio *comp_bio; - u64 cur_disk_byte = (u64)bio->bi_sector << 9; + u64 cur_disk_byte = (u64)bio->bi_iter.bi_sector << 9; u64 em_len; u64 em_start; struct extent_map *em; @@ -659,7 +658,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, page->mapping = inode->i_mapping; page->index = em_start >> PAGE_CACHE_SHIFT; - if (comp_bio->bi_size) + if (comp_bio->bi_iter.bi_size) ret = tree->ops->merge_bio_hook(READ, page, 0, PAGE_CACHE_SIZE, comp_bio, 0); @@ -687,8 +686,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, comp_bio, sums); BUG_ON(ret); /* -ENOMEM */ } - sums += (comp_bio->bi_size + root->sectorsize - 1) / - root->sectorsize; + sums += (comp_bio->bi_iter.bi_size + + root->sectorsize - 1) / root->sectorsize; ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); @@ -1011,6 +1010,8 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, bytes = min(bytes, working_bytes); kaddr = kmap_atomic(page_out); memcpy(kaddr + *pg_offset, buf + buf_offset, bytes); + if (*pg_index == (vcnt - 1) && *pg_offset == 0) + memset(kaddr + bytes, 0, PAGE_CACHE_SIZE - bytes); kunmap_atomic(kaddr); flush_dcache_page(page_out); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 316136bd6dd7..cbd3a7d6fa68 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -39,9 +39,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct extent_buffer *src_buf); static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, int slot); -static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, +static int tree_mod_log_free_eb(struct 
btrfs_fs_info *fs_info, struct extent_buffer *eb); -static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); struct btrfs_path *btrfs_alloc_path(void) { @@ -475,6 +474,8 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, * the index is the shifted logical of the *new* root node for root replace * operations, or the shifted logical of the affected block for all other * operations. + * + * Note: must be called with write lock (tree_mod_log_write_lock). */ static noinline int __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) @@ -483,24 +484,9 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) struct rb_node **new; struct rb_node *parent = NULL; struct tree_mod_elem *cur; - int ret = 0; BUG_ON(!tm); - tree_mod_log_write_lock(fs_info); - if (list_empty(&fs_info->tree_mod_seq_list)) { - tree_mod_log_write_unlock(fs_info); - /* - * Ok we no longer care about logging modifications, free up tm - * and return 0. Any callers shouldn't be using tm after - * calling tree_mod_log_insert, but if they do we can just - * change this to return a special error code to let the callers - * do their own thing. 
- */ - kfree(tm); - return 0; - } - spin_lock(&fs_info->tree_mod_seq_lock); tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info); spin_unlock(&fs_info->tree_mod_seq_lock); @@ -518,18 +504,13 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) new = &((*new)->rb_left); else if (cur->seq > tm->seq) new = &((*new)->rb_right); - else { - ret = -EEXIST; - kfree(tm); - goto out; - } + else + return -EEXIST; } rb_link_node(&tm->node, parent, new); rb_insert_color(&tm->node, tm_root); -out: - tree_mod_log_write_unlock(fs_info); - return ret; + return 0; } /* @@ -545,19 +526,38 @@ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, return 1; if (eb && btrfs_header_level(eb) == 0) return 1; + + tree_mod_log_write_lock(fs_info); + if (list_empty(&(fs_info)->tree_mod_seq_list)) { + tree_mod_log_write_unlock(fs_info); + return 1; + } + return 0; } -static inline int -__tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int slot, - enum mod_log_op op, gfp_t flags) +/* Similar to tree_mod_dont_log, but doesn't acquire any locks. 
*/ +static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info, + struct extent_buffer *eb) +{ + smp_mb(); + if (list_empty(&(fs_info)->tree_mod_seq_list)) + return 0; + if (eb && btrfs_header_level(eb) == 0) + return 0; + + return 1; +} + +static struct tree_mod_elem * +alloc_tree_mod_elem(struct extent_buffer *eb, int slot, + enum mod_log_op op, gfp_t flags) { struct tree_mod_elem *tm; tm = kzalloc(sizeof(*tm), flags); if (!tm) - return -ENOMEM; + return NULL; tm->index = eb->start >> PAGE_CACHE_SHIFT; if (op != MOD_LOG_KEY_ADD) { @@ -567,8 +567,9 @@ __tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, tm->op = op; tm->slot = slot; tm->generation = btrfs_node_ptr_generation(eb, slot); + RB_CLEAR_NODE(&tm->node); - return __tree_mod_log_insert(fs_info, tm); + return tm; } static noinline int @@ -576,10 +577,27 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, int slot, enum mod_log_op op, gfp_t flags) { - if (tree_mod_dont_log(fs_info, eb)) + struct tree_mod_elem *tm; + int ret; + + if (!tree_mod_need_log(fs_info, eb)) return 0; - return __tree_mod_log_insert_key(fs_info, eb, slot, op, flags); + tm = alloc_tree_mod_elem(eb, slot, op, flags); + if (!tm) + return -ENOMEM; + + if (tree_mod_dont_log(fs_info, eb)) { + kfree(tm); + return 0; + } + + ret = __tree_mod_log_insert(fs_info, tm); + tree_mod_log_write_unlock(fs_info); + if (ret) + kfree(tm); + + return ret; } static noinline int @@ -587,53 +605,95 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, int dst_slot, int src_slot, int nr_items, gfp_t flags) { - struct tree_mod_elem *tm; - int ret; + struct tree_mod_elem *tm = NULL; + struct tree_mod_elem **tm_list = NULL; + int ret = 0; int i; + int locked = 0; - if (tree_mod_dont_log(fs_info, eb)) + if (!tree_mod_need_log(fs_info, eb)) return 0; + tm_list = kzalloc(nr_items * sizeof(struct tree_mod_elem *), flags); + if (!tm_list) + return -ENOMEM; + + tm = kzalloc(sizeof(*tm), 
flags); + if (!tm) { + ret = -ENOMEM; + goto free_tms; + } + + tm->index = eb->start >> PAGE_CACHE_SHIFT; + tm->slot = src_slot; + tm->move.dst_slot = dst_slot; + tm->move.nr_items = nr_items; + tm->op = MOD_LOG_MOVE_KEYS; + + for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { + tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot, + MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags); + if (!tm_list[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + + if (tree_mod_dont_log(fs_info, eb)) + goto free_tms; + locked = 1; + /* * When we override something during the move, we log these removals. * This can only happen when we move towards the beginning of the * buffer, i.e. dst_slot < src_slot. */ for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { - ret = __tree_mod_log_insert_key(fs_info, eb, i + dst_slot, - MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS); - BUG_ON(ret < 0); + ret = __tree_mod_log_insert(fs_info, tm_list[i]); + if (ret) + goto free_tms; } - tm = kzalloc(sizeof(*tm), flags); - if (!tm) - return -ENOMEM; + ret = __tree_mod_log_insert(fs_info, tm); + if (ret) + goto free_tms; + tree_mod_log_write_unlock(fs_info); + kfree(tm_list); - tm->index = eb->start >> PAGE_CACHE_SHIFT; - tm->slot = src_slot; - tm->move.dst_slot = dst_slot; - tm->move.nr_items = nr_items; - tm->op = MOD_LOG_MOVE_KEYS; + return 0; +free_tms: + for (i = 0; i < nr_items; i++) { + if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) + rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); + kfree(tm_list[i]); + } + if (locked) + tree_mod_log_write_unlock(fs_info); + kfree(tm_list); + kfree(tm); - return __tree_mod_log_insert(fs_info, tm); + return ret; } -static inline void -__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) +static inline int +__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, + struct tree_mod_elem **tm_list, + int nritems) { - int i; - u32 nritems; + int i, j; int ret; - if (btrfs_header_level(eb) == 0) - return; - - nritems = 
btrfs_header_nritems(eb); for (i = nritems - 1; i >= 0; i--) { - ret = __tree_mod_log_insert_key(fs_info, eb, i, - MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); - BUG_ON(ret < 0); + ret = __tree_mod_log_insert(fs_info, tm_list[i]); + if (ret) { + for (j = nritems - 1; j > i; j--) + rb_erase(&tm_list[j]->node, + &fs_info->tree_mod_log); + return ret; + } } + + return 0; } static noinline int @@ -642,17 +702,38 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, struct extent_buffer *new_root, gfp_t flags, int log_removal) { - struct tree_mod_elem *tm; + struct tree_mod_elem *tm = NULL; + struct tree_mod_elem **tm_list = NULL; + int nritems = 0; + int ret = 0; + int i; - if (tree_mod_dont_log(fs_info, NULL)) + if (!tree_mod_need_log(fs_info, NULL)) return 0; - if (log_removal) - __tree_mod_log_free_eb(fs_info, old_root); + if (log_removal && btrfs_header_level(old_root) > 0) { + nritems = btrfs_header_nritems(old_root); + tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *), + flags); + if (!tm_list) { + ret = -ENOMEM; + goto free_tms; + } + for (i = 0; i < nritems; i++) { + tm_list[i] = alloc_tree_mod_elem(old_root, i, + MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags); + if (!tm_list[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + } tm = kzalloc(sizeof(*tm), flags); - if (!tm) - return -ENOMEM; + if (!tm) { + ret = -ENOMEM; + goto free_tms; + } tm->index = new_root->start >> PAGE_CACHE_SHIFT; tm->old_root.logical = old_root->start; @@ -660,7 +741,30 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, tm->generation = btrfs_header_generation(old_root); tm->op = MOD_LOG_ROOT_REPLACE; - return __tree_mod_log_insert(fs_info, tm); + if (tree_mod_dont_log(fs_info, NULL)) + goto free_tms; + + if (tm_list) + ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems); + if (!ret) + ret = __tree_mod_log_insert(fs_info, tm); + + tree_mod_log_write_unlock(fs_info); + if (ret) + goto free_tms; + kfree(tm_list); + + return ret; + +free_tms: + if (tm_list) { + for (i = 
0; i < nritems; i++) + kfree(tm_list[i]); + kfree(tm_list); + } + kfree(tm); + + return ret; } static struct tree_mod_elem * @@ -729,31 +833,75 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) return __tree_mod_log_search(fs_info, start, min_seq, 0); } -static noinline void +static noinline int tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, struct extent_buffer *src, unsigned long dst_offset, unsigned long src_offset, int nr_items) { - int ret; + int ret = 0; + struct tree_mod_elem **tm_list = NULL; + struct tree_mod_elem **tm_list_add, **tm_list_rem; int i; + int locked = 0; - if (tree_mod_dont_log(fs_info, NULL)) - return; + if (!tree_mod_need_log(fs_info, NULL)) + return 0; if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) - return; + return 0; + + tm_list = kzalloc(nr_items * 2 * sizeof(struct tree_mod_elem *), + GFP_NOFS); + if (!tm_list) + return -ENOMEM; + tm_list_add = tm_list; + tm_list_rem = tm_list + nr_items; for (i = 0; i < nr_items; i++) { - ret = __tree_mod_log_insert_key(fs_info, src, - i + src_offset, - MOD_LOG_KEY_REMOVE, GFP_NOFS); - BUG_ON(ret < 0); - ret = __tree_mod_log_insert_key(fs_info, dst, - i + dst_offset, - MOD_LOG_KEY_ADD, - GFP_NOFS); - BUG_ON(ret < 0); + tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset, + MOD_LOG_KEY_REMOVE, GFP_NOFS); + if (!tm_list_rem[i]) { + ret = -ENOMEM; + goto free_tms; + } + + tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset, + MOD_LOG_KEY_ADD, GFP_NOFS); + if (!tm_list_add[i]) { + ret = -ENOMEM; + goto free_tms; + } } + + if (tree_mod_dont_log(fs_info, NULL)) + goto free_tms; + locked = 1; + + for (i = 0; i < nr_items; i++) { + ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]); + if (ret) + goto free_tms; + ret = __tree_mod_log_insert(fs_info, tm_list_add[i]); + if (ret) + goto free_tms; + } + + tree_mod_log_write_unlock(fs_info); + kfree(tm_list); + + return 0; + +free_tms: + for (i = 0; i < nr_items * 2; i++) { 
+ if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) + rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); + kfree(tm_list[i]); + } + if (locked) + tree_mod_log_write_unlock(fs_info); + kfree(tm_list); + + return ret; } static inline void @@ -772,18 +920,58 @@ tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, { int ret; - ret = __tree_mod_log_insert_key(fs_info, eb, slot, + ret = tree_mod_log_insert_key(fs_info, eb, slot, MOD_LOG_KEY_REPLACE, atomic ? GFP_ATOMIC : GFP_NOFS); BUG_ON(ret < 0); } -static noinline void +static noinline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { + struct tree_mod_elem **tm_list = NULL; + int nritems = 0; + int i; + int ret = 0; + + if (btrfs_header_level(eb) == 0) + return 0; + + if (!tree_mod_need_log(fs_info, NULL)) + return 0; + + nritems = btrfs_header_nritems(eb); + tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *), + GFP_NOFS); + if (!tm_list) + return -ENOMEM; + + for (i = 0; i < nritems; i++) { + tm_list[i] = alloc_tree_mod_elem(eb, i, + MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); + if (!tm_list[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + if (tree_mod_dont_log(fs_info, eb)) - return; - __tree_mod_log_free_eb(fs_info, eb); + goto free_tms; + + ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems); + tree_mod_log_write_unlock(fs_info); + if (ret) + goto free_tms; + kfree(tm_list); + + return 0; + +free_tms: + for (i = 0; i < nritems; i++) + kfree(tm_list[i]); + kfree(tm_list); + + return ret; } static noinline void @@ -1041,8 +1229,13 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(parent); - if (last_ref) - tree_mod_log_free_eb(root->fs_info, buf); + if (last_ref) { + ret = tree_mod_log_free_eb(root->fs_info, buf); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } + } btrfs_free_tree_block(trans, root, buf, 
parent_start, last_ref); } @@ -1287,8 +1480,8 @@ get_old_root(struct btrfs_root *root, u64 time_seq) old = read_tree_block(root, logical, blocksize, 0); if (WARN_ON(!old || !extent_buffer_uptodate(old))) { free_extent_buffer(old); - pr_warn("btrfs: failed to read tree block %llu from get_old_root\n", - logical); + btrfs_warn(root->fs_info, + "failed to read tree block %llu from get_old_root", logical); } else { eb = btrfs_clone_extent_buffer(old); free_extent_buffer(old); @@ -2462,6 +2655,49 @@ static int key_search(struct extent_buffer *b, struct btrfs_key *key, return 0; } +int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *found_path, + u64 iobjectid, u64 ioff, u8 key_type, + struct btrfs_key *found_key) +{ + int ret; + struct btrfs_key key; + struct extent_buffer *eb; + struct btrfs_path *path; + + key.type = key_type; + key.objectid = iobjectid; + key.offset = ioff; + + if (found_path == NULL) { + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + } else + path = found_path; + + ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); + if ((ret < 0) || (found_key == NULL)) { + if (path != found_path) + btrfs_free_path(path); + return ret; + } + + eb = path->nodes[0]; + if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(fs_root, path); + if (ret) + return ret; + eb = path->nodes[0]; + } + + btrfs_item_key_to_cpu(eb, found_key, path->slots[0]); + if (found_key->type != key.type || + found_key->objectid != key.objectid) + return 1; + + return 0; +} + /* * look for key in the tree. 
path is filled in with nodes along the way * if key is found, we return zero and you can find the item in the leaf @@ -2495,6 +2731,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len > 0); WARN_ON(p->nodes[0] != NULL); + BUG_ON(!cow && ins_len); if (ins_len < 0) { lowest_unlock = 2; @@ -2603,8 +2840,6 @@ again: } } cow_done: - BUG_ON(!cow && ins_len); - p->nodes[level] = b; btrfs_clear_path_blocking(p, NULL, 0); @@ -2614,13 +2849,19 @@ cow_done: * It is safe to drop the lock on our parent before we * go through the expensive btree search on b. * - * If cow is true, then we might be changing slot zero, - * which may require changing the parent. So, we can't - * drop the lock until after we know which slot we're - * operating on. + * If we're inserting or deleting (ins_len != 0), then we might + * be changing slot zero, which may require changing the parent. + * So, we can't drop the lock until after we know which slot + * we're operating on. 
*/ - if (!cow) - btrfs_unlock_up_safe(p, level + 1); + if (!ins_len && !p->keep_locks) { + int u = level + 1; + + if (u < BTRFS_MAX_LEVEL && p->locks[u]) { + btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]); + p->locks[u] = 0; + } + } ret = key_search(b, key, level, &prev_cmp, &slot); @@ -2648,7 +2889,7 @@ cow_done: * which means we must have a write lock * on the parent */ - if (slot == 0 && cow && + if (slot == 0 && ins_len && write_lock_level < level + 1) { write_lock_level = level + 1; btrfs_release_path(p); @@ -2901,7 +3142,9 @@ again: if (ret < 0) return ret; if (!ret) { - p->slots[0] = btrfs_header_nritems(leaf) - 1; + leaf = p->nodes[0]; + if (p->slots[0] == btrfs_header_nritems(leaf)) + p->slots[0]--; return 0; } if (!return_any) @@ -3022,8 +3265,12 @@ static int push_node_left(struct btrfs_trans_handle *trans, } else push_items = min(src_nritems - 8, push_items); - tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, - push_items); + ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, + push_items); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(dst_nritems), btrfs_node_key_ptr_offset(0), @@ -3093,8 +3340,12 @@ static int balance_node_right(struct btrfs_trans_handle *trans, (dst_nritems) * sizeof(struct btrfs_key_ptr)); - tree_mod_log_eb_copy(root->fs_info, dst, src, 0, - src_nritems - push_items, push_items); + ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0, + src_nritems - push_items, push_items); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(src_nritems - push_items), @@ -3295,7 +3546,12 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_header_chunk_tree_uuid(split), BTRFS_UUID_SIZE); - tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid); + ret = tree_mod_log_eb_copy(root->fs_info, 
split, c, 0, + mid, c_nritems - mid); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } copy_extent_buffer(split, c, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(mid), @@ -3362,8 +3618,8 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root, int ret; ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems); if (ret < 0) { - printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, " - "used %d nritems %d\n", + btrfs_crit(root->fs_info, + "leaf free space ret %d, leaf data size %lu, used %d nritems %d", ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root), leaf_space_used(leaf, 0, nritems), nritems); } @@ -3571,6 +3827,19 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (left_nritems == 0) goto out_unlock; + if (path->slots[0] == left_nritems && !empty) { + /* Key greater than all keys in the leaf, right neighbor has + * enough room for it and we're not emptying our leaf to delete + * it, therefore use right neighbor to insert the new item and + * no need to touch/dirty our left leaft. 
*/ + btrfs_tree_unlock(left); + free_extent_buffer(left); + path->nodes[0] = right; + path->slots[0] = 0; + path->slots[1]++; + return 0; + } + return __push_leaf_right(trans, root, path, min_data_size, empty, right, free_space, left_nritems, min_slot); out_unlock: @@ -3887,14 +4156,17 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans, int progress = 0; int slot; u32 nritems; + int space_needed = data_size; slot = path->slots[0]; + if (slot < btrfs_header_nritems(path->nodes[0])) + space_needed -= btrfs_leaf_free_space(root, path->nodes[0]); /* * try to push all the items after our slot into the * right leaf */ - ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot); + ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot); if (ret < 0) return ret; @@ -3914,7 +4186,7 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans, /* try to push all the items before our slot into the next leaf */ slot = path->slots[0]; - ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot); + ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot); if (ret < 0) return ret; @@ -3958,13 +4230,18 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, /* first try to make some room by pushing left and right */ if (data_size && path->nodes[1]) { - wret = push_leaf_right(trans, root, path, data_size, - data_size, 0, 0); + int space_needed = data_size; + + if (slot < btrfs_header_nritems(l)) + space_needed -= btrfs_leaf_free_space(root, l); + + wret = push_leaf_right(trans, root, path, space_needed, + space_needed, 0, 0); if (wret < 0) return wret; if (wret) { - wret = push_leaf_left(trans, root, path, data_size, - data_size, 0, (u32)-1); + wret = push_leaf_left(trans, root, path, space_needed, + space_needed, 0, (u32)-1); if (wret < 0) return wret; } @@ -4432,7 +4709,7 @@ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path, BUG_ON(slot < 0); if (slot >= nritems) { 
btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "slot %d too large, nritems %d\n", + btrfs_crit(root->fs_info, "slot %d too large, nritems %d", slot, nritems); BUG_ON(1); } @@ -4495,7 +4772,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, if (btrfs_leaf_free_space(root, leaf) < total_size) { btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "not enough freespace need %u have %d\n", + btrfs_crit(root->fs_info, "not enough freespace need %u have %d", total_size, btrfs_leaf_free_space(root, leaf)); BUG(); } @@ -4505,7 +4782,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, if (old_data < data_end) { btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "slot %d old_data %d data_end %d\n", + btrfs_crit(root->fs_info, "slot %d old_data %d data_end %d", slot, old_data, data_end); BUG_ON(1); } @@ -4817,7 +5094,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, * This may release the path, and so you may lose any locks held at the * time you call it. */ -static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) +int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) { struct btrfs_key key; struct btrfs_disk_key found_key; @@ -5240,7 +5517,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, if (!left_start_ctransid || !right_start_ctransid) { WARN(1, KERN_WARNING - "btrfs: btrfs_compare_tree detected " + "BTRFS: btrfs_compare_tree detected " "a change in one of the trees while " "iterating. This is probably a " "bug.\n"); @@ -5680,3 +5957,46 @@ int btrfs_previous_item(struct btrfs_root *root, } return 1; } + +/* + * search in extent tree to find a previous Metadata/Data extent item with + * min objecitd. 
+ * + * returns 0 if something is found, 1 if nothing was found and < 0 on error + */ +int btrfs_previous_extent_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid) +{ + struct btrfs_key found_key; + struct extent_buffer *leaf; + u32 nritems; + int ret; + + while (1) { + if (path->slots[0] == 0) { + btrfs_set_path_blocking(path); + ret = btrfs_prev_leaf(root, path); + if (ret != 0) + return ret; + } else { + path->slots[0]--; + } + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (nritems == 0) + return 1; + if (path->slots[0] == nritems) + path->slots[0]--; + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid < min_objectid) + break; + if (found_key.type == BTRFS_EXTENT_ITEM_KEY || + found_key.type == BTRFS_METADATA_ITEM_KEY) + return 0; + if (found_key.objectid == min_objectid && + found_key.type < BTRFS_EXTENT_ITEM_KEY) + break; + } + return 1; +} diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7506825211a2..2c1a42ca519f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -521,9 +521,15 @@ struct btrfs_super_block { #define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6) #define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7) #define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL << 8) +#define BTRFS_FEATURE_INCOMPAT_NO_HOLES (1ULL << 9) #define BTRFS_FEATURE_COMPAT_SUPP 0ULL +#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL +#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL +#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL +#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL + #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ @@ -532,7 +538,12 @@ struct btrfs_super_block { BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ BTRFS_FEATURE_INCOMPAT_RAID56 | \ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ - BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) + BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ + 
BTRFS_FEATURE_INCOMPAT_NO_HOLES) + +#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ + (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) +#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL /* * A leaf is full of items. offset and size tell us where to find @@ -1094,7 +1105,7 @@ struct btrfs_qgroup_limit_item { } __attribute__ ((__packed__)); struct btrfs_space_info { - u64 flags; + spinlock_t lock; u64 total_bytes; /* total bytes in the space, this doesn't take mirrors into account */ @@ -1104,14 +1115,25 @@ struct btrfs_space_info { transaction finishes */ u64 bytes_reserved; /* total bytes the allocator has reserved for current allocations */ - u64 bytes_readonly; /* total bytes that are read only */ - u64 bytes_may_use; /* number of bytes that may be used for delalloc/allocations */ + u64 bytes_readonly; /* total bytes that are read only */ + + unsigned int full:1; /* indicates that we cannot allocate any more + chunks for this space */ + unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ + + unsigned int flush:1; /* set if we are trying to make space */ + + unsigned int force_alloc; /* set if we need to force a chunk + alloc for this space */ + u64 disk_used; /* total bytes used on disk */ u64 disk_total; /* total bytes on disk, takes mirrors into account */ + u64 flags; + /* * bytes_pinned is kept in line with what is actually pinned, as in * we've called update_block_group and dropped the bytes_used counter @@ -1124,22 +1146,15 @@ struct btrfs_space_info { */ struct percpu_counter total_bytes_pinned; - unsigned int full:1; /* indicates that we cannot allocate any more - chunks for this space */ - unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ - - unsigned int flush:1; /* set if we are trying to make space */ - - unsigned int force_alloc; /* set if we need to force a chunk - alloc for this space */ - struct list_head list; + struct rw_semaphore groups_sem; /* for block groups in our same type */ struct list_head block_groups[BTRFS_NR_RAID_TYPES]; 
- spinlock_t lock; - struct rw_semaphore groups_sem; wait_queue_head_t wait; + + struct kobject kobj; + struct kobject block_group_kobjs[BTRFS_NR_RAID_TYPES]; }; #define BTRFS_BLOCK_RSV_GLOBAL 1 @@ -1346,6 +1361,7 @@ struct btrfs_fs_info { u64 generation; u64 last_trans_committed; + u64 avg_delayed_ref_runtime; /* * this is updated to the current trans every time a full commit @@ -1448,7 +1464,6 @@ struct btrfs_fs_info { spinlock_t tree_mod_seq_lock; atomic64_t tree_mod_seq; struct list_head tree_mod_seq_list; - struct seq_list tree_mod_seq_elem; /* this protects tree_mod_log */ rwlock_t tree_mod_log_lock; @@ -1515,6 +1530,8 @@ struct btrfs_fs_info { int thread_pool_size; struct kobject super_kobj; + struct kobject *space_info_kobj; + struct kobject *device_dir_kobj; struct completion kobj_unregister; int do_barriers; int closing; @@ -1643,6 +1660,10 @@ struct btrfs_fs_info { spinlock_t reada_lock; struct radix_tree_root reada_tree; + /* Extent buffer radix tree */ + spinlock_t buffer_lock; + struct radix_tree_root buffer_radix; + /* next backup root to be overwritten */ int backup_root_index; @@ -1795,6 +1816,12 @@ struct btrfs_root { struct list_head ordered_extents; struct list_head ordered_root; u64 nr_ordered_extents; + + /* + * Number of currently running SEND ioctls to prevent + * manipulation with the read-only status via SUBVOL_SETFLAGS + */ + int send_in_progress; }; struct btrfs_ioctl_defrag_range_args { @@ -1997,6 +2024,7 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22) #define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23) +#define BTRFS_MOUNT_CHANGE_INODE_CACHE (1 << 24) #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) @@ -2925,6 +2953,10 @@ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation, struct btrfs_file_extent_item, generation, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes, struct btrfs_file_extent_item, num_bytes, 64); 
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes, + struct btrfs_file_extent_item, disk_num_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression, + struct btrfs_file_extent_item, compression, 8); static inline unsigned long btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e) @@ -2958,15 +2990,6 @@ BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, other_encoding, 16); -/* this returns the number of file bytes represented by the inline item. - * If an item is compressed, this is the uncompressed size - */ -static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb, - struct btrfs_file_extent_item *e) -{ - return btrfs_file_extent_ram_bytes(eb, e); -} - /* * this returns the number of bytes used by the item on disk, minus the * size of any extent headers. If a file is compressed on disk, this is @@ -2980,6 +3003,32 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, return btrfs_item_size(eb, e) - offset; } +/* this returns the number of file bytes represented by the inline item. 
+ * If an item is compressed, this is the uncompressed size + */ +static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb, + int slot, + struct btrfs_file_extent_item *fi) +{ + struct btrfs_map_token token; + + btrfs_init_map_token(&token); + /* + * return the space used on disk if this item isn't + * compressed or encoded + */ + if (btrfs_token_file_extent_compression(eb, fi, &token) == 0 && + btrfs_token_file_extent_encryption(eb, fi, &token) == 0 && + btrfs_token_file_extent_other_encoding(eb, fi, &token) == 0) { + return btrfs_file_extent_inline_item_len(eb, + btrfs_item_nr(slot)); + } + + /* otherwise use the ram bytes field */ + return btrfs_token_file_extent_ram_bytes(eb, fi, &token); +} + + /* btrfs_dev_stats_item */ static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb, struct btrfs_dev_stats_item *ptr, @@ -3143,6 +3192,8 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root, int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); @@ -3163,6 +3214,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group( struct btrfs_fs_info *info, u64 bytenr); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); +int get_block_group_index(struct btrfs_block_group_cache *cache); struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize, u64 parent, u64 root_objectid, @@ -3301,6 +3353,8 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2); int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, int type); +int btrfs_previous_extent_item(struct btrfs_root *root, + struct btrfs_path 
*path, u64 min_objectid); void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); @@ -3350,6 +3404,8 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *new_key); +int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, + u64 inum, u64 ioff, u8 key_type, struct btrfs_key *found_key); int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *p, int ins_len, int cow); @@ -3399,6 +3455,7 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, } int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); +int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq); static inline int btrfs_next_old_item(struct btrfs_root *root, @@ -3563,12 +3620,6 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, u64 inode_objectid, u64 ref_objectid, u64 *index); -int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, int mod, - u64 *ret_index); int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); @@ -3676,7 +3727,9 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, struct extent_state **cached_state); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, - struct btrfs_root *new_root, u64 new_dirid); + struct btrfs_root *new_root, + struct btrfs_root *parent_root, + u64 new_dirid); int 
btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); @@ -3745,7 +3798,10 @@ extern const struct file_operations btrfs_file_operations; int __btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, u64 start, u64 end, - u64 *drop_end, int drop_cache); + u64 *drop_end, int drop_cache, + int replace_extent, + u32 extent_item_size, + int *key_inserted); int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, int drop_cache); @@ -3764,6 +3820,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, /* sysfs.c */ int btrfs_init_sysfs(void); void btrfs_exit_sysfs(void); +int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info); +void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info); /* xattr.c */ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); @@ -3796,14 +3854,20 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) btrfs_printk(fs_info, KERN_NOTICE fmt, ##args) #define btrfs_info(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_INFO fmt, ##args) + +#ifdef DEBUG #define btrfs_debug(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) +#else +#define btrfs_debug(fs_info, fmt, args...) 
\ + no_printk(KERN_DEBUG fmt, ##args) +#endif #ifdef CONFIG_BTRFS_ASSERT static inline void assfail(char *expr, char *file, int line) { - printk(KERN_ERR "BTRFS assertion failed: %s, file: %s, line: %d", + pr_err("BTRFS: assertion failed: %s, file: %s, line: %d", expr, file, line); BUG(); } @@ -3841,7 +3905,7 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, if (!(features & flag)) { features |= flag; btrfs_set_super_incompat_flags(disk_super, features); - printk(KERN_INFO "btrfs: setting %llu feature flag\n", + btrfs_info(fs_info, "setting %llu feature flag", flag); } spin_unlock(&fs_info->super_lock); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 8d292fbae659..451b00c86f6c 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -55,8 +55,7 @@ static inline void btrfs_init_delayed_node( delayed_node->inode_id = inode_id; atomic_set(&delayed_node->refs, 0); delayed_node->count = 0; - delayed_node->in_list = 0; - delayed_node->inode_dirty = 0; + delayed_node->flags = 0; delayed_node->ins_root = RB_ROOT; delayed_node->del_root = RB_ROOT; mutex_init(&delayed_node->mutex); @@ -172,7 +171,7 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root, int mod) { spin_lock(&root->lock); - if (node->in_list) { + if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { if (!list_empty(&node->p_list)) list_move_tail(&node->p_list, &root->prepare_list); else if (mod) @@ -182,7 +181,7 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root, list_add_tail(&node->p_list, &root->prepare_list); atomic_inc(&node->refs); /* inserted into list */ root->nodes++; - node->in_list = 1; + set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags); } spin_unlock(&root->lock); } @@ -192,13 +191,13 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root, struct btrfs_delayed_node *node) { spin_lock(&root->lock); - if (node->in_list) { + if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, 
&node->flags)) { root->nodes--; atomic_dec(&node->refs); /* not in the list */ list_del_init(&node->n_list); if (!list_empty(&node->p_list)) list_del_init(&node->p_list); - node->in_list = 0; + clear_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags); } spin_unlock(&root->lock); } @@ -231,7 +230,8 @@ static struct btrfs_delayed_node *btrfs_next_delayed_node( delayed_root = node->root->fs_info->delayed_root; spin_lock(&delayed_root->lock); - if (!node->in_list) { /* not in the list */ + if (!test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { + /* not in the list */ if (list_empty(&delayed_root->node_list)) goto out; p = delayed_root->node_list.next; @@ -1004,9 +1004,10 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) { struct btrfs_delayed_root *delayed_root; - if (delayed_node && delayed_node->inode_dirty) { + if (delayed_node && + test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { BUG_ON(!delayed_node->root); - delayed_node->inode_dirty = 0; + clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); delayed_node->count--; delayed_root = delayed_node->root->fs_info->delayed_root; @@ -1014,6 +1015,18 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) } } +static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node) +{ + struct btrfs_delayed_root *delayed_root; + + ASSERT(delayed_node->root); + clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags); + delayed_node->count--; + + delayed_root = delayed_node->root->fs_info->delayed_root; + finish_one_item(delayed_root); +} + static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -1022,13 +1035,19 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, struct btrfs_key key; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; + int mod; int ret; key.objectid = node->inode_id; btrfs_set_key_type(&key, 
BTRFS_INODE_ITEM_KEY); key.offset = 0; - ret = btrfs_lookup_inode(trans, root, path, &key, 1); + if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) + mod = -1; + else + mod = 1; + + ret = btrfs_lookup_inode(trans, root, path, &key, mod); if (ret > 0) { btrfs_release_path(path); return -ENOENT; @@ -1036,19 +1055,58 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, return ret; } - btrfs_unlock_up_safe(path, 1); leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item, sizeof(struct btrfs_inode_item)); btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(path); + if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) + goto no_iref; + + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(leaf)) + goto search; +again: + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != node->inode_id) + goto out; + + if (key.type != BTRFS_INODE_REF_KEY && + key.type != BTRFS_INODE_EXTREF_KEY) + goto out; + + /* + * Delayed iref deletion is for the inode who has only one link, + * so there is only one iref. The case that several irefs are + * in the same item doesn't exist. 
+ */ + btrfs_del_item(trans, root, path); +out: + btrfs_release_delayed_iref(node); +no_iref: + btrfs_release_path(path); +err_out: btrfs_delayed_inode_release_metadata(root, node); btrfs_release_delayed_inode(node); - return 0; + return ret; + +search: + btrfs_release_path(path); + + btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY); + key.offset = -1; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto err_out; + ASSERT(ret); + + ret = 0; + leaf = path->nodes[0]; + path->slots[0]--; + goto again; } static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, @@ -1059,7 +1117,7 @@ static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, int ret; mutex_lock(&node->mutex); - if (!node->inode_dirty) { + if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &node->flags)) { mutex_unlock(&node->mutex); return 0; } @@ -1203,7 +1261,7 @@ int btrfs_commit_inode_delayed_inode(struct inode *inode) return 0; mutex_lock(&delayed_node->mutex); - if (!delayed_node->inode_dirty) { + if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { mutex_unlock(&delayed_node->mutex); btrfs_release_delayed_node(delayed_node); return 0; @@ -1227,7 +1285,7 @@ int btrfs_commit_inode_delayed_inode(struct inode *inode) trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv; mutex_lock(&delayed_node->mutex); - if (delayed_node->inode_dirty) + if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) ret = __btrfs_update_delayed_inode(trans, delayed_node->root, path, delayed_node); else @@ -1300,36 +1358,9 @@ again: trans->block_rsv = &root->fs_info->delayed_block_rsv; __btrfs_commit_inode_delayed_items(trans, path, delayed_node); - /* - * Maybe new delayed items have been inserted, so we need requeue - * the work. Besides that, we must dequeue the empty delayed nodes - * to avoid the race between delayed items balance and the worker. 
- * The race like this: - * Task1 Worker thread - * count == 0, needn't requeue - * also needn't insert the - * delayed node into prepare - * list again. - * add lots of delayed items - * queue the delayed node - * already in the list, - * and not in the prepare - * list, it means the delayed - * node is being dealt with - * by the worker. - * do delayed items balance - * the delayed node is being - * dealt with by the worker - * now, just wait. - * the worker goto idle. - * Task1 will sleep until the transaction is commited. - */ - mutex_lock(&delayed_node->mutex); - btrfs_dequeue_delayed_node(root->fs_info->delayed_root, delayed_node); - mutex_unlock(&delayed_node->mutex); trans->block_rsv = block_rsv; - btrfs_end_transaction_dmeta(trans, root); + btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty_nodelay(root); release_path: @@ -1376,52 +1407,41 @@ void btrfs_assert_delayed_root_empty(struct btrfs_root *root) WARN_ON(btrfs_first_delayed_node(delayed_root)); } -static int refs_newer(struct btrfs_delayed_root *delayed_root, - int seq, int count) +static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) { int val = atomic_read(&delayed_root->items_seq); - if (val < seq || val >= seq + count) + if (val < seq || val >= seq + BTRFS_DELAYED_BATCH) + return 1; + + if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) return 1; + return 0; } void btrfs_balance_delayed_items(struct btrfs_root *root) { struct btrfs_delayed_root *delayed_root; - int seq; delayed_root = btrfs_get_delayed_root(root); if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) return; - seq = atomic_read(&delayed_root->items_seq); - if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) { + int seq; int ret; - DEFINE_WAIT(__wait); + + seq = atomic_read(&delayed_root->items_seq); ret = btrfs_wq_run_delayed_node(delayed_root, root, 0); if (ret) return; - while (1) { - prepare_to_wait(&delayed_root->wait, &__wait, - TASK_INTERRUPTIBLE); - 
- if (refs_newer(delayed_root, seq, - BTRFS_DELAYED_BATCH) || - atomic_read(&delayed_root->items) < - BTRFS_DELAYED_BACKGROUND) { - break; - } - if (!signal_pending(current)) - schedule(); - else - break; - } - finish_wait(&delayed_root->wait, &__wait); + wait_event_interruptible(delayed_root->wait, + could_end_wait(delayed_root, seq)); + return; } btrfs_wq_run_delayed_node(delayed_root, root, BTRFS_DELAYED_BATCH); @@ -1472,9 +1492,9 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, mutex_lock(&delayed_node->mutex); ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); if (unlikely(ret)) { - printk(KERN_ERR "err add delayed dir index item(name: %.*s) " + btrfs_err(root->fs_info, "err add delayed dir index item(name: %.*s) " "into the insertion tree of the delayed node" - "(root id: %llu, inode id: %llu, errno: %d)\n", + "(root id: %llu, inode id: %llu, errno: %d)", name_len, name, delayed_node->root->objectid, delayed_node->inode_id, ret); BUG(); @@ -1544,9 +1564,9 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, mutex_lock(&node->mutex); ret = __btrfs_add_delayed_deletion_item(node, item); if (unlikely(ret)) { - printk(KERN_ERR "err add delayed dir index item(index: %llu) " + btrfs_err(root->fs_info, "err add delayed dir index item(index: %llu) " "into the deletion tree of the delayed node" - "(root id: %llu, inode id: %llu, errno: %d)\n", + "(root id: %llu, inode id: %llu, errno: %d)", index, node->root->objectid, node->inode_id, ret); BUG(); @@ -1759,7 +1779,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) return -ENOENT; mutex_lock(&delayed_node->mutex); - if (!delayed_node->inode_dirty) { + if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { mutex_unlock(&delayed_node->mutex); btrfs_release_delayed_node(delayed_node); return -ENOENT; @@ -1810,7 +1830,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, return PTR_ERR(delayed_node); 
mutex_lock(&delayed_node->mutex); - if (delayed_node->inode_dirty) { + if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { fill_stack_inode_item(trans, &delayed_node->inode_item, inode); goto release_node; } @@ -1821,7 +1841,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, goto release_node; fill_stack_inode_item(trans, &delayed_node->inode_item, inode); - delayed_node->inode_dirty = 1; + set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); delayed_node->count++; atomic_inc(&root->fs_info->delayed_root->items); release_node: @@ -1830,6 +1850,41 @@ release_node: return ret; } +int btrfs_delayed_delete_inode_ref(struct inode *inode) +{ + struct btrfs_delayed_node *delayed_node; + + delayed_node = btrfs_get_or_create_delayed_node(inode); + if (IS_ERR(delayed_node)) + return PTR_ERR(delayed_node); + + /* + * We don't reserve space for inode ref deletion is because: + * - We ONLY do async inode ref deletion for the inode who has only + * one link(i_nlink == 1), it means there is only one inode ref. + * And in most case, the inode ref and the inode item are in the + * same leaf, and we will deal with them at the same time. + * Since we are sure we will reserve the space for the inode item, + * it is unnecessary to reserve space for inode ref deletion. + * - If the inode ref and the inode item are not in the same leaf, + * We also needn't worry about enospc problem, because we reserve + * much more space for the inode update than it needs. + * - At the worst, we can steal some space from the global reservation. + * It is very rare. 
+ */ + mutex_lock(&delayed_node->mutex); + if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) + goto release_node; + + set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags); + delayed_node->count++; + atomic_inc(&BTRFS_I(inode)->root->fs_info->delayed_root->items); +release_node: + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return 0; +} + static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) { struct btrfs_root *root = delayed_node->root; @@ -1852,7 +1907,10 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) btrfs_release_delayed_item(prev_item); } - if (delayed_node->inode_dirty) { + if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) + btrfs_release_delayed_iref(delayed_node); + + if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { btrfs_delayed_inode_release_metadata(root, delayed_node); btrfs_release_delayed_inode(delayed_node); } diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index a4b38f934d14..f70119f25421 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -48,6 +48,10 @@ struct btrfs_delayed_root { wait_queue_head_t wait; }; +#define BTRFS_DELAYED_NODE_IN_LIST 0 +#define BTRFS_DELAYED_NODE_INODE_DIRTY 1 +#define BTRFS_DELAYED_NODE_DEL_IREF 2 + struct btrfs_delayed_node { u64 inode_id; u64 bytes_reserved; @@ -65,8 +69,7 @@ struct btrfs_delayed_node { struct btrfs_inode_item inode_item; atomic_t refs; u64 index_cnt; - bool in_list; - bool inode_dirty; + unsigned long flags; int count; }; @@ -125,6 +128,7 @@ int btrfs_commit_inode_delayed_inode(struct inode *inode); int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); int btrfs_fill_inode(struct inode *inode, u32 *rdev); +int btrfs_delayed_delete_inode_ref(struct inode *inode); /* Used for drop dead root */ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root); diff 
--git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index e4d467be2dd4..f3bff89eecf0 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -161,35 +161,61 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, return NULL; } +/* insert a new ref to head ref rbtree */ +static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent_node = NULL; + struct btrfs_delayed_ref_head *entry; + struct btrfs_delayed_ref_head *ins; + u64 bytenr; + + ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node); + bytenr = ins->node.bytenr; + while (*p) { + parent_node = *p; + entry = rb_entry(parent_node, struct btrfs_delayed_ref_head, + href_node); + + if (bytenr < entry->node.bytenr) + p = &(*p)->rb_left; + else if (bytenr > entry->node.bytenr) + p = &(*p)->rb_right; + else + return entry; + } + + rb_link_node(node, parent_node, p); + rb_insert_color(node, root); + return NULL; +} + /* * find an head entry based on bytenr. This returns the delayed ref * head if it was able to find one, or NULL if nothing was in that spot. * If return_bigger is given, the next bigger entry is returned if no exact * match is found. 
*/ -static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, - u64 bytenr, - struct btrfs_delayed_ref_node **last, - int return_bigger) +static struct btrfs_delayed_ref_head * +find_ref_head(struct rb_root *root, u64 bytenr, + struct btrfs_delayed_ref_head **last, int return_bigger) { struct rb_node *n; - struct btrfs_delayed_ref_node *entry; + struct btrfs_delayed_ref_head *entry; int cmp = 0; again: n = root->rb_node; entry = NULL; while (n) { - entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); - WARN_ON(!entry->in_tree); + entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); if (last) *last = entry; - if (bytenr < entry->bytenr) + if (bytenr < entry->node.bytenr) cmp = -1; - else if (bytenr > entry->bytenr) - cmp = 1; - else if (!btrfs_delayed_ref_is_head(entry)) + else if (bytenr > entry->node.bytenr) cmp = 1; else cmp = 0; @@ -203,12 +229,12 @@ again: } if (entry && return_bigger) { if (cmp > 0) { - n = rb_next(&entry->rb_node); + n = rb_next(&entry->href_node); if (!n) n = rb_first(root); - entry = rb_entry(n, struct btrfs_delayed_ref_node, - rb_node); - bytenr = entry->bytenr; + entry = rb_entry(n, struct btrfs_delayed_ref_head, + href_node); + bytenr = entry->node.bytenr; return_bigger = 0; goto again; } @@ -243,33 +269,38 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref) { - rb_erase(&ref->rb_node, &delayed_refs->root); + if (btrfs_delayed_ref_is_head(ref)) { + head = btrfs_delayed_node_to_head(ref); + rb_erase(&head->href_node, &delayed_refs->href_root); + } else { + assert_spin_locked(&head->lock); + rb_erase(&ref->rb_node, &head->ref_root); + } ref->in_tree = 0; btrfs_put_delayed_ref(ref); - delayed_refs->num_entries--; + atomic_dec(&delayed_refs->num_entries); if (trans->delayed_ref_updates) 
trans->delayed_ref_updates--; } static int merge_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref, u64 seq) { struct rb_node *node; - int merged = 0; int mod = 0; int done = 0; - node = rb_prev(&ref->rb_node); - while (node) { + node = rb_next(&ref->rb_node); + while (!done && node) { struct btrfs_delayed_ref_node *next; next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - node = rb_prev(node); - if (next->bytenr != ref->bytenr) - break; + node = rb_next(node); if (seq && next->seq >= seq) break; if (comp_entry(ref, next, 0)) @@ -289,12 +320,11 @@ static int merge_ref(struct btrfs_trans_handle *trans, mod = -next->ref_mod; } - merged++; - drop_delayed_ref(trans, delayed_refs, next); + drop_delayed_ref(trans, delayed_refs, head, next); ref->ref_mod += mod; if (ref->ref_mod == 0) { - drop_delayed_ref(trans, delayed_refs, ref); - break; + drop_delayed_ref(trans, delayed_refs, head, ref); + done = 1; } else { /* * You can't have multiples of the same ref on a tree @@ -303,13 +333,8 @@ static int merge_ref(struct btrfs_trans_handle *trans, WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || ref->type == BTRFS_SHARED_BLOCK_REF_KEY); } - - if (done) - break; - node = rb_prev(&ref->rb_node); } - - return merged; + return done; } void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, @@ -320,6 +345,14 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, struct rb_node *node; u64 seq = 0; + assert_spin_locked(&head->lock); + /* + * We don't have too much refs to merge in the case of delayed data + * refs. 
+ */ + if (head->is_data) + return; + spin_lock(&fs_info->tree_mod_seq_lock); if (!list_empty(&fs_info->tree_mod_seq_list)) { struct seq_list *elem; @@ -330,22 +363,19 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, } spin_unlock(&fs_info->tree_mod_seq_lock); - node = rb_prev(&head->node.rb_node); + node = rb_first(&head->ref_root); while (node) { struct btrfs_delayed_ref_node *ref; ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (ref->bytenr != head->node.bytenr) - break; - /* We can't merge refs that are outside of our seq count */ if (seq && ref->seq >= seq) break; - if (merge_ref(trans, delayed_refs, ref, seq)) - node = rb_prev(&head->node.rb_node); + if (merge_ref(trans, delayed_refs, head, ref, seq)) + node = rb_first(&head->ref_root); else - node = rb_prev(node); + node = rb_next(&ref->rb_node); } } @@ -373,71 +403,52 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, return ret; } -int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, - struct list_head *cluster, u64 start) +struct btrfs_delayed_ref_head * +btrfs_select_ref_head(struct btrfs_trans_handle *trans) { - int count = 0; struct btrfs_delayed_ref_root *delayed_refs; - struct rb_node *node; - struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_head *head; + u64 start; + bool loop = false; delayed_refs = &trans->transaction->delayed_refs; - if (start == 0) { - node = rb_first(&delayed_refs->root); - } else { - ref = NULL; - find_ref_head(&delayed_refs->root, start + 1, &ref, 1); - if (ref) { - node = &ref->rb_node; - } else - node = rb_first(&delayed_refs->root); - } + again: - while (node && count < 32) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (btrfs_delayed_ref_is_head(ref)) { - head = btrfs_delayed_node_to_head(ref); - if (list_empty(&head->cluster)) { - list_add_tail(&head->cluster, cluster); - delayed_refs->run_delayed_start = - head->node.bytenr; - count++; - - WARN_ON(delayed_refs->num_heads_ready == 
0); - delayed_refs->num_heads_ready--; - } else if (count) { - /* the goal of the clustering is to find extents - * that are likely to end up in the same extent - * leaf on disk. So, we don't want them spread - * all over the tree. Stop now if we've hit - * a head that was already in use - */ - break; - } - } - node = rb_next(node); - } - if (count) { - return 0; - } else if (start) { - /* - * we've gone to the end of the rbtree without finding any - * clusters. start from the beginning and try again - */ + start = delayed_refs->run_delayed_start; + head = find_ref_head(&delayed_refs->href_root, start, NULL, 1); + if (!head && !loop) { + delayed_refs->run_delayed_start = 0; start = 0; - node = rb_first(&delayed_refs->root); - goto again; + loop = true; + head = find_ref_head(&delayed_refs->href_root, start, NULL, 1); + if (!head) + return NULL; + } else if (!head && loop) { + return NULL; } - return 1; -} -void btrfs_release_ref_cluster(struct list_head *cluster) -{ - struct list_head *pos, *q; + while (head->processing) { + struct rb_node *node; + + node = rb_next(&head->href_node); + if (!node) { + if (loop) + return NULL; + delayed_refs->run_delayed_start = 0; + start = 0; + loop = true; + goto again; + } + head = rb_entry(node, struct btrfs_delayed_ref_head, + href_node); + } - list_for_each_safe(pos, q, cluster) - list_del_init(pos); + head->processing = 1; + WARN_ON(delayed_refs->num_heads_ready == 0); + delayed_refs->num_heads_ready--; + delayed_refs->run_delayed_start = head->node.bytenr + + head->node.num_bytes; + return head; } /* @@ -451,6 +462,7 @@ void btrfs_release_ref_cluster(struct list_head *cluster) static noinline void update_existing_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *existing, struct btrfs_delayed_ref_node *update) { @@ -463,7 +475,7 @@ update_existing_ref(struct btrfs_trans_handle *trans, */ existing->ref_mod--; if 
(existing->ref_mod == 0) - drop_delayed_ref(trans, delayed_refs, existing); + drop_delayed_ref(trans, delayed_refs, head, existing); else WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || existing->type == BTRFS_SHARED_BLOCK_REF_KEY); @@ -533,9 +545,13 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, } } /* - * update the reference mod on the head to reflect this new operation + * update the reference mod on the head to reflect this new operation, + * only need the lock for this case cause we could be processing it + * currently, for refs we just added we know we're a-ok. */ + spin_lock(&existing_ref->lock); existing->ref_mod += update->ref_mod; + spin_unlock(&existing_ref->lock); } /* @@ -543,13 +559,13 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, * this does all the dirty work in terms of maintaining the correct * overall modification count. */ -static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, - u64 bytenr, u64 num_bytes, - int action, int is_data) +static noinline struct btrfs_delayed_ref_head * +add_delayed_ref_head(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *ref, u64 bytenr, + u64 num_bytes, int action, int is_data) { - struct btrfs_delayed_ref_node *existing; + struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_head *head_ref = NULL; struct btrfs_delayed_ref_root *delayed_refs; int count_mod = 1; @@ -596,38 +612,43 @@ static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info, head_ref = btrfs_delayed_node_to_head(ref); head_ref->must_insert_reserved = must_insert_reserved; head_ref->is_data = is_data; + head_ref->ref_root = RB_ROOT; + head_ref->processing = 0; - INIT_LIST_HEAD(&head_ref->cluster); + spin_lock_init(&head_ref->lock); mutex_init(&head_ref->mutex); trace_add_delayed_ref_head(ref, head_ref, action); - existing = 
tree_insert(&delayed_refs->root, &ref->rb_node); - + existing = htree_insert(&delayed_refs->href_root, + &head_ref->href_node); if (existing) { - update_existing_head_ref(existing, ref); + update_existing_head_ref(&existing->node, ref); /* * we've updated the existing ref, free the newly * allocated ref */ kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); + head_ref = existing; } else { delayed_refs->num_heads++; delayed_refs->num_heads_ready++; - delayed_refs->num_entries++; + atomic_inc(&delayed_refs->num_entries); trans->delayed_ref_updates++; } + return head_ref; } /* * helper to insert a delayed tree ref into the rbtree. */ -static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, - u64 bytenr, u64 num_bytes, u64 parent, - u64 ref_root, int level, int action, - int for_cow) +static noinline void +add_delayed_tree_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head_ref, + struct btrfs_delayed_ref_node *ref, u64 bytenr, + u64 num_bytes, u64 parent, u64 ref_root, int level, + int action, int for_cow) { struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_tree_ref *full_ref; @@ -663,30 +684,33 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info, trace_add_delayed_tree_ref(ref, full_ref, action); - existing = tree_insert(&delayed_refs->root, &ref->rb_node); - + spin_lock(&head_ref->lock); + existing = tree_insert(&head_ref->ref_root, &ref->rb_node); if (existing) { - update_existing_ref(trans, delayed_refs, existing, ref); + update_existing_ref(trans, delayed_refs, head_ref, existing, + ref); /* * we've updated the existing ref, free the newly * allocated ref */ kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref); } else { - delayed_refs->num_entries++; + atomic_inc(&delayed_refs->num_entries); trans->delayed_ref_updates++; } + spin_unlock(&head_ref->lock); } /* * helper to 
insert a delayed data ref into the rbtree. */ -static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, - u64 bytenr, u64 num_bytes, u64 parent, - u64 ref_root, u64 owner, u64 offset, - int action, int for_cow) +static noinline void +add_delayed_data_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head_ref, + struct btrfs_delayed_ref_node *ref, u64 bytenr, + u64 num_bytes, u64 parent, u64 ref_root, u64 owner, + u64 offset, int action, int for_cow) { struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_data_ref *full_ref; @@ -724,19 +748,21 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info, trace_add_delayed_data_ref(ref, full_ref, action); - existing = tree_insert(&delayed_refs->root, &ref->rb_node); - + spin_lock(&head_ref->lock); + existing = tree_insert(&head_ref->ref_root, &ref->rb_node); if (existing) { - update_existing_ref(trans, delayed_refs, existing, ref); + update_existing_ref(trans, delayed_refs, head_ref, existing, + ref); /* * we've updated the existing ref, free the newly * allocated ref */ kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref); } else { - delayed_refs->num_entries++; + atomic_inc(&delayed_refs->num_entries); trans->delayed_ref_updates++; } + spin_unlock(&head_ref->lock); } /* @@ -775,10 +801,10 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, - num_bytes, action, 0); + head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, + bytenr, num_bytes, action, 0); - add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, + add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, level, action, for_cow); spin_unlock(&delayed_refs->lock); @@ -823,10 +849,10 
@@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, - num_bytes, action, 1); + head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, + bytenr, num_bytes, action, 1); - add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, + add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, owner, offset, action, for_cow); spin_unlock(&delayed_refs->lock); @@ -869,14 +895,10 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) { - struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_root *delayed_refs; delayed_refs = &trans->transaction->delayed_refs; - ref = find_ref_head(&delayed_refs->root, bytenr, NULL, 0); - if (ref) - return btrfs_delayed_node_to_head(ref); - return NULL; + return find_ref_head(&delayed_refs->href_root, bytenr, NULL, 0); } void btrfs_delayed_ref_exit(void) diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 70b962cc177d..4ba9b93022ff 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -81,7 +81,10 @@ struct btrfs_delayed_ref_head { */ struct mutex mutex; - struct list_head cluster; + spinlock_t lock; + struct rb_root ref_root; + + struct rb_node href_node; struct btrfs_delayed_extent_op *extent_op; /* @@ -98,6 +101,7 @@ struct btrfs_delayed_ref_head { */ unsigned int must_insert_reserved:1; unsigned int is_data:1; + unsigned int processing:1; }; struct btrfs_delayed_tree_ref { @@ -116,7 +120,8 @@ struct btrfs_delayed_data_ref { }; struct btrfs_delayed_ref_root { - struct rb_root root; + /* head ref rbtree */ + struct rb_root href_root; /* this spin lock protects the rbtree and the entries inside */ spinlock_t lock; @@ -124,7 +129,7 @@ struct btrfs_delayed_ref_root { /* how many delayed ref 
updates we've queued, used by the * throttling code */ - unsigned long num_entries; + atomic_t num_entries; /* total number of head nodes in tree */ unsigned long num_heads; @@ -133,15 +138,6 @@ struct btrfs_delayed_ref_root { unsigned long num_heads_ready; /* - * bumped when someone is making progress on the delayed - * refs, so that other procs know they are just adding to - * contention intead of helping - */ - atomic_t procs_running_refs; - atomic_t ref_seq; - wait_queue_head_t wait; - - /* * set when the tree is flushing before a transaction commit, * used by the throttling code to decide if new updates need * to be run right away @@ -226,9 +222,9 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) mutex_unlock(&head->mutex); } -int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, - struct list_head *cluster, u64 search_start); -void btrfs_release_ref_cluster(struct list_head *cluster); + +struct btrfs_delayed_ref_head * +btrfs_select_ref_head(struct btrfs_trans_handle *trans); int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 2cfc3dfff64f..564c92638b20 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -102,7 +102,8 @@ no_valid_dev_replace_entry_found: ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item); if (item_size != sizeof(struct btrfs_dev_replace_item)) { - pr_warn("btrfs: dev_replace entry found has unexpected size, ignore entry\n"); + btrfs_warn(fs_info, + "dev_replace entry found has unexpected size, ignore entry"); goto no_valid_dev_replace_entry_found; } @@ -145,13 +146,19 @@ no_valid_dev_replace_entry_found: if (!dev_replace->srcdev && !btrfs_test_opt(dev_root, DEGRADED)) { ret = -EIO; - pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n", - src_devid); + 
btrfs_warn(fs_info, + "cannot mount because device replace operation is ongoing and"); + btrfs_warn(fs_info, + "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?", + src_devid); } if (!dev_replace->tgtdev && !btrfs_test_opt(dev_root, DEGRADED)) { ret = -EIO; - pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run btrfs dev scan?\n", + btrfs_warn(fs_info, + "cannot mount because device replace operation is ongoing and"); + btrfs_warn(fs_info, + "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?", BTRFS_DEV_REPLACE_DEVID); } if (dev_replace->tgtdev) { @@ -210,7 +217,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, } ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); if (ret < 0) { - pr_warn("btrfs: error %d while searching for dev_replace item!\n", + btrfs_warn(fs_info, "error %d while searching for dev_replace item!", ret); goto out; } @@ -230,7 +237,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, */ ret = btrfs_del_item(trans, dev_root, path); if (ret != 0) { - pr_warn("btrfs: delete too small dev_replace item failed %d!\n", + btrfs_warn(fs_info, "delete too small dev_replace item failed %d!", ret); goto out; } @@ -243,7 +250,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, dev_root, path, &key, sizeof(*ptr)); if (ret < 0) { - pr_warn("btrfs: insert dev_replace item failed %d!\n", + btrfs_warn(fs_info, "insert dev_replace item failed %d!", ret); goto out; } @@ -305,7 +312,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, struct btrfs_device *src_device = NULL; if (btrfs_fs_incompat(fs_info, RAID56)) { - pr_warn("btrfs: dev_replace cannot yet handle RAID5/RAID6\n"); + btrfs_warn(fs_info, "dev_replace cannot yet handle RAID5/RAID6"); return -EINVAL; } @@ -325,7 +332,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, ret = btrfs_init_dev_replace_tgtdev(root, 
args->start.tgtdev_name, &tgt_device); if (ret) { - pr_err("btrfs: target device %s is invalid!\n", + btrfs_err(fs_info, "target device %s is invalid!", args->start.tgtdev_name); mutex_unlock(&fs_info->volume_mutex); return -EINVAL; @@ -341,7 +348,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, } if (tgt_device->total_bytes < src_device->total_bytes) { - pr_err("btrfs: target device is smaller than source device!\n"); + btrfs_err(fs_info, "target device is smaller than source device!"); ret = -EINVAL; goto leave_no_lock; } @@ -366,7 +373,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, dev_replace->tgtdev = tgt_device; printk_in_rcu(KERN_INFO - "btrfs: dev_replace from %s (devid %llu) to %s started\n", + "BTRFS: dev_replace from %s (devid %llu) to %s started\n", src_device->missing ? "<missing disk>" : rcu_str_deref(src_device->name), src_device->devid, @@ -489,7 +496,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, if (scrub_ret) { printk_in_rcu(KERN_ERR - "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", + "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", src_device->missing ? "<missing disk>" : rcu_str_deref(src_device->name), src_device->devid, @@ -504,7 +511,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, } printk_in_rcu(KERN_INFO - "btrfs: dev_replace from %s (devid %llu) to %s) finished\n", + "BTRFS: dev_replace from %s (devid %llu) to %s) finished\n", src_device->missing ? 
"<missing disk>" : rcu_str_deref(src_device->name), src_device->devid, @@ -699,7 +706,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; dev_replace->time_stopped = get_seconds(); dev_replace->item_needs_writeback = 1; - pr_info("btrfs: suspending dev_replace for unmount\n"); + btrfs_info(fs_info, "suspending dev_replace for unmount"); break; } @@ -728,8 +735,9 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) break; } if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) { - pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n" - "btrfs: you may cancel the operation after 'mount -o degraded'\n"); + btrfs_info(fs_info, "cannot continue dev_replace, tgtdev is missing"); + btrfs_info(fs_info, + "you may cancel the operation after 'mount -o degraded'"); btrfs_dev_replace_unlock(dev_replace); return 0; } @@ -755,14 +763,14 @@ static int btrfs_dev_replace_kthread(void *data) kfree(status_args); do_div(progress, 10); printk_in_rcu(KERN_INFO - "btrfs: continuing dev_replace from %s (devid %llu) to %s @%u%%\n", - dev_replace->srcdev->missing ? "<missing disk>" : - rcu_str_deref(dev_replace->srcdev->name), - dev_replace->srcdev->devid, - dev_replace->tgtdev ? - rcu_str_deref(dev_replace->tgtdev->name) : - "<missing target disk>", - (unsigned int)progress); + "BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n", + dev_replace->srcdev->missing ? "<missing disk>" : + rcu_str_deref(dev_replace->srcdev->name), + dev_replace->srcdev->devid, + dev_replace->tgtdev ? 
+ rcu_str_deref(dev_replace->tgtdev->name) : + "<missing target disk>", + (unsigned int)progress); } btrfs_dev_replace_continue_on_mount(fs_info); atomic_set(&fs_info->mutually_exclusive_operation_running, 0); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index c031ea3fd70f..a0691df5dcea 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -261,7 +261,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, * see if there is room in the item to insert this * name */ - data_size = sizeof(*di) + name_len + sizeof(struct btrfs_item); + data_size = sizeof(*di) + name_len; leaf = path->nodes[0]; slot = path->slots[0]; if (data_size + btrfs_item_size_nr(leaf, slot) + @@ -459,7 +459,7 @@ int verify_dir_item(struct btrfs_root *root, u8 type = btrfs_dir_type(leaf, dir_item); if (type >= BTRFS_FT_MAX) { - printk(KERN_CRIT "btrfs: invalid dir item type: %d\n", + btrfs_crit(root->fs_info, "invalid dir item type: %d", (int)type); return 1; } @@ -468,7 +468,7 @@ int verify_dir_item(struct btrfs_root *root, namelen = XATTR_NAME_MAX; if (btrfs_dir_name_len(leaf, dir_item) > namelen) { - printk(KERN_CRIT "btrfs: invalid dir item name len: %u\n", + btrfs_crit(root->fs_info, "invalid dir item name len: %u", (unsigned)btrfs_dir_data_len(leaf, dir_item)); return 1; } @@ -476,7 +476,7 @@ int verify_dir_item(struct btrfs_root *root, /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */ if ((btrfs_dir_data_len(leaf, dir_item) + btrfs_dir_name_len(leaf, dir_item)) > BTRFS_MAX_XATTR_SIZE(root)) { - printk(KERN_CRIT "btrfs: invalid dir item name + data len: %u + %u\n", + btrfs_crit(root->fs_info, "invalid dir item name + data len: %u + %u", (unsigned)btrfs_dir_name_len(leaf, dir_item), (unsigned)btrfs_dir_data_len(leaf, dir_item)); return 1; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8072cfa8a3b1..81ea55314b1f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -26,7 +26,6 @@ #include <linux/workqueue.h> #include <linux/kthread.h> 
#include <linux/freezer.h> -#include <linux/crc32c.h> #include <linux/slab.h> #include <linux/migrate.h> #include <linux/ratelimit.h> @@ -35,6 +34,7 @@ #include <asm/unaligned.h> #include "ctree.h" #include "disk-io.h" +#include "hash.h" #include "transaction.h" #include "btrfs_inode.h" #include "volumes.h" @@ -48,6 +48,7 @@ #include "rcu-string.h" #include "dev-replace.h" #include "raid56.h" +#include "sysfs.h" #ifdef CONFIG_X86 #include <asm/cpufeature.h> @@ -243,7 +244,7 @@ out: u32 btrfs_csum_data(char *data, u32 seed, size_t len) { - return crc32c(seed, data, len); + return btrfs_crc32c(seed, data, len); } void btrfs_csum_final(u32 crc, char *result) @@ -299,11 +300,11 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, memcpy(&found, result, csum_size); read_extent_buffer(buf, &val, 0, csum_size); - printk_ratelimited(KERN_INFO "btrfs: %s checksum verify " - "failed on %llu wanted %X found %X " - "level %d\n", - root->fs_info->sb->s_id, buf->start, - val, found, btrfs_header_level(buf)); + printk_ratelimited(KERN_INFO + "BTRFS: %s checksum verify failed on %llu wanted %X found %X " + "level %d\n", + root->fs_info->sb->s_id, buf->start, + val, found, btrfs_header_level(buf)); if (result != (char *)&inline_result) kfree(result); return 1; @@ -382,13 +383,14 @@ static int btrfs_check_super_csum(char *raw_disk_sb) ret = 1; if (ret && btrfs_super_generation(disk_sb) < 10) { - printk(KERN_WARNING "btrfs: super block crcs don't match, older mkfs detected\n"); + printk(KERN_WARNING + "BTRFS: super block crcs don't match, older mkfs detected\n"); ret = 0; } } if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) { - printk(KERN_ERR "btrfs: unsupported checksum algorithm %u\n", + printk(KERN_ERR "BTRFS: unsupported checksum algorithm %u\n", csum_type); ret = 1; } @@ -464,13 +466,10 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) { - struct 
extent_io_tree *tree; u64 start = page_offset(page); u64 found_start; struct extent_buffer *eb; - tree = &BTRFS_I(page->mapping->host)->io_tree; - eb = (struct extent_buffer *)page->private; if (page != eb->pages[0]) return 0; @@ -500,8 +499,8 @@ static int check_tree_block_fsid(struct btrfs_root *root, } #define CORRUPT(reason, eb, root, slot) \ - printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu," \ - "root=%llu, slot=%d\n", reason, \ + btrfs_crit(root->fs_info, "corrupt leaf, %s: block=%llu," \ + "root=%llu, slot=%d", reason, \ btrfs_header_bytenr(eb), root->objectid, slot) static noinline int check_leaf(struct btrfs_root *root, @@ -569,7 +568,6 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, u64 phy_offset, struct page *page, u64 start, u64 end, int mirror) { - struct extent_io_tree *tree; u64 found_start; int found_level; struct extent_buffer *eb; @@ -580,7 +578,6 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, if (!page->private) goto out; - tree = &BTRFS_I(page->mapping->host)->io_tree; eb = (struct extent_buffer *)page->private; /* the pending IO might have been the only thing that kept this buffer @@ -600,21 +597,21 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { - printk_ratelimited(KERN_INFO "btrfs bad tree block start " + printk_ratelimited(KERN_INFO "BTRFS: bad tree block start " "%llu %llu\n", found_start, eb->start); ret = -EIO; goto err; } if (check_tree_block_fsid(root, eb)) { - printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n", + printk_ratelimited(KERN_INFO "BTRFS: bad fsid on block %llu\n", eb->start); ret = -EIO; goto err; } found_level = btrfs_header_level(eb); if (found_level >= BTRFS_MAX_LEVEL) { - btrfs_info(root->fs_info, "bad tree block level %d\n", + btrfs_info(root->fs_info, "bad tree block level %d", (int)btrfs_header_level(eb)); ret = -EIO; goto err; @@ -842,20 +839,17 @@ int 
btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, static int btree_csum_one_bio(struct bio *bio) { - struct bio_vec *bvec = bio->bi_io_vec; - int bio_index = 0; + struct bio_vec *bvec; struct btrfs_root *root; - int ret = 0; + int i, ret = 0; - WARN_ON(bio->bi_vcnt <= 0); - while (bio_index < bio->bi_vcnt) { + bio_for_each_segment_all(bvec, bio, i) { root = BTRFS_I(bvec->bv_page->mapping->host)->root; ret = csum_dirty_buffer(root, bvec->bv_page); if (ret) break; - bio_index++; - bvec++; } + return ret; } @@ -967,11 +961,9 @@ static int btree_migratepage(struct address_space *mapping, static int btree_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct extent_io_tree *tree; struct btrfs_fs_info *fs_info; int ret; - tree = &BTRFS_I(mapping->host)->io_tree; if (wbc->sync_mode == WB_SYNC_NONE) { if (wbc->for_kupdate) @@ -1010,8 +1002,9 @@ static void btree_invalidatepage(struct page *page, unsigned int offset, extent_invalidatepage(tree, page, offset); btree_releasepage(page, GFP_NOFS); if (PagePrivate(page)) { - printk(KERN_WARNING "btrfs warning page private not zero " - "on page %llu\n", (unsigned long long)page_offset(page)); + btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info, + "page private not zero on page %llu", + (unsigned long long)page_offset(page)); ClearPagePrivate(page); set_page_private(page, 0); page_cache_release(page); @@ -1095,21 +1088,13 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) { - struct inode *btree_inode = root->fs_info->btree_inode; - struct extent_buffer *eb; - eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree, bytenr); - return eb; + return find_extent_buffer(root->fs_info, bytenr); } struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) { - struct inode *btree_inode = 
root->fs_info->btree_inode; - struct extent_buffer *eb; - - eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree, - bytenr, blocksize); - return eb; + return alloc_extent_buffer(root->fs_info, bytenr, blocksize); } @@ -1273,7 +1258,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root; struct btrfs_key key; int ret = 0; - u64 bytenr; uuid_le uuid; root = btrfs_alloc_root(fs_info); @@ -1295,7 +1279,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, goto fail; } - bytenr = leaf->start; memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_bytenr(leaf, leaf->start); btrfs_set_header_generation(leaf, trans->transid); @@ -1616,7 +1599,8 @@ again: if (ret) goto fail; - ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid); + ret = btrfs_find_item(fs_info->tree_root, NULL, BTRFS_ORPHAN_OBJECTID, + location->objectid, BTRFS_ORPHAN_ITEM_KEY, NULL); if (ret < 0) goto fail; if (ret == 0) @@ -1684,18 +1668,16 @@ static void end_workqueue_fn(struct btrfs_work *work) { struct bio *bio; struct end_io_wq *end_io_wq; - struct btrfs_fs_info *fs_info; int error; end_io_wq = container_of(work, struct end_io_wq, work); bio = end_io_wq->bio; - fs_info = end_io_wq->info; error = end_io_wq->error; bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; kfree(end_io_wq); - bio_endio(bio, error); + bio_endio_nodec(bio, error); } static int cleaner_kthread(void *arg) @@ -2080,6 +2062,12 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info) for (i = 0; i < ret; i++) btrfs_drop_and_free_fs_root(fs_info, gang[i]); } + + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + btrfs_free_log_root_tree(NULL, fs_info); + btrfs_destroy_pinned_extent(fs_info->tree_root, + fs_info->pinned_extents); + } } int open_ctree(struct super_block *sb, @@ -2154,6 +2142,7 @@ int open_ctree(struct super_block *sb, mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, 
GFP_NOFS); INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); + INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); @@ -2167,6 +2156,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->free_chunk_lock); spin_lock_init(&fs_info->tree_mod_seq_lock); spin_lock_init(&fs_info->super_lock); + spin_lock_init(&fs_info->buffer_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->reloc_mutex); seqlock_init(&fs_info->profiles_lock); @@ -2198,7 +2188,7 @@ int open_ctree(struct super_block *sb, fs_info->free_chunk_space = 0; fs_info->tree_mod_log = RB_ROOT; fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; - + fs_info->avg_delayed_ref_runtime = div64_u64(NSEC_PER_SEC, 64); /* readahead state */ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); spin_lock_init(&fs_info->reada_lock); @@ -2337,7 +2327,7 @@ int open_ctree(struct super_block *sb, * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). 
*/ if (btrfs_check_super_csum(bh->b_data)) { - printk(KERN_ERR "btrfs: superblock checksum mismatch\n"); + printk(KERN_ERR "BTRFS: superblock checksum mismatch\n"); err = -EINVAL; goto fail_alloc; } @@ -2356,7 +2346,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); if (ret) { - printk(KERN_ERR "btrfs: superblock contains fatal errors\n"); + printk(KERN_ERR "BTRFS: superblock contains fatal errors\n"); err = -EINVAL; goto fail_alloc; } @@ -2421,7 +2411,7 @@ int open_ctree(struct super_block *sb, features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) - printk(KERN_ERR "btrfs: has skinny extents\n"); + printk(KERN_ERR "BTRFS: has skinny extents\n"); /* * flag our filesystem as having big metadata blocks if @@ -2429,7 +2419,7 @@ int open_ctree(struct super_block *sb, */ if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) { if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) - printk(KERN_INFO "btrfs flagging fs with big metadata feature\n"); + printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n"); features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; } @@ -2446,7 +2436,7 @@ int open_ctree(struct super_block *sb, */ if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && (sectorsize != leafsize)) { - printk(KERN_WARNING "btrfs: unequal leaf/node/sector sizes " + printk(KERN_WARNING "BTRFS: unequal leaf/node/sector sizes " "are not allowed for mixed block groups on %s\n", sb->s_id); goto fail_alloc; @@ -2583,12 +2573,12 @@ int open_ctree(struct super_block *sb, sb->s_blocksize_bits = blksize_bits(sectorsize); if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) { - printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id); + printk(KERN_INFO "BTRFS: valid FS not found on %s\n", sb->s_id); goto fail_sb_buffer; } if (sectorsize != PAGE_SIZE) { - printk(KERN_WARNING "btrfs: Incompatible sector size(%lu) " + printk(KERN_WARNING "BTRFS: Incompatible 
sector size(%lu) " "found on %s\n", (unsigned long)sectorsize, sb->s_id); goto fail_sb_buffer; } @@ -2597,7 +2587,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_read_sys_array(tree_root); mutex_unlock(&fs_info->chunk_mutex); if (ret) { - printk(KERN_WARNING "btrfs: failed to read the system " + printk(KERN_WARNING "BTRFS: failed to read the system " "array on %s\n", sb->s_id); goto fail_sb_buffer; } @@ -2614,7 +2604,7 @@ int open_ctree(struct super_block *sb, blocksize, generation); if (!chunk_root->node || !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { - printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", + printk(KERN_WARNING "BTRFS: failed to read chunk root on %s\n", sb->s_id); goto fail_tree_roots; } @@ -2626,7 +2616,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_read_chunk_tree(chunk_root); if (ret) { - printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", + printk(KERN_WARNING "BTRFS: failed to read chunk tree on %s\n", sb->s_id); goto fail_tree_roots; } @@ -2638,7 +2628,7 @@ int open_ctree(struct super_block *sb, btrfs_close_extra_devices(fs_info, fs_devices, 0); if (!fs_devices->latest_bdev) { - printk(KERN_CRIT "btrfs: failed to read devices on %s\n", + printk(KERN_CRIT "BTRFS: failed to read devices on %s\n", sb->s_id); goto fail_tree_roots; } @@ -2653,7 +2643,7 @@ retry_root_backup: blocksize, generation); if (!tree_root->node || !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { - printk(KERN_WARNING "btrfs: failed to read tree root on %s\n", + printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n", sb->s_id); goto recovery_tree_root; @@ -2724,50 +2714,56 @@ retry_root_backup: ret = btrfs_recover_balance(fs_info); if (ret) { - printk(KERN_WARNING "btrfs: failed to recover balance\n"); + printk(KERN_WARNING "BTRFS: failed to recover balance\n"); goto fail_block_groups; } ret = btrfs_init_dev_stats(fs_info); if (ret) { - printk(KERN_ERR "btrfs: failed to init dev_stats: %d\n", + 
printk(KERN_ERR "BTRFS: failed to init dev_stats: %d\n", ret); goto fail_block_groups; } ret = btrfs_init_dev_replace(fs_info); if (ret) { - pr_err("btrfs: failed to init dev_replace: %d\n", ret); + pr_err("BTRFS: failed to init dev_replace: %d\n", ret); goto fail_block_groups; } btrfs_close_extra_devices(fs_info, fs_devices, 1); - ret = btrfs_init_space_info(fs_info); + ret = btrfs_sysfs_add_one(fs_info); if (ret) { - printk(KERN_ERR "Failed to initial space info: %d\n", ret); + pr_err("BTRFS: failed to init sysfs interface: %d\n", ret); goto fail_block_groups; } + ret = btrfs_init_space_info(fs_info); + if (ret) { + printk(KERN_ERR "BTRFS: Failed to initial space info: %d\n", ret); + goto fail_sysfs; + } + ret = btrfs_read_block_groups(extent_root); if (ret) { - printk(KERN_ERR "Failed to read block groups: %d\n", ret); - goto fail_block_groups; + printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret); + goto fail_sysfs; } fs_info->num_tolerated_disk_barrier_failures = btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); if (fs_info->fs_devices->missing_devices > fs_info->num_tolerated_disk_barrier_failures && !(sb->s_flags & MS_RDONLY)) { - printk(KERN_WARNING - "Btrfs: too many missing devices, writeable mount is not allowed\n"); - goto fail_block_groups; + printk(KERN_WARNING "BTRFS: " + "too many missing devices, writeable mount is not allowed\n"); + goto fail_sysfs; } fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, "btrfs-cleaner"); if (IS_ERR(fs_info->cleaner_kthread)) - goto fail_block_groups; + goto fail_sysfs; fs_info->transaction_kthread = kthread_run(transaction_kthread, tree_root, @@ -2778,11 +2774,15 @@ retry_root_backup: if (!btrfs_test_opt(tree_root, SSD) && !btrfs_test_opt(tree_root, NOSSD) && !fs_info->fs_devices->rotating) { - printk(KERN_INFO "Btrfs detected SSD devices, enabling SSD " + printk(KERN_INFO "BTRFS: detected SSD devices, enabling SSD " "mode\n"); btrfs_set_opt(fs_info->mount_opt, SSD); } + /* Set 
the real inode map cache flag */ + if (btrfs_test_opt(tree_root, CHANGE_INODE_CACHE)) + btrfs_set_opt(tree_root->fs_info->mount_opt, INODE_MAP_CACHE); + #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) { ret = btrfsic_mount(tree_root, fs_devices, @@ -2791,7 +2791,7 @@ retry_root_backup: 1 : 0, fs_info->check_integrity_print_mask); if (ret) - printk(KERN_WARNING "btrfs: failed to initialize" + printk(KERN_WARNING "BTRFS: failed to initialize" " integrity check module %s\n", sb->s_id); } #endif @@ -2804,7 +2804,7 @@ retry_root_backup: u64 bytenr = btrfs_super_log_root(disk_super); if (fs_devices->rw_devices == 0) { - printk(KERN_WARNING "Btrfs log replay required " + printk(KERN_WARNING "BTRFS: log replay required " "on RO media\n"); err = -EIO; goto fail_qgroup; @@ -2827,7 +2827,7 @@ retry_root_backup: generation + 1); if (!log_tree_root->node || !extent_buffer_uptodate(log_tree_root->node)) { - printk(KERN_ERR "btrfs: failed to read log tree\n"); + printk(KERN_ERR "BTRFS: failed to read log tree\n"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); goto fail_trans_kthread; @@ -2861,7 +2861,7 @@ retry_root_backup: ret = btrfs_recover_relocation(tree_root); if (ret < 0) { printk(KERN_WARNING - "btrfs: failed to recover relocation\n"); + "BTRFS: failed to recover relocation\n"); err = -EINVAL; goto fail_qgroup; } @@ -2891,14 +2891,14 @@ retry_root_backup: ret = btrfs_resume_balance_async(fs_info); if (ret) { - printk(KERN_WARNING "btrfs: failed to resume balance\n"); + printk(KERN_WARNING "BTRFS: failed to resume balance\n"); close_ctree(tree_root); return ret; } ret = btrfs_resume_dev_replace_async(fs_info); if (ret) { - pr_warn("btrfs: failed to resume dev_replace\n"); + pr_warn("BTRFS: failed to resume dev_replace\n"); close_ctree(tree_root); return ret; } @@ -2906,20 +2906,20 @@ retry_root_backup: btrfs_qgroup_rescan_resume(fs_info); if (create_uuid_tree) { - pr_info("btrfs: creating UUID tree\n"); + 
pr_info("BTRFS: creating UUID tree\n"); ret = btrfs_create_uuid_tree(fs_info); if (ret) { - pr_warn("btrfs: failed to create the UUID tree %d\n", + pr_warn("BTRFS: failed to create the UUID tree %d\n", ret); close_ctree(tree_root); return ret; } } else if (check_uuid_tree || btrfs_test_opt(tree_root, RESCAN_UUID_TREE)) { - pr_info("btrfs: checking UUID tree\n"); + pr_info("BTRFS: checking UUID tree\n"); ret = btrfs_check_uuid_tree(fs_info); if (ret) { - pr_warn("btrfs: failed to check the UUID tree %d\n", + pr_warn("BTRFS: failed to check the UUID tree %d\n", ret); close_ctree(tree_root); return ret; @@ -2945,6 +2945,9 @@ fail_cleaner: */ filemap_write_and_wait(fs_info->btree_inode->i_mapping); +fail_sysfs: + btrfs_sysfs_remove_one(fs_info); + fail_block_groups: btrfs_put_block_group_cache(fs_info); btrfs_free_block_groups(fs_info); @@ -3000,7 +3003,7 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) struct btrfs_device *device = (struct btrfs_device *) bh->b_private; - printk_ratelimited_in_rcu(KERN_WARNING "lost page write due to " + printk_ratelimited_in_rcu(KERN_WARNING "BTRFS: lost page write due to " "I/O error on %s\n", rcu_str_deref(device->name)); /* note, we dont' set_buffer_write_io_error because we have @@ -3119,7 +3122,7 @@ static int write_dev_supers(struct btrfs_device *device, bh = __getblk(device->bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE); if (!bh) { - printk(KERN_ERR "btrfs: couldn't get super " + printk(KERN_ERR "BTRFS: couldn't get super " "buffer head for bytenr %Lu\n", bytenr); errors++; continue; @@ -3140,7 +3143,10 @@ static int write_dev_supers(struct btrfs_device *device, * we fua the first super. The others we allow * to go down lazy. 
*/ - ret = btrfsic_submit_bh(WRITE_FUA, bh); + if (i == 0) + ret = btrfsic_submit_bh(WRITE_FUA, bh); + else + ret = btrfsic_submit_bh(WRITE_SYNC, bh); if (ret) errors++; } @@ -3186,7 +3192,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) wait_for_completion(&device->flush_wait); if (bio_flagged(bio, BIO_EOPNOTSUPP)) { - printk_in_rcu("btrfs: disabling barriers on dev %s\n", + printk_in_rcu("BTRFS: disabling barriers on dev %s\n", rcu_str_deref(device->name)); device->nobarriers = 1; } else if (!bio_flagged(bio, BIO_UPTODATE)) { @@ -3407,7 +3413,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) total_errors++; } if (total_errors > max_errors) { - printk(KERN_ERR "btrfs: %d errors while writing supers\n", + btrfs_err(root->fs_info, "%d errors while writing supers", total_errors); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); @@ -3455,10 +3461,8 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, if (btrfs_root_refs(&root->root_item) == 0) synchronize_srcu(&fs_info->subvol_srcu); - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) btrfs_free_log(NULL, root); - btrfs_free_log_root_tree(NULL, fs_info); - } __btrfs_remove_free_space_cache(root->free_ino_pinned); __btrfs_remove_free_space_cache(root->free_ino_ctl); @@ -3563,14 +3567,12 @@ int close_ctree(struct btrfs_root *root) if (!(fs_info->sb->s_flags & MS_RDONLY)) { ret = btrfs_commit_super(root); if (ret) - printk(KERN_ERR "btrfs: commit super ret %d\n", ret); + btrfs_err(root->fs_info, "commit super ret %d", ret); } if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) btrfs_error_commit_super(root); - btrfs_put_block_group_cache(fs_info); - kthread_stop(fs_info->transaction_kthread); kthread_stop(fs_info->cleaner_kthread); @@ -3580,12 +3582,16 @@ int close_ctree(struct btrfs_root *root) btrfs_free_qgroup_config(root->fs_info); if 
(percpu_counter_sum(&fs_info->delalloc_bytes)) { - printk(KERN_INFO "btrfs: at unmount delalloc count %lld\n", + btrfs_info(root->fs_info, "at unmount delalloc count %lld", percpu_counter_sum(&fs_info->delalloc_bytes)); } + btrfs_sysfs_remove_one(fs_info); + del_fs_roots(fs_info); + btrfs_put_block_group_cache(fs_info); + btrfs_free_block_groups(fs_info); btrfs_stop_all_workers(fs_info); @@ -3803,55 +3809,54 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, delayed_refs = &trans->delayed_refs; spin_lock(&delayed_refs->lock); - if (delayed_refs->num_entries == 0) { + if (atomic_read(&delayed_refs->num_entries) == 0) { spin_unlock(&delayed_refs->lock); - printk(KERN_INFO "delayed_refs has NO entry\n"); + btrfs_info(root->fs_info, "delayed_refs has NO entry"); return ret; } - while ((node = rb_first(&delayed_refs->root)) != NULL) { - struct btrfs_delayed_ref_head *head = NULL; + while ((node = rb_first(&delayed_refs->href_root)) != NULL) { + struct btrfs_delayed_ref_head *head; bool pin_bytes = false; - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - atomic_set(&ref->refs, 1); - if (btrfs_delayed_ref_is_head(ref)) { - - head = btrfs_delayed_node_to_head(ref); - if (!mutex_trylock(&head->mutex)) { - atomic_inc(&ref->refs); - spin_unlock(&delayed_refs->lock); - - /* Need to wait for the delayed ref to run */ - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(ref); - - spin_lock(&delayed_refs->lock); - continue; - } + head = rb_entry(node, struct btrfs_delayed_ref_head, + href_node); + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); - if (head->must_insert_reserved) - pin_bytes = true; - btrfs_free_delayed_extent_op(head->extent_op); - delayed_refs->num_heads--; - if (list_empty(&head->cluster)) - delayed_refs->num_heads_ready--; - list_del_init(&head->cluster); - } - - ref->in_tree = 0; - rb_erase(&ref->rb_node, &delayed_refs->root); - 
delayed_refs->num_entries--; - spin_unlock(&delayed_refs->lock); - if (head) { - if (pin_bytes) - btrfs_pin_extent(root, ref->bytenr, - ref->num_bytes, 1); + mutex_lock(&head->mutex); mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + spin_lock(&delayed_refs->lock); + continue; + } + spin_lock(&head->lock); + while ((node = rb_first(&head->ref_root)) != NULL) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, + rb_node); + ref->in_tree = 0; + rb_erase(&ref->rb_node, &head->ref_root); + atomic_dec(&delayed_refs->num_entries); + btrfs_put_delayed_ref(ref); } - btrfs_put_delayed_ref(ref); + if (head->must_insert_reserved) + pin_bytes = true; + btrfs_free_delayed_extent_op(head->extent_op); + delayed_refs->num_heads--; + if (head->processing == 0) + delayed_refs->num_heads_ready--; + atomic_dec(&delayed_refs->num_entries); + head->node.in_tree = 0; + rb_erase(&head->href_node, &delayed_refs->href_root); + spin_unlock(&head->lock); + spin_unlock(&delayed_refs->lock); + mutex_unlock(&head->mutex); + if (pin_bytes) + btrfs_pin_extent(root, head->node.bytenr, + head->node.num_bytes, 1); + btrfs_put_delayed_ref(&head->node); cond_resched(); spin_lock(&delayed_refs->lock); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9c01509dd8ab..32312e09f0f5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -35,6 +35,7 @@ #include "locking.h" #include "free-space-cache.h" #include "math.h" +#include "sysfs.h" #undef SCRAMBLE_DELAYED_REFS @@ -441,7 +442,8 @@ next: if (ret) break; - if (need_resched()) { + if (need_resched() || + rwsem_is_contended(&fs_info->extent_commit_sem)) { caching_ctl->progress = last; btrfs_release_path(path); up_read(&fs_info->extent_commit_sem); @@ -855,12 +857,14 @@ again: btrfs_put_delayed_ref(&head->node); goto search_again; } + spin_lock(&head->lock); if (head->extent_op && head->extent_op->update_flags) extent_flags |= head->extent_op->flags_to_set; else BUG_ON(num_refs == 0); num_refs += 
head->node.ref_mod; + spin_unlock(&head->lock); mutex_unlock(&head->mutex); } spin_unlock(&delayed_refs->lock); @@ -1070,11 +1074,11 @@ static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) __le64 lenum; lenum = cpu_to_le64(root_objectid); - high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); + high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(owner); - low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(offset); - low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); return ((u64)high_crc << 31) ^ (u64)low_crc; } @@ -2285,64 +2289,62 @@ static noinline struct btrfs_delayed_ref_node * select_delayed_ref(struct btrfs_delayed_ref_head *head) { struct rb_node *node; - struct btrfs_delayed_ref_node *ref; - int action = BTRFS_ADD_DELAYED_REF; -again: + struct btrfs_delayed_ref_node *ref, *last = NULL;; + /* * select delayed ref of type BTRFS_ADD_DELAYED_REF first. * this prevents ref count from going down to zero when * there still are pending delayed ref. */ - node = rb_prev(&head->node.rb_node); - while (1) { - if (!node) - break; + node = rb_first(&head->ref_root); + while (node) { ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (ref->bytenr != head->node.bytenr) - break; - if (ref->action == action) + if (ref->action == BTRFS_ADD_DELAYED_REF) return ref; - node = rb_prev(node); - } - if (action == BTRFS_ADD_DELAYED_REF) { - action = BTRFS_DROP_DELAYED_REF; - goto again; + else if (last == NULL) + last = ref; + node = rb_next(node); } - return NULL; + return last; } /* * Returns 0 on success or if called with an already aborted transaction. * Returns -ENOMEM or -EIO on failure and will abort the transaction. 
*/ -static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct list_head *cluster) +static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + unsigned long nr) { struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_head *locked_ref = NULL; struct btrfs_delayed_extent_op *extent_op; struct btrfs_fs_info *fs_info = root->fs_info; + ktime_t start = ktime_get(); int ret; - int count = 0; + unsigned long count = 0; + unsigned long actual_count = 0; int must_insert_reserved = 0; delayed_refs = &trans->transaction->delayed_refs; while (1) { if (!locked_ref) { - /* pick a new head ref from the cluster list */ - if (list_empty(cluster)) + if (count >= nr) break; - locked_ref = list_entry(cluster->next, - struct btrfs_delayed_ref_head, cluster); + spin_lock(&delayed_refs->lock); + locked_ref = btrfs_select_ref_head(trans); + if (!locked_ref) { + spin_unlock(&delayed_refs->lock); + break; + } /* grab the lock that says we are going to process * all the refs for this head */ ret = btrfs_delayed_ref_lock(trans, locked_ref); - + spin_unlock(&delayed_refs->lock); /* * we may have dropped the spin lock to get the head * mutex lock, and that might have given someone else @@ -2363,6 +2365,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, * finish. If we merged anything we need to re-loop so we can * get a good ref. */ + spin_lock(&locked_ref->lock); btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, locked_ref); @@ -2374,17 +2377,15 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, if (ref && ref->seq && btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { - /* - * there are still refs with lower seq numbers in the - * process of being added. Don't run this ref yet. 
- */ - list_del_init(&locked_ref->cluster); + spin_unlock(&locked_ref->lock); btrfs_delayed_ref_unlock(locked_ref); - locked_ref = NULL; + spin_lock(&delayed_refs->lock); + locked_ref->processing = 0; delayed_refs->num_heads_ready++; spin_unlock(&delayed_refs->lock); + locked_ref = NULL; cond_resched(); - spin_lock(&delayed_refs->lock); + count++; continue; } @@ -2399,6 +2400,8 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, locked_ref->extent_op = NULL; if (!ref) { + + /* All delayed refs have been processed, Go ahead * and send the head node to run_one_delayed_ref, * so that any accounting fixes can happen @@ -2411,8 +2414,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, } if (extent_op) { - spin_unlock(&delayed_refs->lock); - + spin_unlock(&locked_ref->lock); ret = run_delayed_extent_op(trans, root, ref, extent_op); btrfs_free_delayed_extent_op(extent_op); @@ -2426,19 +2428,39 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, */ if (must_insert_reserved) locked_ref->must_insert_reserved = 1; + locked_ref->processing = 0; btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); - spin_lock(&delayed_refs->lock); btrfs_delayed_ref_unlock(locked_ref); return ret; } + continue; + } - goto next; + /* + * Need to drop our head ref lock and re-aqcuire the + * delayed ref lock and then re-check to make sure + * nobody got added. 
+ */ + spin_unlock(&locked_ref->lock); + spin_lock(&delayed_refs->lock); + spin_lock(&locked_ref->lock); + if (rb_first(&locked_ref->ref_root)) { + spin_unlock(&locked_ref->lock); + spin_unlock(&delayed_refs->lock); + continue; } + ref->in_tree = 0; + delayed_refs->num_heads--; + rb_erase(&locked_ref->href_node, + &delayed_refs->href_root); + spin_unlock(&delayed_refs->lock); + } else { + actual_count++; + ref->in_tree = 0; + rb_erase(&ref->rb_node, &locked_ref->ref_root); } + atomic_dec(&delayed_refs->num_entries); - ref->in_tree = 0; - rb_erase(&ref->rb_node, &delayed_refs->root); - delayed_refs->num_entries--; if (!btrfs_delayed_ref_is_head(ref)) { /* * when we play the delayed ref, also correct the @@ -2455,20 +2477,18 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, default: WARN_ON(1); } - } else { - list_del_init(&locked_ref->cluster); } - spin_unlock(&delayed_refs->lock); + spin_unlock(&locked_ref->lock); ret = run_one_delayed_ref(trans, root, ref, extent_op, must_insert_reserved); btrfs_free_delayed_extent_op(extent_op); if (ret) { + locked_ref->processing = 0; btrfs_delayed_ref_unlock(locked_ref); btrfs_put_delayed_ref(ref); btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret); - spin_lock(&delayed_refs->lock); return ret; } @@ -2484,11 +2504,29 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, } btrfs_put_delayed_ref(ref); count++; -next: cond_resched(); + } + + /* + * We don't want to include ref heads since we can have empty ref heads + * and those will drastically skew our runtime down since we just do + * accounting, no actual extent tree updates. + */ + if (actual_count > 0) { + u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); + u64 avg; + + /* + * We weigh the current average higher than our current runtime + * to avoid large swings in the average. 
+ */ spin_lock(&delayed_refs->lock); + avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; + avg = div64_u64(avg, 4); + fs_info->avg_delayed_ref_runtime = avg; + spin_unlock(&delayed_refs->lock); } - return count; + return 0; } #ifdef SCRAMBLE_DELAYED_REFS @@ -2570,16 +2608,6 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, return ret; } -static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq, - int count) -{ - int val = atomic_read(&delayed_refs->ref_seq); - - if (val < seq || val >= seq + count) - return 1; - return 0; -} - static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) { u64 num_bytes; @@ -2596,7 +2624,7 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); } -int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, +int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_block_rsv *global_rsv; @@ -2625,6 +2653,22 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, return ret; } +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + u64 num_entries = + atomic_read(&trans->transaction->delayed_refs.num_entries); + u64 avg_runtime; + + smp_mb(); + avg_runtime = fs_info->avg_delayed_ref_runtime; + if (num_entries * avg_runtime >= NSEC_PER_SEC) + return 1; + + return btrfs_check_space_for_delayed_refs(trans, root); +} + /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. 
count can be @@ -2640,13 +2684,10 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, { struct rb_node *node; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_node *ref; - struct list_head cluster; + struct btrfs_delayed_ref_head *head; int ret; - u64 delayed_start; int run_all = count == (unsigned long)-1; int run_most = 0; - int loops; /* We'll clean this up in btrfs_cleanup_transaction */ if (trans->aborted) @@ -2658,130 +2699,40 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); delayed_refs = &trans->transaction->delayed_refs; - INIT_LIST_HEAD(&cluster); if (count == 0) { - count = delayed_refs->num_entries * 2; + count = atomic_read(&delayed_refs->num_entries) * 2; run_most = 1; } - if (!run_all && !run_most) { - int old; - int seq = atomic_read(&delayed_refs->ref_seq); - -progress: - old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); - if (old) { - DEFINE_WAIT(__wait); - if (delayed_refs->flushing || - !btrfs_should_throttle_delayed_refs(trans, root)) - return 0; - - prepare_to_wait(&delayed_refs->wait, &__wait, - TASK_UNINTERRUPTIBLE); - - old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); - if (old) { - schedule(); - finish_wait(&delayed_refs->wait, &__wait); - - if (!refs_newer(delayed_refs, seq, 256)) - goto progress; - else - return 0; - } else { - finish_wait(&delayed_refs->wait, &__wait); - goto again; - } - } - - } else { - atomic_inc(&delayed_refs->procs_running_refs); - } - again: - loops = 0; - spin_lock(&delayed_refs->lock); - #ifdef SCRAMBLE_DELAYED_REFS delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); #endif - - while (1) { - if (!(run_all || run_most) && - !btrfs_should_throttle_delayed_refs(trans, root)) - break; - - /* - * go find something we can process in the rbtree. 
We start at - * the beginning of the tree, and then build a cluster - * of refs to process starting at the first one we are able to - * lock - */ - delayed_start = delayed_refs->run_delayed_start; - ret = btrfs_find_ref_cluster(trans, &cluster, - delayed_refs->run_delayed_start); - if (ret) - break; - - ret = run_clustered_refs(trans, root, &cluster); - if (ret < 0) { - btrfs_release_ref_cluster(&cluster); - spin_unlock(&delayed_refs->lock); - btrfs_abort_transaction(trans, root, ret); - atomic_dec(&delayed_refs->procs_running_refs); - wake_up(&delayed_refs->wait); - return ret; - } - - atomic_add(ret, &delayed_refs->ref_seq); - - count -= min_t(unsigned long, ret, count); - - if (count == 0) - break; - - if (delayed_start >= delayed_refs->run_delayed_start) { - if (loops == 0) { - /* - * btrfs_find_ref_cluster looped. let's do one - * more cycle. if we don't run any delayed ref - * during that cycle (because we can't because - * all of them are blocked), bail out. - */ - loops = 1; - } else { - /* - * no runnable refs left, stop trying - */ - BUG_ON(run_all); - break; - } - } - if (ret) { - /* refs were run, let's reset staleness detection */ - loops = 0; - } + ret = __btrfs_run_delayed_refs(trans, root, count); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + return ret; } if (run_all) { - if (!list_empty(&trans->new_bgs)) { - spin_unlock(&delayed_refs->lock); + if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); - spin_lock(&delayed_refs->lock); - } - node = rb_first(&delayed_refs->root); - if (!node) + spin_lock(&delayed_refs->lock); + node = rb_first(&delayed_refs->href_root); + if (!node) { + spin_unlock(&delayed_refs->lock); goto out; + } count = (unsigned long)-1; while (node) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); - if (btrfs_delayed_ref_is_head(ref)) { - struct btrfs_delayed_ref_head *head; + head = rb_entry(node, struct btrfs_delayed_ref_head, + href_node); + if 
(btrfs_delayed_ref_is_head(&head->node)) { + struct btrfs_delayed_ref_node *ref; - head = btrfs_delayed_node_to_head(ref); + ref = &head->node; atomic_inc(&ref->refs); spin_unlock(&delayed_refs->lock); @@ -2795,20 +2746,16 @@ again: btrfs_put_delayed_ref(ref); cond_resched(); goto again; + } else { + WARN_ON(1); } node = rb_next(node); } spin_unlock(&delayed_refs->lock); - schedule_timeout(1); + cond_resched(); goto again; } out: - atomic_dec(&delayed_refs->procs_running_refs); - smp_mb(); - if (waitqueue_active(&delayed_refs->wait)) - wake_up(&delayed_refs->wait); - - spin_unlock(&delayed_refs->lock); assert_qgroups_uptodate(trans); return 0; } @@ -2850,12 +2797,13 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, struct rb_node *node; int ret = 0; - ret = -ENOENT; delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); head = btrfs_find_delayed_ref_head(trans, bytenr); - if (!head) - goto out; + if (!head) { + spin_unlock(&delayed_refs->lock); + return 0; + } if (!mutex_trylock(&head->mutex)) { atomic_inc(&head->node.refs); @@ -2872,40 +2820,35 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, btrfs_put_delayed_ref(&head->node); return -EAGAIN; } + spin_unlock(&delayed_refs->lock); - node = rb_prev(&head->node.rb_node); - if (!node) - goto out_unlock; - - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - - if (ref->bytenr != bytenr) - goto out_unlock; - - ret = 1; - if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) - goto out_unlock; + spin_lock(&head->lock); + node = rb_first(&head->ref_root); + while (node) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + node = rb_next(node); - data_ref = btrfs_delayed_node_to_data_ref(ref); + /* If it's a shared ref we know a cross reference exists */ + if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { + ret = 1; + break; + } - node = rb_prev(node); - if (node) { - int seq = ref->seq; + data_ref = 
btrfs_delayed_node_to_data_ref(ref); - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (ref->bytenr == bytenr && ref->seq == seq) - goto out_unlock; + /* + * If our ref doesn't match the one we're currently looking at + * then we have a cross reference. + */ + if (data_ref->root != root->root_key.objectid || + data_ref->objectid != objectid || + data_ref->offset != offset) { + ret = 1; + break; + } } - - if (data_ref->root != root->root_key.objectid || - data_ref->objectid != objectid || data_ref->offset != offset) - goto out_unlock; - - ret = 0; -out_unlock: + spin_unlock(&head->lock); mutex_unlock(&head->mutex); -out: - spin_unlock(&delayed_refs->lock); return ret; } @@ -3402,6 +3345,23 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) return readonly; } +static const char *alloc_name(u64 flags) +{ + switch (flags) { + case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: + return "mixed"; + case BTRFS_BLOCK_GROUP_METADATA: + return "metadata"; + case BTRFS_BLOCK_GROUP_DATA: + return "data"; + case BTRFS_BLOCK_GROUP_SYSTEM: + return "system"; + default: + WARN_ON(1); + return "invalid-combination"; + }; +} + static int update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 bytes_used, struct btrfs_space_info **space_info) @@ -3439,8 +3399,10 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, return ret; } - for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { INIT_LIST_HEAD(&found->block_groups[i]); + kobject_init(&found->block_group_kobjs[i], &btrfs_raid_ktype); + } init_rwsem(&found->groups_sem); spin_lock_init(&found->lock); found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; @@ -3457,11 +3419,21 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->chunk_alloc = 0; found->flush = 0; init_waitqueue_head(&found->wait); + + ret = kobject_init_and_add(&found->kobj, &space_info_ktype, + info->space_info_kobj, "%s", + 
alloc_name(found->flags)); + if (ret) { + kfree(found); + return ret; + } + *space_info = found; list_add_rcu(&found->list, &info->space_info); if (flags & BTRFS_BLOCK_GROUP_DATA) info->data_sinfo = found; - return 0; + + return ret; } static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) @@ -4637,7 +4609,7 @@ void btrfs_block_rsv_release(struct btrfs_root *root, u64 num_bytes) { struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; - if (global_rsv->full || global_rsv == block_rsv || + if (global_rsv == block_rsv || block_rsv->space_info != global_rsv->space_info) global_rsv = NULL; block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv, @@ -5916,24 +5888,16 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, { struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_node *ref; - struct rb_node *node; int ret = 0; delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); head = btrfs_find_delayed_ref_head(trans, bytenr); if (!head) - goto out; + goto out_delayed_unlock; - node = rb_prev(&head->node.rb_node); - if (!node) - goto out; - - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - - /* there are still entries for this ref, we can't drop it */ - if (ref->bytenr == bytenr) + spin_lock(&head->lock); + if (rb_first(&head->ref_root)) goto out; if (head->extent_op) { @@ -5955,19 +5919,19 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, * ahead and process it. */ head->node.in_tree = 0; - rb_erase(&head->node.rb_node, &delayed_refs->root); + rb_erase(&head->href_node, &delayed_refs->href_root); - delayed_refs->num_entries--; + atomic_dec(&delayed_refs->num_entries); /* * we don't take a ref on the node because we're removing it from the * tree, so we just steal the ref the tree was holding. 
*/ delayed_refs->num_heads--; - if (list_empty(&head->cluster)) + if (head->processing == 0) delayed_refs->num_heads_ready--; - - list_del_init(&head->cluster); + head->processing = 0; + spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); BUG_ON(head->extent_op); @@ -5978,6 +5942,9 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, btrfs_put_delayed_ref(&head->node); return ret; out: + spin_unlock(&head->lock); + +out_delayed_unlock: spin_unlock(&delayed_refs->lock); return 0; } @@ -6145,11 +6112,29 @@ int __get_raid_index(u64 flags) return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ } -static int get_block_group_index(struct btrfs_block_group_cache *cache) +int get_block_group_index(struct btrfs_block_group_cache *cache) { return __get_raid_index(cache->flags); } +static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = { + [BTRFS_RAID_RAID10] = "raid10", + [BTRFS_RAID_RAID1] = "raid1", + [BTRFS_RAID_DUP] = "dup", + [BTRFS_RAID_RAID0] = "raid0", + [BTRFS_RAID_SINGLE] = "single", + [BTRFS_RAID_RAID5] = "raid5", + [BTRFS_RAID_RAID6] = "raid6", +}; + +static const char *get_raid_name(enum btrfs_raid_types type) +{ + if (type >= BTRFS_NR_RAID_TYPES) + return NULL; + + return btrfs_raid_type_names[type]; +} + enum btrfs_loop_type { LOOP_CACHING_NOWAIT = 0, LOOP_CACHING_WAIT = 1, @@ -6177,7 +6162,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, struct btrfs_root *root = orig_root->fs_info->extent_root; struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group_cache *block_group = NULL; - struct btrfs_block_group_cache *used_block_group; u64 search_start = 0; u64 max_extent_size = 0; int empty_cluster = 2 * 1024 * 1024; @@ -6186,7 +6170,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, int index = __get_raid_index(flags); int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ? 
RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; - bool found_uncached_bg = false; bool failed_cluster_refill = false; bool failed_alloc = false; bool use_cluster = true; @@ -6239,7 +6222,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, if (search_start == hint_byte) { block_group = btrfs_lookup_block_group(root->fs_info, search_start); - used_block_group = block_group; /* * we don't want to use the block group if it doesn't match our * allocation bits, or if its not cached. @@ -6276,7 +6258,6 @@ search: u64 offset; int cached; - used_block_group = block_group; btrfs_get_block_group(block_group); search_start = block_group->key.objectid; @@ -6304,7 +6285,6 @@ search: have_block_group: cached = block_group_cache_done(block_group); if (unlikely(!cached)) { - found_uncached_bg = true; ret = cache_block_group(block_group, 0); BUG_ON(ret < 0); ret = 0; @@ -6320,6 +6300,7 @@ have_block_group: * lets look there */ if (last_ptr) { + struct btrfs_block_group_cache *used_block_group; unsigned long aligned_cluster; /* * the refill lock keeps out other @@ -6330,10 +6311,8 @@ have_block_group: if (used_block_group != block_group && (!used_block_group || used_block_group->ro || - !block_group_bits(used_block_group, flags))) { - used_block_group = block_group; + !block_group_bits(used_block_group, flags))) goto refill_cluster; - } if (used_block_group != block_group) btrfs_get_block_group(used_block_group); @@ -6347,17 +6326,19 @@ have_block_group: /* we have a block, we're done */ spin_unlock(&last_ptr->refill_lock); trace_btrfs_reserve_extent_cluster(root, - block_group, search_start, num_bytes); + used_block_group, + search_start, num_bytes); + if (used_block_group != block_group) { + btrfs_put_block_group(block_group); + block_group = used_block_group; + } goto checks; } WARN_ON(last_ptr->block_group != used_block_group); - if (used_block_group != block_group) { + if (used_block_group != block_group) btrfs_put_block_group(used_block_group); - used_block_group 
= block_group; - } refill_cluster: - BUG_ON(used_block_group != block_group); /* If we are on LOOP_NO_EMPTY_SIZE, we can't * set up a new clusters, so lets just skip it * and let the allocator find whatever block @@ -6476,25 +6457,25 @@ unclustered_alloc: goto loop; } checks: - search_start = stripe_align(root, used_block_group, + search_start = stripe_align(root, block_group, offset, num_bytes); /* move on to the next group */ if (search_start + num_bytes > - used_block_group->key.objectid + used_block_group->key.offset) { - btrfs_add_free_space(used_block_group, offset, num_bytes); + block_group->key.objectid + block_group->key.offset) { + btrfs_add_free_space(block_group, offset, num_bytes); goto loop; } if (offset < search_start) - btrfs_add_free_space(used_block_group, offset, + btrfs_add_free_space(block_group, offset, search_start - offset); BUG_ON(offset > search_start); - ret = btrfs_update_reserved_bytes(used_block_group, num_bytes, + ret = btrfs_update_reserved_bytes(block_group, num_bytes, alloc_type); if (ret == -EAGAIN) { - btrfs_add_free_space(used_block_group, offset, num_bytes); + btrfs_add_free_space(block_group, offset, num_bytes); goto loop; } @@ -6504,16 +6485,12 @@ checks: trace_btrfs_reserve_extent(orig_root, block_group, search_start, num_bytes); - if (used_block_group != block_group) - btrfs_put_block_group(used_block_group); btrfs_put_block_group(block_group); break; loop: failed_cluster_refill = false; failed_alloc = false; BUG_ON(index != get_block_group_index(block_group)); - if (used_block_group != block_group) - btrfs_put_block_group(used_block_group); btrfs_put_block_group(block_group); } up_read(&space_info->groups_sem); @@ -6584,12 +6561,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int index = 0; spin_lock(&info->lock); - printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n", + printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n", info->flags, info->total_bytes - 
info->bytes_used - info->bytes_pinned - info->bytes_reserved - info->bytes_readonly, (info->full) ? "" : "not "); - printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, " + printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, " "reserved=%llu, may_use=%llu, readonly=%llu\n", info->total_bytes, info->bytes_used, info->bytes_pinned, info->bytes_reserved, info->bytes_may_use, @@ -6603,7 +6580,9 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, again: list_for_each_entry(cache, &info->block_groups[index], list) { spin_lock(&cache->lock); - printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n", + printk(KERN_INFO "BTRFS: " + "block group %llu has %llu bytes, " + "%llu used %llu pinned %llu reserved %s\n", cache->key.objectid, cache->key.offset, btrfs_block_group_used(&cache->item), cache->pinned, cache->reserved, cache->ro ? "[readonly]" : ""); @@ -6966,7 +6945,7 @@ again: /*DEFAULT_RATELIMIT_BURST*/ 1); if (__ratelimit(&_rs)) WARN(1, KERN_DEBUG - "btrfs: block rsv returned %d\n", ret); + "BTRFS: block rsv returned %d\n", ret); } try_reserve: ret = reserve_metadata_bytes(root, block_rsv, blocksize, @@ -7714,7 +7693,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, btrfs_end_transaction_throttle(trans, tree_root); if (!for_reloc && btrfs_need_cleaner_sleep(root)) { - pr_debug("btrfs: drop snapshot early exit\n"); + pr_debug("BTRFS: drop snapshot early exit\n"); err = -EAGAIN; goto out_free; } @@ -7779,7 +7758,7 @@ out: */ if (!for_reloc && root_dropped == false) btrfs_add_dead_root(root); - if (err) + if (err && err != -EAGAIN) btrfs_std_error(root->fs_info, err); return err; } @@ -8333,6 +8312,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) release_global_block_rsv(info); while (!list_empty(&info->space_info)) { + int i; + space_info = list_entry(info->space_info.next, struct btrfs_space_info, list); @@ -8343,9 +8324,17 @@ int btrfs_free_block_groups(struct 
btrfs_fs_info *info) dump_space_info(space_info, 0, 0); } } - percpu_counter_destroy(&space_info->total_bytes_pinned); list_del(&space_info->list); - kfree(space_info); + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + struct kobject *kobj; + kobj = &space_info->block_group_kobjs[i]; + if (kobj->parent) { + kobject_del(kobj); + kobject_put(kobj); + } + } + kobject_del(&space_info->kobj); + kobject_put(&space_info->kobj); } return 0; } @@ -8356,10 +8345,57 @@ static void __link_block_group(struct btrfs_space_info *space_info, int index = get_block_group_index(cache); down_write(&space_info->groups_sem); + if (list_empty(&space_info->block_groups[index])) { + struct kobject *kobj = &space_info->block_group_kobjs[index]; + int ret; + + kobject_get(&space_info->kobj); /* put in release */ + ret = kobject_add(kobj, &space_info->kobj, "%s", + get_raid_name(index)); + if (ret) { + pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n"); + kobject_put(&space_info->kobj); + } + } list_add_tail(&cache->list, &space_info->block_groups[index]); up_write(&space_info->groups_sem); } +static struct btrfs_block_group_cache * +btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) +{ + struct btrfs_block_group_cache *cache; + + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) + return NULL; + + cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), + GFP_NOFS); + if (!cache->free_space_ctl) { + kfree(cache); + return NULL; + } + + cache->key.objectid = start; + cache->key.offset = size; + cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + + cache->sectorsize = root->sectorsize; + cache->fs_info = root->fs_info; + cache->full_stripe_len = btrfs_full_stripe_len(root, + &root->fs_info->mapping_tree, + start); + atomic_set(&cache->count, 1); + spin_lock_init(&cache->lock); + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + INIT_LIST_HEAD(&cache->new_bg_list); + btrfs_init_free_space_ctl(cache); + + return cache; +} + 
int btrfs_read_block_groups(struct btrfs_root *root) { struct btrfs_path *path; @@ -8395,26 +8431,16 @@ int btrfs_read_block_groups(struct btrfs_root *root) break; if (ret != 0) goto error; + leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - cache = kzalloc(sizeof(*cache), GFP_NOFS); + + cache = btrfs_create_block_group_cache(root, found_key.objectid, + found_key.offset); if (!cache) { ret = -ENOMEM; goto error; } - cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), - GFP_NOFS); - if (!cache->free_space_ctl) { - kfree(cache); - ret = -ENOMEM; - goto error; - } - - atomic_set(&cache->count, 1); - spin_lock_init(&cache->lock); - cache->fs_info = info; - INIT_LIST_HEAD(&cache->list); - INIT_LIST_HEAD(&cache->cluster_list); if (need_clear) { /* @@ -8435,16 +8461,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) read_extent_buffer(leaf, &cache->item, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(cache->item)); - memcpy(&cache->key, &found_key, sizeof(found_key)); + cache->flags = btrfs_block_group_flags(&cache->item); key.objectid = found_key.objectid + found_key.offset; btrfs_release_path(path); - cache->flags = btrfs_block_group_flags(&cache->item); - cache->sectorsize = root->sectorsize; - cache->full_stripe_len = btrfs_full_stripe_len(root, - &root->fs_info->mapping_tree, - found_key.objectid); - btrfs_init_free_space_ctl(cache); /* * We need to exclude the super stripes now so that the space @@ -8458,8 +8478,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) * case. 
*/ free_excluded_extents(root, cache); - kfree(cache->free_space_ctl); - kfree(cache); + btrfs_put_block_group(cache); goto error; } @@ -8590,38 +8609,15 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, root->fs_info->last_trans_log_full_commit = trans->transid; - cache = kzalloc(sizeof(*cache), GFP_NOFS); + cache = btrfs_create_block_group_cache(root, chunk_offset, size); if (!cache) return -ENOMEM; - cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), - GFP_NOFS); - if (!cache->free_space_ctl) { - kfree(cache); - return -ENOMEM; - } - - cache->key.objectid = chunk_offset; - cache->key.offset = size; - cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - cache->sectorsize = root->sectorsize; - cache->fs_info = root->fs_info; - cache->full_stripe_len = btrfs_full_stripe_len(root, - &root->fs_info->mapping_tree, - chunk_offset); - - atomic_set(&cache->count, 1); - spin_lock_init(&cache->lock); - INIT_LIST_HEAD(&cache->list); - INIT_LIST_HEAD(&cache->cluster_list); - INIT_LIST_HEAD(&cache->new_bg_list); - - btrfs_init_free_space_ctl(cache); btrfs_set_block_group_used(&cache->item, bytes_used); btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); - cache->flags = type; btrfs_set_block_group_flags(&cache->item, type); + cache->flags = type; cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; ret = exclude_super_stripes(root, cache); @@ -8631,8 +8627,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, * case. 
*/ free_excluded_extents(root, cache); - kfree(cache->free_space_ctl); - kfree(cache); + btrfs_put_block_group(cache); return ret; } @@ -8796,8 +8791,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * are still on the list after taking the semaphore */ list_del_init(&block_group->list); - if (list_empty(&block_group->space_info->block_groups[index])) + if (list_empty(&block_group->space_info->block_groups[index])) { + kobject_del(&block_group->space_info->block_group_kobjs[index]); + kobject_put(&block_group->space_info->block_group_kobjs[index]); clear_avail_alloc_bits(root->fs_info, block_group->flags); + } up_write(&block_group->space_info->groups_sem); if (block_group->cached == BTRFS_CACHE_STARTED) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ff43802a7c88..85bbd01f1271 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -59,7 +59,7 @@ void btrfs_leak_debug_check(void) while (!list_empty(&states)) { state = list_entry(states.next, struct extent_state, leak_list); - printk(KERN_ERR "btrfs state leak: start %llu end %llu " + printk(KERN_ERR "BTRFS: state leak: start %llu end %llu " "state %lu in tree %p refs %d\n", state->start, state->end, state->state, state->tree, atomic_read(&state->refs)); @@ -69,7 +69,7 @@ void btrfs_leak_debug_check(void) while (!list_empty(&buffers)) { eb = list_entry(buffers.next, struct extent_buffer, leak_list); - printk(KERN_ERR "btrfs buffer leak start %llu len %lu " + printk(KERN_ERR "BTRFS: buffer leak start %llu len %lu " "refs %d\n", eb->start, eb->len, atomic_read(&eb->refs)); list_del(&eb->leak_list); @@ -77,16 +77,22 @@ void btrfs_leak_debug_check(void) } } -#define btrfs_debug_check_extent_io_range(inode, start, end) \ - __btrfs_debug_check_extent_io_range(__func__, (inode), (start), (end)) +#define btrfs_debug_check_extent_io_range(tree, start, end) \ + __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end)) static inline void 
__btrfs_debug_check_extent_io_range(const char *caller, - struct inode *inode, u64 start, u64 end) + struct extent_io_tree *tree, u64 start, u64 end) { - u64 isize = i_size_read(inode); + struct inode *inode; + u64 isize; + + if (!tree->mapping) + return; + inode = tree->mapping->host; + isize = i_size_read(inode); if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { printk_ratelimited(KERN_DEBUG - "btrfs: %s: ino %llu isize %llu odd range [%llu,%llu]\n", + "BTRFS: %s: ino %llu isize %llu odd range [%llu,%llu]\n", caller, btrfs_ino(inode), isize, start, end); } } @@ -124,6 +130,8 @@ static noinline void flush_write_bio(void *data); static inline struct btrfs_fs_info * tree_fs_info(struct extent_io_tree *tree) { + if (!tree->mapping) + return NULL; return btrfs_sb(tree->mapping->host->i_sb); } @@ -186,11 +194,9 @@ void extent_io_tree_init(struct extent_io_tree *tree, struct address_space *mapping) { tree->state = RB_ROOT; - INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC); tree->ops = NULL; tree->dirty_bytes = 0; spin_lock_init(&tree->lock); - spin_lock_init(&tree->buffer_lock); tree->mapping = mapping; } @@ -224,12 +230,20 @@ void free_extent_state(struct extent_state *state) } static struct rb_node *tree_insert(struct rb_root *root, u64 offset, - struct rb_node *node) + struct rb_node *node, + struct rb_node ***p_in, + struct rb_node **parent_in) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; struct tree_entry *entry; + if (p_in && parent_in) { + p = *p_in; + parent = *parent_in; + goto do_insert; + } + while (*p) { parent = *p; entry = rb_entry(parent, struct tree_entry, rb_node); @@ -242,35 +256,43 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset, return parent; } +do_insert: rb_link_node(node, parent, p); rb_insert_color(node, root); return NULL; } static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, - struct rb_node **prev_ret, - struct rb_node **next_ret) + struct rb_node **prev_ret, + 
struct rb_node **next_ret, + struct rb_node ***p_ret, + struct rb_node **parent_ret) { struct rb_root *root = &tree->state; - struct rb_node *n = root->rb_node; + struct rb_node **n = &root->rb_node; struct rb_node *prev = NULL; struct rb_node *orig_prev = NULL; struct tree_entry *entry; struct tree_entry *prev_entry = NULL; - while (n) { - entry = rb_entry(n, struct tree_entry, rb_node); - prev = n; + while (*n) { + prev = *n; + entry = rb_entry(prev, struct tree_entry, rb_node); prev_entry = entry; if (offset < entry->start) - n = n->rb_left; + n = &(*n)->rb_left; else if (offset > entry->end) - n = n->rb_right; + n = &(*n)->rb_right; else - return n; + return *n; } + if (p_ret) + *p_ret = n; + if (parent_ret) + *parent_ret = prev; + if (prev_ret) { orig_prev = prev; while (prev && offset > prev_entry->end) { @@ -292,18 +314,27 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, return NULL; } -static inline struct rb_node *tree_search(struct extent_io_tree *tree, - u64 offset) +static inline struct rb_node * +tree_search_for_insert(struct extent_io_tree *tree, + u64 offset, + struct rb_node ***p_ret, + struct rb_node **parent_ret) { struct rb_node *prev = NULL; struct rb_node *ret; - ret = __etree_search(tree, offset, &prev, NULL); + ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret); if (!ret) return prev; return ret; } +static inline struct rb_node *tree_search(struct extent_io_tree *tree, + u64 offset) +{ + return tree_search_for_insert(tree, offset, NULL, NULL); +} + static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, struct extent_state *other) { @@ -385,23 +416,25 @@ static void set_state_bits(struct extent_io_tree *tree, */ static int insert_state(struct extent_io_tree *tree, struct extent_state *state, u64 start, u64 end, + struct rb_node ***p, + struct rb_node **parent, unsigned long *bits) { struct rb_node *node; if (end < start) - WARN(1, KERN_ERR "btrfs end < start %llu %llu\n", + 
WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n", end, start); state->start = start; state->end = end; set_state_bits(tree, state, bits); - node = tree_insert(&tree->state, end, &state->rb_node); + node = tree_insert(&tree->state, end, &state->rb_node, p, parent); if (node) { struct extent_state *found; found = rb_entry(node, struct extent_state, rb_node); - printk(KERN_ERR "btrfs found node %llu %llu on insert of " + printk(KERN_ERR "BTRFS: found node %llu %llu on insert of " "%llu %llu\n", found->start, found->end, start, end); return -EEXIST; @@ -444,7 +477,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, prealloc->state = orig->state; orig->start = split; - node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); + node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node, + NULL, NULL); if (node) { free_extent_state(prealloc); return -EEXIST; @@ -542,7 +576,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int err; int clear = 0; - btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); + btrfs_debug_check_extent_io_range(tree, start, end); if (bits & EXTENT_DELALLOC) bits |= EXTENT_NORESERVE; @@ -702,7 +736,7 @@ static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state *state; struct rb_node *node; - btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); + btrfs_debug_check_extent_io_range(tree, start, end); spin_lock(&tree->lock); again: @@ -783,11 +817,13 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state *state; struct extent_state *prealloc = NULL; struct rb_node *node; + struct rb_node **p; + struct rb_node *parent; int err = 0; u64 last_start; u64 last_end; - btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); + btrfs_debug_check_extent_io_range(tree, start, end); bits |= EXTENT_FIRST_DELALLOC; again: @@ -809,14 +845,16 @@ again: * this search will find all 
the extents that end after * our range starts. */ - node = tree_search(tree, start); + node = tree_search_for_insert(tree, start, &p, &parent); if (!node) { prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); - err = insert_state(tree, prealloc, start, end, &bits); + err = insert_state(tree, prealloc, start, end, + &p, &parent, &bits); if (err) extent_io_tree_panic(tree, err); + cache_state(prealloc, cached_state); prealloc = NULL; goto out; } @@ -919,7 +957,7 @@ hit_next: * the later extent. */ err = insert_state(tree, prealloc, start, this_end, - &bits); + NULL, NULL, &bits); if (err) extent_io_tree_panic(tree, err); @@ -1005,11 +1043,13 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state *state; struct extent_state *prealloc = NULL; struct rb_node *node; + struct rb_node **p; + struct rb_node *parent; int err = 0; u64 last_start; u64 last_end; - btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); + btrfs_debug_check_extent_io_range(tree, start, end); again: if (!prealloc && (mask & __GFP_WAIT)) { @@ -1032,17 +1072,19 @@ again: * this search will find all the extents that end after * our range starts. */ - node = tree_search(tree, start); + node = tree_search_for_insert(tree, start, &p, &parent); if (!node) { prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) { err = -ENOMEM; goto out; } - err = insert_state(tree, prealloc, start, end, &bits); - prealloc = NULL; + err = insert_state(tree, prealloc, start, end, + &p, &parent, &bits); if (err) extent_io_tree_panic(tree, err); + cache_state(prealloc, cached_state); + prealloc = NULL; goto out; } state = rb_entry(node, struct extent_state, rb_node); @@ -1135,7 +1177,7 @@ hit_next: * the later extent. 
*/ err = insert_state(tree, prealloc, start, this_end, - &bits); + NULL, NULL, &bits); if (err) extent_io_tree_panic(tree, err); cache_state(prealloc, cached_state); @@ -1984,7 +2026,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, bio = btrfs_io_bio_alloc(GFP_NOFS, 1); if (!bio) return -EIO; - bio->bi_size = 0; + bio->bi_iter.bi_size = 0; map_length = length; ret = btrfs_map_block(fs_info, WRITE, logical, @@ -1995,7 +2037,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, } BUG_ON(mirror_num != bbio->mirror_num); sector = bbio->stripes[mirror_num-1].physical >> 9; - bio->bi_sector = sector; + bio->bi_iter.bi_sector = sector; dev = bbio->stripes[mirror_num-1].dev; kfree(bbio); if (!dev || !dev->bdev || !dev->writeable) { @@ -2012,9 +2054,10 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, return -EIO; } - printk_ratelimited_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu " - "(dev %s sector %llu)\n", page->mapping->host->i_ino, - start, rcu_str_deref(dev->name), sector); + printk_ratelimited_in_rcu(KERN_INFO + "BTRFS: read error corrected: ino %lu off %llu " + "(dev %s sector %llu)\n", page->mapping->host->i_ino, + start, rcu_str_deref(dev->name), sector); bio_put(bio); return 0; @@ -2156,7 +2199,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, return -EIO; } - if (em->start > start || em->start + em->len < start) { + if (em->start > start || em->start + em->len <= start) { free_extent_map(em); em = NULL; } @@ -2268,9 +2311,9 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, return -EIO; } bio->bi_end_io = failed_bio->bi_end_io; - bio->bi_sector = failrec->logical >> 9; + bio->bi_iter.bi_sector = failrec->logical >> 9; bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; - bio->bi_size = 0; + bio->bi_iter.bi_size = 0; btrfs_failed_bio = btrfs_io_bio(failed_bio); if (btrfs_failed_bio->csum) { @@ -2332,37 +2375,39 @@ int 
end_extent_writepage(struct page *page, int err, u64 start, u64 end) */ static void end_bio_extent_writepage(struct bio *bio, int err) { - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - struct extent_io_tree *tree; + struct bio_vec *bvec; u64 start; u64 end; + int i; - do { + bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; - tree = &BTRFS_I(page->mapping->host)->io_tree; /* We always issue full-page reads, but if some block * in a page fails to read, blk_update_request() will * advance bv_offset and adjust bv_len to compensate. * Print a warning for nonzero offsets, and an error * if they don't add up to a full page. */ - if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) - printk("%s page write in btrfs with offset %u and length %u\n", - bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE - ? KERN_ERR "partial" : KERN_INFO "incomplete", - bvec->bv_offset, bvec->bv_len); + if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) { + if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE) + btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info, + "partial page write in btrfs with offset %u and length %u", + bvec->bv_offset, bvec->bv_len); + else + btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info, + "incomplete page write in btrfs with offset %u and " + "length %u", + bvec->bv_offset, bvec->bv_len); + } start = page_offset(page); end = start + bvec->bv_offset + bvec->bv_len - 1; - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - if (end_extent_writepage(page, err, start, end)) continue; end_page_writeback(page); - } while (bvec >= bio->bi_io_vec); + } bio_put(bio); } @@ -2392,9 +2437,8 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, */ static void end_bio_extent_readpage(struct bio *bio, int err) { + struct bio_vec *bvec; int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; - struct bio_vec *bvec = 
bio->bi_io_vec; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); struct extent_io_tree *tree; u64 offset = 0; @@ -2405,16 +2449,17 @@ static void end_bio_extent_readpage(struct bio *bio, int err) u64 extent_len = 0; int mirror; int ret; + int i; if (err) uptodate = 0; - do { + bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " - "mirror=%lu\n", (u64)bio->bi_sector, err, + "mirror=%lu\n", (u64)bio->bi_iter.bi_sector, err, io_bio->mirror_num); tree = &BTRFS_I(inode)->io_tree; @@ -2423,19 +2468,22 @@ static void end_bio_extent_readpage(struct bio *bio, int err) * advance bv_offset and adjust bv_len to compensate. * Print a warning for nonzero offsets, and an error * if they don't add up to a full page. */ - if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) - printk("%s page read in btrfs with offset %u and length %u\n", - bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE - ? 
KERN_ERR "partial" : KERN_INFO "incomplete", - bvec->bv_offset, bvec->bv_len); + if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) { + if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE) + btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info, + "partial page read in btrfs with offset %u and length %u", + bvec->bv_offset, bvec->bv_len); + else + btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info, + "incomplete page read in btrfs with offset %u and " + "length %u", + bvec->bv_offset, bvec->bv_len); + } start = page_offset(page); end = start + bvec->bv_offset + bvec->bv_len - 1; len = bvec->bv_len; - if (++bvec <= bvec_end) - prefetchw(&bvec->bv_page->flags); - mirror = io_bio->mirror_num; if (likely(uptodate && tree->ops && tree->ops->readpage_end_io_hook)) { @@ -2516,7 +2564,7 @@ readpage_ok: extent_start = start; extent_len = end + 1 - start; } - } while (bvec <= bvec_end); + } if (extent_len) endio_readpage_release_extent(tree, extent_start, extent_len, @@ -2547,9 +2595,8 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, } if (bio) { - bio->bi_size = 0; bio->bi_bdev = bdev; - bio->bi_sector = first_sector; + bio->bi_iter.bi_sector = first_sector; btrfs_bio = btrfs_io_bio(bio); btrfs_bio->csum = NULL; btrfs_bio->csum_allocated = NULL; @@ -2643,7 +2690,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, if (bio_ret && *bio_ret) { bio = *bio_ret; if (old_compressed) - contig = bio->bi_sector == sector; + contig = bio->bi_iter.bi_sector == sector; else contig = bio_end_sector(bio) == sector; @@ -3287,8 +3334,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, set_range_writeback(tree, cur, cur + iosize - 1); if (!PageWriteback(page)) { - printk(KERN_ERR "btrfs warning page %lu not " - "writeback, cur %llu end %llu\n", + btrfs_err(BTRFS_I(inode)->root->fs_info, + "page %lu not writeback, cur %llu end %llu", page->index, cur, end); } @@ -3410,20 +3457,18 @@ static void 
end_extent_buffer_writeback(struct extent_buffer *eb) static void end_bio_extent_buffer_writepage(struct bio *bio, int err) { - int uptodate = err == 0; - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bvec; struct extent_buffer *eb; - int done; + int i, done; - do { + bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; - bvec--; eb = (struct extent_buffer *)page->private; BUG_ON(!eb); done = atomic_dec_and_test(&eb->io_pages); - if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { + if (err || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); ClearPageUptodate(page); SetPageError(page); @@ -3435,10 +3480,9 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err) continue; end_extent_buffer_writeback(eb); - } while (bvec >= bio->bi_io_vec); + } bio_put(bio); - } static int write_one_eb(struct extent_buffer *eb, @@ -3447,6 +3491,7 @@ static int write_one_eb(struct extent_buffer *eb, struct extent_page_data *epd) { struct block_device *bdev = fs_info->fs_devices->latest_bdev; + struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree; u64 offset = eb->start; unsigned long i, num_pages; unsigned long bio_flags = 0; @@ -3464,7 +3509,7 @@ static int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); - ret = submit_extent_page(rw, eb->tree, p, offset >> 9, + ret = submit_extent_page(rw, tree, p, offset >> 9, PAGE_CACHE_SIZE, 0, bdev, &epd->bio, -1, end_bio_extent_buffer_writepage, 0, epd->bio_flags, bio_flags); @@ -4082,12 +4127,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct extent_map *em = NULL; struct extent_state *cached_state = NULL; struct btrfs_path *path; - struct btrfs_file_extent_item *item; int end = 0; u64 em_start = 0; u64 em_len = 0; u64 em_end = 0; - unsigned long emflags; if (len == 0) return -EINVAL; @@ -4112,8 +4155,6 @@ int extent_fiemap(struct 
inode *inode, struct fiemap_extent_info *fieinfo, } WARN_ON(!ret); path->slots[0]--; - item = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); found_type = btrfs_key_type(&found_key); @@ -4181,7 +4222,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, offset_in_extent = em_start - em->start; em_end = extent_map_end(em); em_len = em_end - em_start; - emflags = em->flags; disko = 0; flags = 0; @@ -4333,10 +4373,9 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) __free_extent_buffer(eb); } -static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, - u64 start, - unsigned long len, - gfp_t mask) +static struct extent_buffer * +__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, + unsigned long len, gfp_t mask) { struct extent_buffer *eb = NULL; @@ -4345,7 +4384,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, return NULL; eb->start = start; eb->len = len; - eb->tree = tree; + eb->fs_info = fs_info; eb->bflags = 0; rwlock_init(&eb->lock); atomic_set(&eb->write_locks, 0); @@ -4477,13 +4516,14 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb) } } -struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, - u64 start) +struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) { struct extent_buffer *eb; rcu_read_lock(); - eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); + eb = radix_tree_lookup(&fs_info->buffer_radix, + start >> PAGE_CACHE_SHIFT); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); mark_extent_buffer_accessed(eb); @@ -4494,7 +4534,7 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, return NULL; } -struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, +struct extent_buffer *alloc_extent_buffer(struct 
btrfs_fs_info *fs_info, u64 start, unsigned long len) { unsigned long num_pages = num_extent_pages(start, len); @@ -4503,16 +4543,15 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, struct extent_buffer *eb; struct extent_buffer *exists = NULL; struct page *p; - struct address_space *mapping = tree->mapping; + struct address_space *mapping = fs_info->btree_inode->i_mapping; int uptodate = 1; int ret; - - eb = find_extent_buffer(tree, start); + eb = find_extent_buffer(fs_info, start); if (eb) return eb; - eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS); + eb = __alloc_extent_buffer(fs_info, start, len, GFP_NOFS); if (!eb) return NULL; @@ -4567,12 +4606,13 @@ again: if (ret) goto free_eb; - spin_lock(&tree->buffer_lock); - ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb); - spin_unlock(&tree->buffer_lock); + spin_lock(&fs_info->buffer_lock); + ret = radix_tree_insert(&fs_info->buffer_radix, + start >> PAGE_CACHE_SHIFT, eb); + spin_unlock(&fs_info->buffer_lock); radix_tree_preload_end(); if (ret == -EEXIST) { - exists = find_extent_buffer(tree, start); + exists = find_extent_buffer(fs_info, start); if (exists) goto free_eb; else @@ -4580,6 +4620,7 @@ again: } /* add one reference for the tree */ check_buffer_tree_ref(eb); + set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); /* * there is a race where release page may have @@ -4623,17 +4664,17 @@ static int release_extent_buffer(struct extent_buffer *eb) { WARN_ON(atomic_read(&eb->refs) == 0); if (atomic_dec_and_test(&eb->refs)) { - if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) { - spin_unlock(&eb->refs_lock); - } else { - struct extent_io_tree *tree = eb->tree; + if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) { + struct btrfs_fs_info *fs_info = eb->fs_info; spin_unlock(&eb->refs_lock); - spin_lock(&tree->buffer_lock); - radix_tree_delete(&tree->buffer, + spin_lock(&fs_info->buffer_lock); + radix_tree_delete(&fs_info->buffer_radix, eb->start >> 
PAGE_CACHE_SHIFT); - spin_unlock(&tree->buffer_lock); + spin_unlock(&fs_info->buffer_lock); + } else { + spin_unlock(&eb->refs_lock); } /* Should be safe to release our pages at this point */ @@ -5112,12 +5153,12 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_i; if (src_offset + len > dst->len) { - printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " + printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move " "len %lu dst len %lu\n", src_offset, len, dst->len); BUG_ON(1); } if (dst_offset + len > dst->len) { - printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " + printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move " "len %lu dst len %lu\n", dst_offset, len, dst->len); BUG_ON(1); } @@ -5159,12 +5200,12 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_i; if (src_offset + len > dst->len) { - printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " + printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move " "len %lu len %lu\n", src_offset, len, dst->len); BUG_ON(1); } if (dst_offset + len > dst->len) { - printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " + printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move " "len %lu len %lu\n", dst_offset, len, dst->len); BUG_ON(1); } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 19620c58f096..58b27e5ab521 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -43,6 +43,7 @@ #define EXTENT_BUFFER_WRITEBACK 7 #define EXTENT_BUFFER_IOERR 8 #define EXTENT_BUFFER_DUMMY 9 +#define EXTENT_BUFFER_IN_TREE 10 /* these are flags for extent_clear_unlock_delalloc */ #define PAGE_UNLOCK (1 << 0) @@ -94,12 +95,10 @@ struct extent_io_ops { struct extent_io_tree { struct rb_root state; - struct radix_tree_root buffer; struct address_space *mapping; u64 dirty_bytes; int track_uptodate; spinlock_t lock; - spinlock_t buffer_lock; struct extent_io_ops *ops; }; @@ -130,7 +129,7 @@ struct 
extent_buffer { unsigned long map_start; unsigned long map_len; unsigned long bflags; - struct extent_io_tree *tree; + struct btrfs_fs_info *fs_info; spinlock_t refs_lock; atomic_t refs; atomic_t io_pages; @@ -266,11 +265,11 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); void set_page_extent_mapped(struct page *page); -struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, +struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, unsigned long len); struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len); struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src); -struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, +struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); void free_extent_buffer(struct extent_buffer *eb); void free_extent_buffer_stale(struct extent_buffer *eb); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index a4a7a1a8da95..996ad56b57db 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -79,12 +79,21 @@ void free_extent_map(struct extent_map *em) } } -static struct rb_node *tree_insert(struct rb_root *root, u64 offset, - struct rb_node *node) +/* simple helper to do math around the end of an extent, handling wrap */ +static u64 range_end(u64 start, u64 len) +{ + if (start + len < start) + return (u64)-1; + return start + len; +} + +static int tree_insert(struct rb_root *root, struct extent_map *em) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; - struct extent_map *entry; + struct extent_map *entry = NULL; + struct rb_node *orig_parent = NULL; + u64 end = range_end(em->start, em->len); while (*p) { parent = *p; @@ -92,19 +101,37 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset, WARN_ON(!entry->in_tree); - if (offset < entry->start) + if 
(em->start < entry->start) p = &(*p)->rb_left; - else if (offset >= extent_map_end(entry)) + else if (em->start >= extent_map_end(entry)) p = &(*p)->rb_right; else - return parent; + return -EEXIST; } - entry = rb_entry(node, struct extent_map, rb_node); - entry->in_tree = 1; - rb_link_node(node, parent, p); - rb_insert_color(node, root); - return NULL; + orig_parent = parent; + while (parent && em->start >= extent_map_end(entry)) { + parent = rb_next(parent); + entry = rb_entry(parent, struct extent_map, rb_node); + } + if (parent) + if (end > entry->start && em->start < extent_map_end(entry)) + return -EEXIST; + + parent = orig_parent; + entry = rb_entry(parent, struct extent_map, rb_node); + while (parent && em->start < entry->start) { + parent = rb_prev(parent); + entry = rb_entry(parent, struct extent_map, rb_node); + } + if (parent) + if (end > entry->start && em->start < extent_map_end(entry)) + return -EEXIST; + + em->in_tree = 1; + rb_link_node(&em->rb_node, orig_parent, p); + rb_insert_color(&em->rb_node, root); + return 0; } /* @@ -228,7 +255,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) merge = rb_entry(rb, struct extent_map, rb_node); if (rb && mergable_maps(em, merge)) { em->len += merge->len; - em->block_len += merge->len; + em->block_len += merge->block_len; rb_erase(&merge->rb_node, &tree->map); merge->in_tree = 0; em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; @@ -310,20 +337,11 @@ int add_extent_mapping(struct extent_map_tree *tree, struct extent_map *em, int modified) { int ret = 0; - struct rb_node *rb; - struct extent_map *exist; - exist = lookup_extent_mapping(tree, em->start, em->len); - if (exist) { - free_extent_map(exist); - ret = -EEXIST; - goto out; - } - rb = tree_insert(&tree->map, em->start, &em->rb_node); - if (rb) { - ret = -EEXIST; + ret = tree_insert(&tree->map, em); + if (ret) goto out; - } + atomic_inc(&em->refs); em->mod_start = em->start; @@ -337,14 +355,6 @@ out: 
return ret; } -/* simple helper to do math around the end of an extent, handling wrap */ -static u64 range_end(u64 start, u64 len) -{ - if (start + len < start) - return (u64)-1; - return start + len; -} - static struct extent_map * __lookup_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len, int strict) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 6f3848860283..127555b29f58 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -182,7 +182,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, if (!path) return -ENOMEM; - nblocks = bio->bi_size >> inode->i_sb->s_blocksize_bits; + nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits; if (!dst) { if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { btrfs_bio->csum_allocated = kmalloc(nblocks * csum_size, @@ -201,7 +201,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, csum = (u8 *)dst; } - if (bio->bi_size > PAGE_CACHE_SIZE * 8) + if (bio->bi_iter.bi_size > PAGE_CACHE_SIZE * 8) path->reada = 2; WARN_ON(bio->bi_vcnt <= 0); @@ -217,7 +217,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, path->skip_locking = 1; } - disk_bytenr = (u64)bio->bi_sector << 9; + disk_bytenr = (u64)bio->bi_iter.bi_sector << 9; if (dio) offset = logical_offset; while (bio_index < bio->bi_vcnt) { @@ -246,8 +246,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, offset + bvec->bv_len - 1, EXTENT_NODATASUM, GFP_NOFS); } else { - printk(KERN_INFO "btrfs no csum found " - "for inode %llu start %llu\n", + btrfs_info(BTRFS_I(inode)->root->fs_info, + "no csum found for inode %llu start %llu", btrfs_ino(inode), offset); } item = NULL; @@ -302,7 +302,7 @@ int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, struct btrfs_dio_private *dip, struct bio *bio, u64 offset) { - int len = (bio->bi_sector << 9) - dip->disk_bytenr; + int len = (bio->bi_iter.bi_sector << 9) - dip->disk_bytenr; u16 csum_size = 
btrfs_super_csum_size(root->fs_info->super_copy); int ret; @@ -447,11 +447,12 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, u64 offset; WARN_ON(bio->bi_vcnt <= 0); - sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS); + sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_iter.bi_size), + GFP_NOFS); if (!sums) return -ENOMEM; - sums->len = bio->bi_size; + sums->len = bio->bi_iter.bi_size; INIT_LIST_HEAD(&sums->list); if (contig) @@ -461,7 +462,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, ordered = btrfs_lookup_ordered_extent(inode, offset); BUG_ON(!ordered); /* Logic error */ - sums->bytenr = (u64)bio->bi_sector << 9; + sums->bytenr = (u64)bio->bi_iter.bi_sector << 9; index = 0; while (bio_index < bio->bi_vcnt) { @@ -476,7 +477,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, btrfs_add_ordered_sum(inode, ordered, sums); btrfs_put_ordered_extent(ordered); - bytes_left = bio->bi_size - total_bytes; + bytes_left = bio->bi_iter.bi_size - total_bytes; sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), GFP_NOFS); @@ -484,7 +485,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, sums->len = bytes_left; ordered = btrfs_lookup_ordered_extent(inode, offset); BUG_ON(!ordered); /* Logic error */ - sums->bytenr = ((u64)bio->bi_sector << 9) + + sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9) + total_bytes; index = 0; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 82d0342763c5..0165b8672f09 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -692,7 +692,10 @@ next: int __btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, u64 start, u64 end, - u64 *drop_end, int drop_cache) + u64 *drop_end, int drop_cache, + int replace_extent, + u32 extent_item_size, + int *key_inserted) { struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; @@ -712,6 +715,7 @@ int 
__btrfs_drop_extents(struct btrfs_trans_handle *trans, int modify_tree = -1; int update_refs = (root->ref_cows || root == root->fs_info->tree_root); int found = 0; + int leafs_visited = 0; if (drop_cache) btrfs_drop_extent_cache(inode, start, end - 1, 0); @@ -733,6 +737,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, path->slots[0]--; } ret = 0; + leafs_visited++; next_slot: leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { @@ -744,6 +749,7 @@ next_slot: ret = 0; break; } + leafs_visited++; leaf = path->nodes[0]; recow = 1; } @@ -766,7 +772,8 @@ next_slot: btrfs_file_extent_num_bytes(leaf, fi); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { extent_end = key.offset + - btrfs_file_extent_inline_len(leaf, fi); + btrfs_file_extent_inline_len(leaf, + path->slots[0], fi); } else { WARN_ON(1); extent_end = search_start; @@ -927,14 +934,44 @@ next_slot: } if (!ret && del_nr > 0) { + /* + * Set path->slots[0] to first slot, so that after the delete + * if items are move off from our leaf to its immediate left or + * right neighbor leafs, we end up with a correct and adjusted + * path->slots[0] for our insertion. 
+ */ + path->slots[0] = del_slot; ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret) btrfs_abort_transaction(trans, root, ret); + + leaf = path->nodes[0]; + /* + * leaf eb has flag EXTENT_BUFFER_STALE if it was deleted (that + * is, its contents got pushed to its neighbors), in which case + * it means path->locks[0] == 0 + */ + if (!ret && replace_extent && leafs_visited == 1 && + path->locks[0] && + btrfs_leaf_free_space(root, leaf) >= + sizeof(struct btrfs_item) + extent_item_size) { + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + setup_items_for_insert(root, path, &key, + &extent_item_size, + extent_item_size, + sizeof(struct btrfs_item) + + extent_item_size, 1); + *key_inserted = 1; + } } + if (!replace_extent || !(*key_inserted)) + btrfs_release_path(path); if (drop_end) *drop_end = found ? min(end, extent_end) : end; - btrfs_release_path(path); return ret; } @@ -949,7 +986,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL, - drop_cache); + drop_cache, 0, 0, NULL); btrfs_free_path(path); return ret; } @@ -1235,29 +1272,18 @@ static int prepare_uptodate_page(struct page *page, u64 pos, } /* - * this gets pages into the page cache and locks them down, it also properly - * waits for data=ordered extents to finish before allowing the pages to be - * modified. + * this just gets pages into the page cache and locks them down. 
*/ -static noinline int prepare_pages(struct btrfs_root *root, struct file *file, - struct page **pages, size_t num_pages, - loff_t pos, unsigned long first_index, - size_t write_bytes, bool force_uptodate) +static noinline int prepare_pages(struct inode *inode, struct page **pages, + size_t num_pages, loff_t pos, + size_t write_bytes, bool force_uptodate) { - struct extent_state *cached_state = NULL; int i; unsigned long index = pos >> PAGE_CACHE_SHIFT; - struct inode *inode = file_inode(file); gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int err = 0; - int faili = 0; - u64 start_pos; - u64 last_pos; - - start_pos = pos & ~((u64)root->sectorsize - 1); - last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; + int faili; -again: for (i = 0; i < num_pages; i++) { pages[i] = find_or_create_page(inode->i_mapping, index + i, mask | __GFP_WRITE); @@ -1280,57 +1306,85 @@ again: } wait_on_page_writeback(pages[i]); } - faili = num_pages - 1; - err = 0; + + return 0; +fail: + while (faili >= 0) { + unlock_page(pages[faili]); + page_cache_release(pages[faili]); + faili--; + } + return err; + +} + +/* + * This function locks the extent and properly waits for data=ordered extents + * to finish before allowing the pages to be modified if need. 
+ * + * The return value: + * 1 - the extent is locked + * 0 - the extent is not locked, and everything is OK + * -EAGAIN - need re-prepare the pages + * the other < 0 number - Something wrong happens + */ +static noinline int +lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages, + size_t num_pages, loff_t pos, + u64 *lockstart, u64 *lockend, + struct extent_state **cached_state) +{ + u64 start_pos; + u64 last_pos; + int i; + int ret = 0; + + start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1); + last_pos = start_pos + ((u64)num_pages << PAGE_CACHE_SHIFT) - 1; + if (start_pos < inode->i_size) { struct btrfs_ordered_extent *ordered; lock_extent_bits(&BTRFS_I(inode)->io_tree, - start_pos, last_pos - 1, 0, &cached_state); - ordered = btrfs_lookup_first_ordered_extent(inode, - last_pos - 1); + start_pos, last_pos, 0, cached_state); + ordered = btrfs_lookup_first_ordered_extent(inode, last_pos); if (ordered && ordered->file_offset + ordered->len > start_pos && - ordered->file_offset < last_pos) { + ordered->file_offset <= last_pos) { btrfs_put_ordered_extent(ordered); unlock_extent_cached(&BTRFS_I(inode)->io_tree, - start_pos, last_pos - 1, - &cached_state, GFP_NOFS); + start_pos, last_pos, + cached_state, GFP_NOFS); for (i = 0; i < num_pages; i++) { unlock_page(pages[i]); page_cache_release(pages[i]); } - err = btrfs_wait_ordered_range(inode, start_pos, - last_pos - start_pos); - if (err) - goto fail; - goto again; + ret = btrfs_wait_ordered_range(inode, start_pos, + last_pos - start_pos + 1); + if (ret) + return ret; + else + return -EAGAIN; } if (ordered) btrfs_put_ordered_extent(ordered); clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, - last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | + last_pos, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, &cached_state, GFP_NOFS); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - start_pos, last_pos - 1, &cached_state, - GFP_NOFS); + 0, 0, cached_state, GFP_NOFS); + *lockstart 
= start_pos; + *lockend = last_pos; + ret = 1; } + for (i = 0; i < num_pages; i++) { if (clear_page_dirty_for_io(pages[i])) account_page_redirty(pages[i]); set_page_extent_mapped(pages[i]); WARN_ON(!PageLocked(pages[i])); } - return 0; -fail: - while (faili >= 0) { - unlock_page(pages[faili]); - page_cache_release(pages[faili]); - faili--; - } - return err; + return ret; } static noinline int check_can_nocow(struct inode *inode, loff_t pos, @@ -1381,13 +1435,17 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct page **pages = NULL; + struct extent_state *cached_state = NULL; u64 release_bytes = 0; + u64 lockstart; + u64 lockend; unsigned long first_index; size_t num_written = 0; int nrptrs; int ret = 0; bool only_release_metadata = false; bool force_page_uptodate = false; + bool need_unlock; nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / @@ -1456,18 +1514,31 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, } release_bytes = reserve_bytes; - + need_unlock = false; +again: /* * This is going to setup the pages array with the number of * pages we want, so we don't really need to worry about the * contents of pages from loop to loop */ - ret = prepare_pages(root, file, pages, num_pages, - pos, first_index, write_bytes, + ret = prepare_pages(inode, pages, num_pages, + pos, write_bytes, force_page_uptodate); if (ret) break; + ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages, + pos, &lockstart, &lockend, + &cached_state); + if (ret < 0) { + if (ret == -EAGAIN) + goto again; + break; + } else if (ret > 0) { + need_unlock = true; + ret = 0; + } + copied = btrfs_copy_from_user(pos, num_pages, write_bytes, pages, i); @@ -1512,19 +1583,21 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, } release_bytes = dirty_pages << PAGE_CACHE_SHIFT; - if (copied > 0) { + + if 
(copied > 0) ret = btrfs_dirty_pages(root, inode, pages, dirty_pages, pos, copied, NULL); - if (ret) { - btrfs_drop_pages(pages, num_pages); - break; - } + if (need_unlock) + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + lockstart, lockend, &cached_state, + GFP_NOFS); + if (ret) { + btrfs_drop_pages(pages, num_pages); + break; } release_bytes = 0; - btrfs_drop_pages(pages, num_pages); - if (only_release_metadata && copied > 0) { u64 lockstart = round_down(pos, root->sectorsize); u64 lockend = lockstart + @@ -1536,6 +1609,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, only_release_metadata = false; } + btrfs_drop_pages(pages, num_pages); + cond_resched(); balance_dirty_pages_ratelimited(inode->i_mapping); @@ -1857,12 +1932,24 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (file->private_data) btrfs_ioctl_trans_end(file); + /* + * We use start here because we will need to wait on the IO to complete + * in btrfs_sync_log, which could require joining a transaction (for + * example checking cross references in the nocow path). If we use join + * here we could get into a situation where we're waiting on IO to + * happen that is blocked on a transaction trying to commit. With start + * we inc the extwriter counter, so we wait for all extwriters to exit + * before we start blocking join'ers. This comment is to keep somebody + * from thinking they are super smart and changing this to + * btrfs_join_transaction *cough*Josef*cough*. 
+ */ trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); mutex_unlock(&inode->i_mutex); goto out; } + trans->sync = true; ret = btrfs_log_dentry_safe(trans, root, dentry); if (ret < 0) { @@ -1963,11 +2050,13 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_key key; int ret; + if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) + goto out; + key.objectid = btrfs_ino(inode); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = offset; - ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) return ret; @@ -2064,8 +2153,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) u64 drop_end; int ret = 0; int err = 0; + int rsv_count; bool same_page = ((offset >> PAGE_CACHE_SHIFT) == ((offset + len - 1) >> PAGE_CACHE_SHIFT)); + bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); ret = btrfs_wait_ordered_range(inode, offset, len); if (ret) @@ -2125,7 +2216,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) * we need to try again. */ if ((!ordered || - (ordered->file_offset + ordered->len < lockstart || + (ordered->file_offset + ordered->len <= lockstart || ordered->file_offset > lockend)) && !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, EXTENT_UPTODATE, 0, @@ -2163,9 +2254,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) /* * 1 - update the inode * 1 - removing the extents in the range - * 1 - adding the hole extent + * 1 - adding the hole extent if no_holes isn't set */ - trans = btrfs_start_transaction(root, 3); + rsv_count = no_holes ? 
2 : 3; + trans = btrfs_start_transaction(root, rsv_count); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out_free; @@ -2179,7 +2271,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) while (cur_offset < lockend) { ret = __btrfs_drop_extents(trans, root, inode, path, cur_offset, lockend + 1, - &drop_end, 1); + &drop_end, 1, 0, 0, NULL); if (ret != -ENOSPC) break; @@ -2202,7 +2294,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); - trans = btrfs_start_transaction(root, 3); + trans = btrfs_start_transaction(root, rsv_count); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 057be95b1e1e..73f3de7a083c 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -347,8 +347,8 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, btrfs_readpage(NULL, page); lock_page(page); if (!PageUptodate(page)) { - printk(KERN_ERR "btrfs: error reading free " - "space cache\n"); + btrfs_err(BTRFS_I(inode)->root->fs_info, + "error reading free space cache"); io_ctl_drop_pages(io_ctl); return -EIO; } @@ -405,7 +405,7 @@ static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) gen = io_ctl->cur; if (le64_to_cpu(*gen) != generation) { - printk_ratelimited(KERN_ERR "btrfs: space cache generation " + printk_ratelimited(KERN_ERR "BTRFS: space cache generation " "(%Lu) does not match inode (%Lu)\n", *gen, generation); io_ctl_unmap_page(io_ctl); @@ -463,7 +463,7 @@ static int io_ctl_check_crc(struct io_ctl *io_ctl, int index) PAGE_CACHE_SIZE - offset); btrfs_csum_final(crc, (char *)&crc); if (val != crc) { - printk_ratelimited(KERN_ERR "btrfs: csum mismatch on free " + printk_ratelimited(KERN_ERR "BTRFS: csum mismatch on free " "space cache\n"); io_ctl_unmap_page(io_ctl); return -EIO; @@ -1902,7 +1902,7 @@ out: 
spin_unlock(&ctl->tree_lock); if (ret) { - printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret); + printk(KERN_CRIT "BTRFS: unable to add free space :%d\n", ret); ASSERT(ret != -EEXIST); } @@ -2011,14 +2011,15 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, info = rb_entry(n, struct btrfs_free_space, offset_index); if (info->bytes >= bytes && !block_group->ro) count++; - printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n", - info->offset, info->bytes, + btrfs_crit(block_group->fs_info, + "entry offset %llu, bytes %llu, bitmap %s", + info->offset, info->bytes, (info->bitmap) ? "yes" : "no"); } - printk(KERN_INFO "block group has cluster?: %s\n", + btrfs_info(block_group->fs_info, "block group has cluster?: %s", list_empty(&block_group->cluster_list) ? "no" : "yes"); - printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" - "\n", count); + btrfs_info(block_group->fs_info, + "%d blocks of free space at or bigger than bytes is", count); } void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group) @@ -2421,7 +2422,6 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_space *entry = NULL; struct btrfs_free_space *last; struct rb_node *node; - u64 window_start; u64 window_free; u64 max_extent; u64 total_size = 0; @@ -2443,7 +2443,6 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, entry = rb_entry(node, struct btrfs_free_space, offset_index); } - window_start = entry->offset; window_free = entry->bytes; max_extent = entry->bytes; first = entry; diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c new file mode 100644 index 000000000000..85889aa82c62 --- /dev/null +++ b/fs/btrfs/hash.c @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free 
Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <crypto/hash.h> +#include <linux/err.h> +#include "hash.h" + +static struct crypto_shash *tfm; + +int __init btrfs_hash_init(void) +{ + tfm = crypto_alloc_shash("crc32c", 0, 0); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + return 0; +} + +void btrfs_hash_exit(void) +{ + crypto_free_shash(tfm); +} + +u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[crypto_shash_descsize(tfm)]; + } desc; + int err; + + desc.shash.tfm = tfm; + desc.shash.flags = 0; + *(u32 *)desc.ctx = crc; + + err = crypto_shash_update(&desc.shash, address, length); + BUG_ON(err); + + return *(u32 *)desc.ctx; +} diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h index 1d982812ab67..118a2316e5d3 100644 --- a/fs/btrfs/hash.h +++ b/fs/btrfs/hash.h @@ -19,10 +19,15 @@ #ifndef __HASH__ #define __HASH__ -#include <linux/crc32c.h> +int __init btrfs_hash_init(void); + +void btrfs_hash_exit(void); + +u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length); + static inline u64 btrfs_name_hash(const char *name, int len) { - return crc32c((u32)~1, name, len); + return btrfs_crc32c((u32)~1, name, len); } /* @@ -31,7 +36,7 @@ static inline u64 btrfs_name_hash(const char *name, int len) static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, int len) { - return (u64) crc32c(parent_objectid, name, len); + return (u64) btrfs_crc32c(parent_objectid, name, len); } #endif diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index ec82fae07097..2be38df703c9 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -91,32 +91,6 @@ int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid, return 0; } 
-static struct btrfs_inode_ref * -btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, int ins_len, - int cow) -{ - int ret; - struct btrfs_key key; - struct btrfs_inode_ref *ref; - - key.objectid = inode_objectid; - key.type = BTRFS_INODE_REF_KEY; - key.offset = ref_objectid; - - ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); - if (ret < 0) - return ERR_PTR(ret); - if (ret > 0) - return NULL; - if (!find_name_in_backref(path, name, name_len, &ref)) - return NULL; - return ref; -} - /* Returns NULL if no extref found */ struct btrfs_inode_extref * btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, @@ -144,45 +118,6 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, return extref; } -int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, int mod, - u64 *ret_index) -{ - struct btrfs_inode_ref *ref; - struct btrfs_inode_extref *extref; - int ins_len = mod < 0 ? 
-1 : 0; - int cow = mod != 0; - - ref = btrfs_lookup_inode_ref(trans, root, path, name, name_len, - inode_objectid, ref_objectid, ins_len, - cow); - if (IS_ERR(ref)) - return PTR_ERR(ref); - - if (ref != NULL) { - *ret_index = btrfs_inode_ref_index(path->nodes[0], ref); - return 0; - } - - btrfs_release_path(path); - - extref = btrfs_lookup_inode_extref(trans, root, path, name, - name_len, inode_objectid, - ref_objectid, ins_len, cow); - if (IS_ERR(extref)) - return PTR_ERR(extref); - - if (extref) { - *ret_index = btrfs_inode_extref_index(path->nodes[0], extref); - return 0; - } - - return -ENOENT; -} - static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 514b291b1354..d3d44486290b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -58,9 +58,10 @@ #include "inode-map.h" #include "backref.h" #include "hash.h" +#include "props.h" struct btrfs_iget_args { - u64 ino; + struct btrfs_key *location; struct btrfs_root *root; }; @@ -125,13 +126,12 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, * no overlapping inline items exist in the btree */ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, + struct btrfs_path *path, int extent_inserted, struct btrfs_root *root, struct inode *inode, u64 start, size_t size, size_t compressed_size, int compress_type, struct page **compressed_pages) { - struct btrfs_key key; - struct btrfs_path *path; struct extent_buffer *leaf; struct page *page = NULL; char *kaddr; @@ -140,29 +140,29 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, int err = 0; int ret; size_t cur_size = size; - size_t datasize; unsigned long offset; if (compressed_size && compressed_pages) cur_size = compressed_size; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + inode_add_bytes(inode, size); - path->leave_spinning = 1; + if (!extent_inserted) { 
+ struct btrfs_key key; + size_t datasize; - key.objectid = btrfs_ino(inode); - key.offset = start; - btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); - datasize = btrfs_file_extent_calc_inline_size(cur_size); + key.objectid = btrfs_ino(inode); + key.offset = start; + btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); - inode_add_bytes(inode, size); - ret = btrfs_insert_empty_item(trans, root, path, &key, - datasize); - if (ret) { - err = ret; - goto fail; + datasize = btrfs_file_extent_calc_inline_size(cur_size); + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, root, path, &key, + datasize); + if (ret) { + err = ret; + goto fail; + } } leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], @@ -203,7 +203,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, page_cache_release(page); } btrfs_mark_buffer_dirty(leaf); - btrfs_free_path(path); + btrfs_release_path(path); /* * we're an inline extent, so nobody can @@ -219,7 +219,6 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, return ret; fail: - btrfs_free_path(path); return err; } @@ -242,6 +241,9 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, u64 aligned_end = ALIGN(end, root->sectorsize); u64 data_len = inline_len; int ret; + struct btrfs_path *path; + int extent_inserted = 0; + u32 extent_item_size; if (compressed_size) data_len = compressed_size; @@ -256,12 +258,27 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, return 1; } + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) + if (IS_ERR(trans)) { + btrfs_free_path(path); return PTR_ERR(trans); + } trans->block_rsv = &root->fs_info->delalloc_block_rsv; - ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1); + if (compressed_size && compressed_pages) + extent_item_size = btrfs_file_extent_calc_inline_size( + compressed_size); + else + extent_item_size = 
btrfs_file_extent_calc_inline_size( + inline_len); + + ret = __btrfs_drop_extents(trans, root, inode, path, + start, aligned_end, NULL, + 1, 1, extent_item_size, &extent_inserted); if (ret) { btrfs_abort_transaction(trans, root, ret); goto out; @@ -269,7 +286,8 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, if (isize > actual_end) inline_len = min_t(u64, isize, actual_end); - ret = insert_inline_extent(trans, root, inode, start, + ret = insert_inline_extent(trans, path, extent_inserted, + root, inode, start, inline_len, compressed_size, compress_type, compressed_pages); if (ret && ret != -ENOSPC) { @@ -284,6 +302,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, btrfs_delalloc_release_metadata(inode, end + 1 - start); btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); out: + btrfs_free_path(path); btrfs_end_transaction(trans, root); return ret; } @@ -1262,7 +1281,8 @@ next_slot: nocow = 1; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { extent_end = found_key.offset + - btrfs_file_extent_inline_len(leaf, fi); + btrfs_file_extent_inline_len(leaf, + path->slots[0], fi); extent_end = ALIGN(extent_end, root->sectorsize); } else { BUG_ON(1); @@ -1577,7 +1597,7 @@ int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, unsigned long bio_flags) { struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - u64 logical = (u64)bio->bi_sector << 9; + u64 logical = (u64)bio->bi_iter.bi_sector << 9; u64 length = 0; u64 map_length; int ret; @@ -1585,7 +1605,7 @@ int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, if (bio_flags & EXTENT_BIO_COMPRESSED) return 0; - length = bio->bi_size; + length = bio->bi_iter.bi_size; map_length = length; ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, NULL, 0); @@ -1841,14 +1861,13 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key 
ins; + int extent_inserted = 0; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->leave_spinning = 1; - /* * we may be replacing one extent in the tree with another. * The new extent is pinned in the extent map, and we don't want @@ -1858,17 +1877,23 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, * the caller is expected to unpin it and allow it to be merged * with the others. */ - ret = btrfs_drop_extents(trans, root, inode, file_pos, - file_pos + num_bytes, 0); + ret = __btrfs_drop_extents(trans, root, inode, path, file_pos, + file_pos + num_bytes, NULL, 0, + 1, sizeof(*fi), &extent_inserted); if (ret) goto out; - ins.objectid = btrfs_ino(inode); - ins.offset = file_pos; - ins.type = BTRFS_EXTENT_DATA_KEY; - ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi)); - if (ret) - goto out; + if (!extent_inserted) { + ins.objectid = btrfs_ino(inode); + ins.offset = file_pos; + ins.type = BTRFS_EXTENT_DATA_KEY; + + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, root, path, &ins, + sizeof(*fi)); + if (ret) + goto out; + } leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -2290,7 +2315,7 @@ again: u64 extent_len; struct btrfs_key found_key; - ret = btrfs_search_slot(trans, root, &key, path, 1, 1); + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) goto out_free_path; @@ -2543,12 +2568,6 @@ out_kfree: return NULL; } -/* - * helper function for btrfs_finish_ordered_io, this - * just reads in some of the csum leaves to prime them into ram - * before we start the transaction. It limits the amount of btree - * reads required while inside the transaction. - */ /* as ordered data IO finishes, this gets called so we can finish * an ordered extent if the range of bytes in the file it covers are * fully written. 
@@ -2610,7 +2629,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) EXTENT_DEFRAG, 1, cached_state); if (ret) { u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item); - if (last_snapshot >= BTRFS_I(inode)->generation) + if (0 && last_snapshot >= BTRFS_I(inode)->generation) /* the inode is shared */ new = record_old_file_extents(inode, ordered_extent); @@ -3248,7 +3267,8 @@ out: * slot is the slot the inode is in, objectid is the objectid of the inode */ static noinline int acls_after_inode_item(struct extent_buffer *leaf, - int slot, u64 objectid) + int slot, u64 objectid, + int *first_xattr_slot) { u32 nritems = btrfs_header_nritems(leaf); struct btrfs_key found_key; @@ -3264,6 +3284,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, } slot++; + *first_xattr_slot = -1; while (slot < nritems) { btrfs_item_key_to_cpu(leaf, &found_key, slot); @@ -3273,6 +3294,8 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, /* we found an xattr, assume we've got an acl */ if (found_key.type == BTRFS_XATTR_ITEM_KEY) { + if (*first_xattr_slot == -1) + *first_xattr_slot = slot; if (found_key.offset == xattr_access || found_key.offset == xattr_default) return 1; @@ -3301,6 +3324,8 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, * something larger than an xattr. 
We have to assume the inode * has acls */ + if (*first_xattr_slot == -1) + *first_xattr_slot = slot; return 1; } @@ -3315,10 +3340,12 @@ static void btrfs_read_locked_inode(struct inode *inode) struct btrfs_timespec *tspec; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key location; + unsigned long ptr; int maybe_acls; u32 rdev; int ret; bool filled = false; + int first_xattr_slot; ret = btrfs_fill_inode(inode, &rdev); if (!ret) @@ -3328,7 +3355,6 @@ static void btrfs_read_locked_inode(struct inode *inode) if (!path) goto make_bad; - path->leave_spinning = 1; memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); @@ -3338,7 +3364,7 @@ static void btrfs_read_locked_inode(struct inode *inode) leaf = path->nodes[0]; if (filled) - goto cache_acl; + goto cache_index; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); @@ -3381,18 +3407,51 @@ static void btrfs_read_locked_inode(struct inode *inode) BTRFS_I(inode)->index_cnt = (u64)-1; BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); + +cache_index: + path->slots[0]++; + if (inode->i_nlink != 1 || + path->slots[0] >= btrfs_header_nritems(leaf)) + goto cache_acl; + + btrfs_item_key_to_cpu(leaf, &location, path->slots[0]); + if (location.objectid != btrfs_ino(inode)) + goto cache_acl; + + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + if (location.type == BTRFS_INODE_REF_KEY) { + struct btrfs_inode_ref *ref; + + ref = (struct btrfs_inode_ref *)ptr; + BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref); + } else if (location.type == BTRFS_INODE_EXTREF_KEY) { + struct btrfs_inode_extref *extref; + + extref = (struct btrfs_inode_extref *)ptr; + BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf, + extref); + } cache_acl: /* * try to precache a NULL acl entry for files that don't have * any xattrs or acls */ maybe_acls = acls_after_inode_item(leaf, path->slots[0], - btrfs_ino(inode)); + 
btrfs_ino(inode), &first_xattr_slot); + if (first_xattr_slot != -1) { + path->slots[0] = first_xattr_slot; + ret = btrfs_load_inode_props(inode, path); + if (ret) + btrfs_err(root->fs_info, + "error loading props for ino %llu (root %llu): %d\n", + btrfs_ino(inode), + root->root_key.objectid, ret); + } + btrfs_free_path(path); + if (!maybe_acls) cache_no_acl(inode); - btrfs_free_path(path); - switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &btrfs_aops; @@ -3496,7 +3555,6 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, goto failed; } - btrfs_unlock_up_safe(path, 1); leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); @@ -3593,6 +3651,24 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, goto err; btrfs_release_path(path); + /* + * If we don't have dir index, we have to get it by looking up + * the inode ref, since we get the inode ref, remove it directly, + * it is unnecessary to do delayed deletion. + * + * But if we have dir index, needn't search inode ref to get it. + * Since the inode ref is close to the inode item, it is better + * that we delay to delete it, and just do this deletion when + * we update the inode item. 
+ */ + if (BTRFS_I(inode)->dir_index) { + ret = btrfs_delayed_delete_inode_ref(inode); + if (!ret) { + index = BTRFS_I(inode)->dir_index; + goto skip_backref; + } + } + ret = btrfs_del_inode_ref(trans, root, name, name_len, ino, dir_ino, &index); if (ret) { @@ -3602,7 +3678,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, root, ret); goto err; } - +skip_backref: ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); if (ret) { btrfs_abort_transaction(trans, root, ret); @@ -3948,7 +4024,7 @@ search_again: btrfs_file_extent_num_bytes(leaf, fi); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { item_end += btrfs_file_extent_inline_len(leaf, - fi); + path->slots[0], fi); } item_end--; } @@ -4018,6 +4094,12 @@ search_again: inode_sub_bytes(inode, item_end + 1 - new_size); } + + /* + * update the ram bytes to properly reflect + * the new size of our item + */ + btrfs_set_file_extent_ram_bytes(leaf, fi, size); size = btrfs_file_extent_calc_inline_size(size); btrfs_truncate_item(root, path, size, 1); @@ -4203,6 +4285,49 @@ out: return ret; } +static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, + u64 offset, u64 len) +{ + struct btrfs_trans_handle *trans; + int ret; + + /* + * Still need to make sure the inode looks like it's been updated so + * that any holes get logged if we fsync. + */ + if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) { + BTRFS_I(inode)->last_trans = root->fs_info->generation; + BTRFS_I(inode)->last_sub_trans = root->log_transid; + BTRFS_I(inode)->last_log_commit = root->last_log_commit; + return 0; + } + + /* + * 1 - for the one we're dropping + * 1 - for the one we're adding + * 1 - for updating the inode. 
+ */ + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + btrfs_end_transaction(trans, root); + return ret; + } + + ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset, + 0, 0, len, 0, len, 0, 0, 0); + if (ret) + btrfs_abort_transaction(trans, root, ret); + else + btrfs_update_inode(trans, root, inode); + btrfs_end_transaction(trans, root); + return ret; +} + /* * This function puts in dummy file extents for the area we're creating a hole * for. So if we are truncating this file to a larger size we need to insert @@ -4211,7 +4336,6 @@ out: */ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) { - struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_map *em = NULL; @@ -4266,31 +4390,10 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) struct extent_map *hole_em; hole_size = last_byte - cur_offset; - trans = btrfs_start_transaction(root, 3); - if (IS_ERR(trans)) { - err = PTR_ERR(trans); - break; - } - - err = btrfs_drop_extents(trans, root, inode, - cur_offset, - cur_offset + hole_size, 1); - if (err) { - btrfs_abort_transaction(trans, root, err); - btrfs_end_transaction(trans, root); - break; - } - - err = btrfs_insert_file_extent(trans, root, - btrfs_ino(inode), cur_offset, 0, - 0, hole_size, 0, hole_size, - 0, 0, 0); - if (err) { - btrfs_abort_transaction(trans, root, err); - btrfs_end_transaction(trans, root); + err = maybe_insert_hole(root, inode, cur_offset, + hole_size); + if (err) break; - } - btrfs_drop_extent_cache(inode, cur_offset, cur_offset + hole_size - 1, 0); hole_em = alloc_extent_map(); @@ -4309,7 +4412,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) hole_em->ram_bytes = hole_size; 
hole_em->bdev = root->fs_info->fs_devices->latest_bdev; hole_em->compress_type = BTRFS_COMPRESS_NONE; - hole_em->generation = trans->transid; + hole_em->generation = root->fs_info->generation; while (1) { write_lock(&em_tree->lock); @@ -4322,17 +4425,14 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) hole_size - 1, 0); } free_extent_map(hole_em); -next: - btrfs_update_inode(trans, root, inode); - btrfs_end_transaction(trans, root); } +next: free_extent_map(em); em = NULL; cur_offset = last_byte; if (cur_offset >= block_end) break; } - free_extent_map(em); unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, GFP_NOFS); @@ -4474,6 +4574,64 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) return err; } +/* + * While truncating the inode pages during eviction, we get the VFS calling + * btrfs_invalidatepage() against each page of the inode. This is slow because + * the calls to btrfs_invalidatepage() result in a huge amount of calls to + * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting + * extent_state structures over and over, wasting lots of time. + * + * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all + * those expensive operations on a per page basis and do only the ordered io + * finishing, while we release here the extent_map and extent_state structures, + * without the excessive merging and splitting. 
+ */ +static void evict_inode_truncate_pages(struct inode *inode) +{ + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree; + struct rb_node *node; + + ASSERT(inode->i_state & I_FREEING); + truncate_inode_pages(&inode->i_data, 0); + + write_lock(&map_tree->lock); + while (!RB_EMPTY_ROOT(&map_tree->map)) { + struct extent_map *em; + + node = rb_first(&map_tree->map); + em = rb_entry(node, struct extent_map, rb_node); + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + clear_bit(EXTENT_FLAG_LOGGING, &em->flags); + remove_extent_mapping(map_tree, em); + free_extent_map(em); + } + write_unlock(&map_tree->lock); + + spin_lock(&io_tree->lock); + while (!RB_EMPTY_ROOT(&io_tree->state)) { + struct extent_state *state; + struct extent_state *cached_state = NULL; + + node = rb_first(&io_tree->state); + state = rb_entry(node, struct extent_state, rb_node); + atomic_inc(&state->refs); + spin_unlock(&io_tree->lock); + + lock_extent_bits(io_tree, state->start, state->end, + 0, &cached_state); + clear_extent_bit(io_tree, state->start, state->end, + EXTENT_LOCKED | EXTENT_DIRTY | + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 1, 1, + &cached_state, GFP_NOFS); + free_extent_state(state); + + spin_lock(&io_tree->lock); + } + spin_unlock(&io_tree->lock); +} + void btrfs_evict_inode(struct inode *inode) { struct btrfs_trans_handle *trans; @@ -4484,7 +4642,8 @@ void btrfs_evict_inode(struct inode *inode) trace_btrfs_inode_evict(inode); - truncate_inode_pages(&inode->i_data, 0); + evict_inode_truncate_pages(inode); + if (inode->i_nlink && ((btrfs_root_refs(&root->root_item) != 0 && root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) || @@ -4659,9 +4818,9 @@ static int fixup_tree_root_location(struct btrfs_root *root, } err = -ENOENT; - ret = btrfs_find_root_ref(root->fs_info->tree_root, path, - BTRFS_I(dir)->root->root_key.objectid, - location->objectid); + ret = btrfs_find_item(root->fs_info->tree_root, 
path, + BTRFS_I(dir)->root->root_key.objectid, + location->objectid, BTRFS_ROOT_REF_KEY, NULL); if (ret) { if (ret < 0) err = ret; @@ -4822,7 +4981,9 @@ again: static int btrfs_init_locked_inode(struct inode *inode, void *p) { struct btrfs_iget_args *args = p; - inode->i_ino = args->ino; + inode->i_ino = args->location->objectid; + memcpy(&BTRFS_I(inode)->location, args->location, + sizeof(*args->location)); BTRFS_I(inode)->root = args->root; return 0; } @@ -4830,19 +4991,19 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p) static int btrfs_find_actor(struct inode *inode, void *opaque) { struct btrfs_iget_args *args = opaque; - return args->ino == btrfs_ino(inode) && + return args->location->objectid == BTRFS_I(inode)->location.objectid && args->root == BTRFS_I(inode)->root; } static struct inode *btrfs_iget_locked(struct super_block *s, - u64 objectid, + struct btrfs_key *location, struct btrfs_root *root) { struct inode *inode; struct btrfs_iget_args args; - unsigned long hashval = btrfs_inode_hash(objectid, root); + unsigned long hashval = btrfs_inode_hash(location->objectid, root); - args.ino = objectid; + args.location = location; args.root = root; inode = iget5_locked(s, hashval, btrfs_find_actor, @@ -4859,13 +5020,11 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, { struct inode *inode; - inode = btrfs_iget_locked(s, location->objectid, root); + inode = btrfs_iget_locked(s, location, root); if (!inode) return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { - BTRFS_I(inode)->root = root; - memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); btrfs_read_locked_inode(inode); if (!is_bad_inode(inode)) { inode_tree_add(inode); @@ -4921,7 +5080,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return ERR_PTR(ret); if (location.objectid == 0) - return NULL; + return ERR_PTR(-ENOENT); if (location.type == BTRFS_INODE_ITEM_KEY) { inode = btrfs_iget(dir->i_sb, &location, root, 
NULL); @@ -4985,10 +5144,17 @@ static void btrfs_dentry_release(struct dentry *dentry) static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - struct dentry *ret; + struct inode *inode; - ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry); - return ret; + inode = btrfs_lookup_dentry(dir, dentry); + if (IS_ERR(inode)) { + if (PTR_ERR(inode) == -ENOENT) + inode = NULL; + else + return ERR_CAST(inode); + } + + return d_materialise_unique(dentry, inode); } unsigned char btrfs_filetype_table[] = { @@ -5358,7 +5524,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, u32 sizes[2]; unsigned long ptr; int ret; - int owner; path = btrfs_alloc_path(); if (!path) @@ -5392,6 +5557,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, * number */ BTRFS_I(inode)->index_cnt = 2; + BTRFS_I(inode)->dir_index = *index; BTRFS_I(inode)->root = root; BTRFS_I(inode)->generation = trans->transid; inode->i_generation = BTRFS_I(inode)->generation; @@ -5404,11 +5570,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, */ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); - if (S_ISDIR(mode)) - owner = 0; - else - owner = 1; - key[0].objectid = objectid; btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); key[0].offset = 0; @@ -5473,6 +5634,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_update_root_times(trans, root); + ret = btrfs_inode_inherit_props(trans, inode, dir); + if (ret) + btrfs_err(root->fs_info, + "error inheriting props for ino %llu (root %llu): %d", + btrfs_ino(inode), root->root_key.objectid, ret); + return inode; fail: if (dir) @@ -5741,6 +5908,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, goto fail; } + /* There are several dir indexes for this inode, clear the cache. 
*/ + BTRFS_I(inode)->dir_index = 0ULL; inc_nlink(inode); inode_inc_iversion(inode); inode->i_ctime = CURRENT_TIME; @@ -6004,7 +6173,7 @@ again: btrfs_file_extent_num_bytes(leaf, item); } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { size_t size; - size = btrfs_file_extent_inline_len(leaf, item); + size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); extent_end = ALIGN(extent_start + size, root->sectorsize); } next: @@ -6073,7 +6242,7 @@ next: goto out; } - size = btrfs_file_extent_inline_len(leaf, item); + size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); extent_offset = page_offset(page) + pg_offset - extent_start; copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, size - extent_offset); @@ -6390,6 +6559,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, int slot; int found_type; bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW); + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -6433,6 +6603,10 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, if (!nocow && found_type == BTRFS_FILE_EXTENT_REG) goto out; + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + if (extent_end <= offset) + goto out; + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); if (disk_bytenr == 0) goto out; @@ -6450,8 +6624,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); } - extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); - if (btrfs_extent_readonly(root, disk_bytenr)) goto out; btrfs_release_path(path); @@ -6783,17 +6955,16 @@ unlock_err: static void btrfs_endio_direct_read(struct bio *bio, int err) { struct btrfs_dio_private *dip = bio->bi_private; - struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; - struct bio_vec *bvec = bio->bi_io_vec; + struct bio_vec *bvec; struct inode *inode = dip->inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct bio 
*dio_bio; u32 *csums = (u32 *)dip->csum; - int index = 0; u64 start; + int i; start = dip->logical_offset; - do { + bio_for_each_segment_all(bvec, bio, i) { if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { struct page *page = bvec->bv_page; char *kaddr; @@ -6809,18 +6980,16 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) local_irq_restore(flags); flush_dcache_page(bvec->bv_page); - if (csum != csums[index]) { + if (csum != csums[i]) { btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u", btrfs_ino(inode), start, csum, - csums[index]); + csums[i]); err = -EIO; } } start += bvec->bv_len; - bvec++; - index++; - } while (bvec <= bvec_end); + } unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, dip->logical_offset + dip->bytes - 1); @@ -6898,10 +7067,11 @@ static void btrfs_end_dio_bio(struct bio *bio, int err) struct btrfs_dio_private *dip = bio->bi_private; if (err) { - printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu " - "sector %#Lx len %u err no %d\n", + btrfs_err(BTRFS_I(dip->inode)->root->fs_info, + "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d", btrfs_ino(dip->inode), bio->bi_rw, - (unsigned long long)bio->bi_sector, bio->bi_size, err); + (unsigned long long)bio->bi_iter.bi_sector, + bio->bi_iter.bi_size, err); dip->errors = 1; /* @@ -6992,7 +7162,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, struct bio *bio; struct bio *orig_bio = dip->orig_bio; struct bio_vec *bvec = orig_bio->bi_io_vec; - u64 start_sector = orig_bio->bi_sector; + u64 start_sector = orig_bio->bi_iter.bi_sector; u64 file_offset = dip->logical_offset; u64 submit_len = 0; u64 map_length; @@ -7000,7 +7170,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, int ret = 0; int async_submit = 0; - map_length = orig_bio->bi_size; + map_length = orig_bio->bi_iter.bi_size; ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, &map_length, NULL, 0); if 
(ret) { @@ -7008,7 +7178,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, return -EIO; } - if (map_length >= orig_bio->bi_size) { + if (map_length >= orig_bio->bi_iter.bi_size) { bio = orig_bio; goto submit; } @@ -7060,7 +7230,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; - map_length = orig_bio->bi_size; + map_length = orig_bio->bi_iter.bi_size; ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, &map_length, NULL, 0); @@ -7118,7 +7288,8 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, if (!skip_sum && !write) { csum_size = btrfs_super_csum_size(root->fs_info->super_copy); - sum_len = dio_bio->bi_size >> inode->i_sb->s_blocksize_bits; + sum_len = dio_bio->bi_iter.bi_size >> + inode->i_sb->s_blocksize_bits; sum_len *= csum_size; } else { sum_len = 0; @@ -7133,8 +7304,8 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, dip->private = dio_bio->bi_private; dip->inode = inode; dip->logical_offset = file_offset; - dip->bytes = dio_bio->bi_size; - dip->disk_bytenr = (u64)dio_bio->bi_sector << 9; + dip->bytes = dio_bio->bi_iter.bi_size; + dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; io_bio->bi_private = dip; dip->errors = 0; dip->orig_bio = io_bio; @@ -7371,6 +7542,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, struct extent_state *cached_state = NULL; u64 page_start = page_offset(page); u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + int inode_evicting = inode->i_state & I_FREEING; /* * we have the page locked, so new writeback can't start, @@ -7386,17 +7558,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, btrfs_releasepage(page, GFP_NOFS); return; } - lock_extent_bits(tree, page_start, page_end, 0, &cached_state); - ordered = btrfs_lookup_ordered_extent(inode, page_offset(page)); + + if (!inode_evicting) + lock_extent_bits(tree, 
page_start, page_end, 0, &cached_state); + ordered = btrfs_lookup_ordered_extent(inode, page_start); if (ordered) { /* * IO on this page will never be started, so we need * to account for any ordered extents now */ - clear_extent_bit(tree, page_start, page_end, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS); + if (!inode_evicting) + clear_extent_bit(tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 1, 0, &cached_state, + GFP_NOFS); /* * whoever cleared the private bit is responsible * for the finish_ordered_io @@ -7420,14 +7596,22 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, btrfs_finish_ordered_io(ordered); } btrfs_put_ordered_extent(ordered); - cached_state = NULL; - lock_extent_bits(tree, page_start, page_end, 0, &cached_state); + if (!inode_evicting) { + cached_state = NULL; + lock_extent_bits(tree, page_start, page_end, 0, + &cached_state); + } + } + + if (!inode_evicting) { + clear_extent_bit(tree, page_start, page_end, + EXTENT_LOCKED | EXTENT_DIRTY | + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 1, 1, + &cached_state, GFP_NOFS); + + __btrfs_releasepage(page, GFP_NOFS); } - clear_extent_bit(tree, page_start, page_end, - EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, - &cached_state, GFP_NOFS); - __btrfs_releasepage(page, GFP_NOFS); ClearPageChecked(page); if (PagePrivate(page)) { @@ -7737,7 +7921,9 @@ out: * create a new subvolume directory/inode (helper for the ioctl). 
*/ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, - struct btrfs_root *new_root, u64 new_dirid) + struct btrfs_root *new_root, + struct btrfs_root *parent_root, + u64 new_dirid) { struct inode *inode; int err; @@ -7755,6 +7941,12 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, set_nlink(inode, 1); btrfs_i_size_write(inode, 0); + err = btrfs_subvol_inherit_props(trans, new_root, parent_root); + if (err) + btrfs_err(new_root->fs_info, + "error inheriting subvolume %llu properties: %d\n", + new_root->root_key.objectid, err); + err = btrfs_update_inode(trans, new_root, inode); iput(inode); @@ -7780,6 +7972,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->flags = 0; ei->csum_bytes = 0; ei->index_cnt = (u64)-1; + ei->dir_index = 0; ei->last_unlink_trans = 0; ei->last_log_commit = 0; @@ -8067,6 +8260,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (ret) goto out_fail; + BTRFS_I(old_inode)->dir_index = 0ULL; if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { /* force full log commit if subvolume involved. 
*/ root->fs_info->last_trans_log_full_commit = trans->transid; @@ -8155,6 +8349,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_fail; } + if (old_inode->i_nlink == 1) + BTRFS_I(old_inode)->dir_index = index; + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { struct dentry *parent = new_dentry->d_parent; btrfs_log_new_name(trans, old_inode, old_dir, parent); @@ -8290,7 +8487,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) { int ret; - if (root->fs_info->sb->s_flags & MS_RDONLY) + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) return -EROFS; ret = __start_delalloc_inodes(root, delay_iput); @@ -8316,7 +8513,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput) struct list_head splice; int ret; - if (fs_info->sb->s_flags & MS_RDONLY) + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return -EROFS; INIT_LIST_HEAD(&splice); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ad27dcea319c..a6d8efa46bfe 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -56,6 +56,8 @@ #include "rcu-string.h" #include "send.h" #include "dev-replace.h" +#include "props.h" +#include "sysfs.h" static int btrfs_clone(struct inode *src, struct inode *inode, u64 off, u64 olen, u64 olen_aligned, u64 destoff); @@ -190,6 +192,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) unsigned int i_oldflags; umode_t mode; + if (!inode_owner_or_capable(inode)) + return -EPERM; + if (btrfs_root_readonly(root)) return -EROFS; @@ -200,9 +205,6 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) if (ret) return ret; - if (!inode_owner_or_capable(inode)) - return -EACCES; - ret = mnt_want_write_file(file); if (ret) return ret; @@ -280,9 +282,25 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) if (flags & FS_NOCOMP_FL) { ip->flags &= ~BTRFS_INODE_COMPRESS; ip->flags |= BTRFS_INODE_NOCOMPRESS; + + ret = btrfs_set_prop(inode, 
"btrfs.compression", NULL, 0, 0); + if (ret && ret != -ENODATA) + goto out_drop; } else if (flags & FS_COMPR_FL) { + const char *comp; + ip->flags |= BTRFS_INODE_COMPRESS; ip->flags &= ~BTRFS_INODE_NOCOMPRESS; + + if (root->fs_info->compress_type == BTRFS_COMPRESS_LZO) + comp = "lzo"; + else + comp = "zlib"; + ret = btrfs_set_prop(inode, "btrfs.compression", + comp, strlen(comp), 0); + if (ret) + goto out_drop; + } else { ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); } @@ -392,6 +410,7 @@ static noinline int create_subvol(struct inode *dir, struct btrfs_root *new_root; struct btrfs_block_rsv block_rsv; struct timespec cur_time = CURRENT_TIME; + struct inode *inode; int ret; int err; u64 objectid; @@ -417,7 +436,9 @@ static noinline int create_subvol(struct inode *dir, trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - goto out; + btrfs_subvolume_release_metadata(root, &block_rsv, + qgroup_reserved); + return ret; } trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; @@ -500,7 +521,7 @@ static noinline int create_subvol(struct inode *dir, btrfs_record_root_in_trans(trans, new_root); - ret = btrfs_create_subvol_root(trans, new_root, new_dirid); + ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid); if (ret) { /* We potentially lose an unused inode item here */ btrfs_abort_transaction(trans, root, ret); @@ -542,6 +563,8 @@ static noinline int create_subvol(struct inode *dir, fail: trans->block_rsv = NULL; trans->bytes_reserved = 0; + btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); + if (async_transid) { *async_transid = trans->transid; err = btrfs_commit_transaction_async(trans, root, 1); @@ -553,10 +576,12 @@ fail: if (err && !ret) ret = err; - if (!ret) - d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); -out: - btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); + if (!ret) { + inode = btrfs_lookup_dentry(dir, dentry); + if 
(IS_ERR(inode)) + return PTR_ERR(inode); + d_instantiate(dentry, inode); + } return ret; } @@ -642,7 +667,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, ret = PTR_ERR(inode); goto fail; } - BUG_ON(!inode); + d_instantiate(dentry, inode); ret = 0; fail: @@ -1011,7 +1036,7 @@ out: static int cluster_pages_for_defrag(struct inode *inode, struct page **pages, unsigned long start_index, - int num_pages) + unsigned long num_pages) { unsigned long file_end; u64 isize = i_size_read(inode); @@ -1169,8 +1194,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, int defrag_count = 0; int compress_type = BTRFS_COMPRESS_ZLIB; int extent_thresh = range->extent_thresh; - int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; - int cluster = max_cluster; + unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; + unsigned long cluster = max_cluster; u64 new_align = ~((u64)128 * 1024 - 1); struct page **pages = NULL; @@ -1254,7 +1279,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, break; if (btrfs_defrag_cancelled(root->fs_info)) { - printk(KERN_DEBUG "btrfs: defrag_file cancelled\n"); + printk(KERN_DEBUG "BTRFS: defrag_file cancelled\n"); ret = -EAGAIN; break; } @@ -1416,20 +1441,20 @@ static noinline int btrfs_ioctl_resize(struct file *file, ret = -EINVAL; goto out_free; } - printk(KERN_INFO "btrfs: resizing devid %llu\n", devid); + btrfs_info(root->fs_info, "resizing devid %llu", devid); } device = btrfs_find_device(root->fs_info, devid, NULL, NULL); if (!device) { - printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", + btrfs_info(root->fs_info, "resizer unable to find device %llu", devid); ret = -ENODEV; goto out_free; } if (!device->writeable) { - printk(KERN_INFO "btrfs: resizer unable to apply on " - "readonly device %llu\n", + btrfs_info(root->fs_info, + "resizer unable to apply on readonly device %llu", devid); ret = -EPERM; goto out_free; @@ -1466,6 +1491,10 @@ static noinline int 
btrfs_ioctl_resize(struct file *file, } new_size = old_size - new_size; } else if (mod > 0) { + if (new_size > ULLONG_MAX - old_size) { + ret = -EINVAL; + goto out_free; + } new_size = old_size + new_size; } @@ -1481,7 +1510,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, do_div(new_size, root->sectorsize); new_size *= root->sectorsize; - printk_in_rcu(KERN_INFO "btrfs: new size for %s is %llu\n", + printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n", rcu_str_deref(device->name), new_size); if (new_size > old_size) { @@ -1542,9 +1571,15 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, src_inode = file_inode(src.file); if (src_inode->i_sb != file_inode(file)->i_sb) { - printk(KERN_INFO "btrfs: Snapshot src from " - "another FS\n"); + btrfs_info(BTRFS_I(src_inode)->root->fs_info, + "Snapshot src from another FS"); ret = -EINVAL; + } else if (!inode_owner_or_capable(src_inode)) { + /* + * Subvolume creation is not restricted, but snapshots + * are limited to own subvolumes only + */ + ret = -EPERM; } else { ret = btrfs_mksubvol(&file->f_path, name, namelen, BTRFS_I(src_inode)->root, @@ -1662,6 +1697,9 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, u64 flags; int ret = 0; + if (!inode_owner_or_capable(inode)) + return -EPERM; + ret = mnt_want_write_file(file); if (ret) goto out; @@ -1686,11 +1724,6 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, goto out_drop_write; } - if (!inode_owner_or_capable(inode)) { - ret = -EACCES; - goto out_drop_write; - } - down_write(&root->fs_info->subvol_sem); /* nothing to do */ @@ -1698,12 +1731,28 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, goto out_drop_sem; root_flags = btrfs_root_flags(&root->root_item); - if (flags & BTRFS_SUBVOL_RDONLY) + if (flags & BTRFS_SUBVOL_RDONLY) { btrfs_set_root_flags(&root->root_item, root_flags | BTRFS_ROOT_SUBVOL_RDONLY); - else - btrfs_set_root_flags(&root->root_item, + } else { + /* 
+ * Block RO -> RW transition if this subvolume is involved in + * send + */ + spin_lock(&root->root_item_lock); + if (root->send_in_progress == 0) { + btrfs_set_root_flags(&root->root_item, root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY); + spin_unlock(&root->root_item_lock); + } else { + spin_unlock(&root->root_item_lock); + btrfs_warn(root->fs_info, + "Attempt to set subvolume %llu read-write during send", + root->root_key.objectid); + ret = -EPERM; + goto out_drop_sem; + } + } trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { @@ -1910,7 +1959,7 @@ static noinline int search_ioctl(struct inode *inode, key.offset = (u64)-1; root = btrfs_read_fs_root_no_name(info, &key); if (IS_ERR(root)) { - printk(KERN_ERR "could not find root %llu\n", + printk(KERN_ERR "BTRFS: could not find root %llu\n", sk->tree_id); btrfs_free_path(path); return -ENOENT; @@ -2000,7 +2049,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, key.offset = (u64)-1; root = btrfs_read_fs_root_no_name(info, &key); if (IS_ERR(root)) { - printk(KERN_ERR "could not find root %llu\n", tree_id); + printk(KERN_ERR "BTRFS: could not find root %llu\n", tree_id); ret = -ENOENT; goto out; } @@ -2838,12 +2887,14 @@ static int btrfs_clone(struct inode *src, struct inode *inode, * note the key will change type as we walk through the * tree. 
*/ + path->leave_spinning = 1; ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path, 0, 0); if (ret < 0) goto out; nritems = btrfs_header_nritems(path->nodes[0]); +process_slot: if (path->slots[0] >= nritems) { ret = btrfs_next_leaf(BTRFS_I(src)->root, path); if (ret < 0) @@ -2870,11 +2921,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode, u8 comp; u64 endoff; - size = btrfs_item_size_nr(leaf, slot); - read_extent_buffer(leaf, buf, - btrfs_item_ptr_offset(leaf, slot), - size); - extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); comp = btrfs_file_extent_compression(leaf, extent); @@ -2893,11 +2939,20 @@ static int btrfs_clone(struct inode *src, struct inode *inode, datal = btrfs_file_extent_ram_bytes(leaf, extent); } - btrfs_release_path(path); if (key.offset + datal <= off || - key.offset >= off + len - 1) - goto next; + key.offset >= off + len - 1) { + path->slots[0]++; + goto process_slot; + } + + size = btrfs_item_size_nr(leaf, slot); + read_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, slot), + size); + + btrfs_release_path(path); + path->leave_spinning = 0; memcpy(&new_key, &key, sizeof(new_key)); new_key.objectid = btrfs_ino(inode); @@ -3068,7 +3123,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode, } ret = btrfs_end_transaction(trans, root); } -next: btrfs_release_path(path); key.offset++; } @@ -3196,9 +3250,17 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); out_unlock: - mutex_unlock(&src->i_mutex); - if (!same_inode) - mutex_unlock(&inode->i_mutex); + if (!same_inode) { + if (inode < src) { + mutex_unlock(&src->i_mutex); + mutex_unlock(&inode->i_mutex); + } else { + mutex_unlock(&inode->i_mutex); + mutex_unlock(&src->i_mutex); + } + } else { + mutex_unlock(&src->i_mutex); + } out_fput: fdput(src_file); out_drop_write: @@ -3321,8 +3383,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, 
void __user *argp) if (IS_ERR_OR_NULL(di)) { btrfs_free_path(path); btrfs_end_transaction(trans, root); - printk(KERN_ERR "Umm, you don't have the default dir item, " - "this isn't going to work\n"); + btrfs_err(new_root->fs_info, "Umm, you don't have the default dir" + "item, this isn't going to work"); ret = -ENOENT; goto out; } @@ -4303,6 +4365,9 @@ static long btrfs_ioctl_set_received_subvol(struct file *file, int ret = 0; int received_uuid_changed; + if (!inode_owner_or_capable(inode)) + return -EPERM; + ret = mnt_want_write_file(file); if (ret < 0) return ret; @@ -4319,11 +4384,6 @@ static long btrfs_ioctl_set_received_subvol(struct file *file, goto out; } - if (!inode_owner_or_capable(inode)) { - ret = -EACCES; - goto out; - } - sa = memdup_user(arg, sizeof(*sa)); if (IS_ERR(sa)) { ret = PTR_ERR(sa); @@ -4409,8 +4469,8 @@ static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg) len = strnlen(label, BTRFS_LABEL_SIZE); if (len == BTRFS_LABEL_SIZE) { - pr_warn("btrfs: label is too long, return the first %zu bytes\n", - --len); + btrfs_warn(root->fs_info, + "label is too long, return the first %zu bytes", --len); } ret = copy_to_user(arg, label, len); @@ -4433,7 +4493,7 @@ static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) return -EFAULT; if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) { - pr_err("btrfs: unable to set label with more than %d bytes\n", + btrfs_err(root->fs_info, "unable to set label with more than %d bytes", BTRFS_LABEL_SIZE - 1); return -EINVAL; } @@ -4451,13 +4511,173 @@ static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) spin_lock(&root->fs_info->super_lock); strcpy(super_block->label, label); spin_unlock(&root->fs_info->super_lock); - ret = btrfs_end_transaction(trans, root); + ret = btrfs_commit_transaction(trans, root); out_unlock: mnt_drop_write_file(file); return ret; } +#define INIT_FEATURE_FLAGS(suffix) \ + { .compat_flags = BTRFS_FEATURE_COMPAT_##suffix, \ + 
.compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \ + .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix } + +static int btrfs_ioctl_get_supported_features(struct file *file, + void __user *arg) +{ + static struct btrfs_ioctl_feature_flags features[3] = { + INIT_FEATURE_FLAGS(SUPP), + INIT_FEATURE_FLAGS(SAFE_SET), + INIT_FEATURE_FLAGS(SAFE_CLEAR) + }; + + if (copy_to_user(arg, &features, sizeof(features))) + return -EFAULT; + + return 0; +} + +static int btrfs_ioctl_get_features(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_super_block *super_block = root->fs_info->super_copy; + struct btrfs_ioctl_feature_flags features; + + features.compat_flags = btrfs_super_compat_flags(super_block); + features.compat_ro_flags = btrfs_super_compat_ro_flags(super_block); + features.incompat_flags = btrfs_super_incompat_flags(super_block); + + if (copy_to_user(arg, &features, sizeof(features))) + return -EFAULT; + + return 0; +} + +static int check_feature_bits(struct btrfs_root *root, + enum btrfs_feature_set set, + u64 change_mask, u64 flags, u64 supported_flags, + u64 safe_set, u64 safe_clear) +{ + const char *type = btrfs_feature_set_names[set]; + char *names; + u64 disallowed, unsupported; + u64 set_mask = flags & change_mask; + u64 clear_mask = ~flags & change_mask; + + unsupported = set_mask & ~supported_flags; + if (unsupported) { + names = btrfs_printable_features(set, unsupported); + if (names) { + btrfs_warn(root->fs_info, + "this kernel does not support the %s feature bit%s", + names, strchr(names, ',') ? "s" : ""); + kfree(names); + } else + btrfs_warn(root->fs_info, + "this kernel does not support %s bits 0x%llx", + type, unsupported); + return -EOPNOTSUPP; + } + + disallowed = set_mask & ~safe_set; + if (disallowed) { + names = btrfs_printable_features(set, disallowed); + if (names) { + btrfs_warn(root->fs_info, + "can't set the %s feature bit%s while mounted", + names, strchr(names, ',') ? 
"s" : ""); + kfree(names); + } else + btrfs_warn(root->fs_info, + "can't set %s bits 0x%llx while mounted", + type, disallowed); + return -EPERM; + } + + disallowed = clear_mask & ~safe_clear; + if (disallowed) { + names = btrfs_printable_features(set, disallowed); + if (names) { + btrfs_warn(root->fs_info, + "can't clear the %s feature bit%s while mounted", + names, strchr(names, ',') ? "s" : ""); + kfree(names); + } else + btrfs_warn(root->fs_info, + "can't clear %s bits 0x%llx while mounted", + type, disallowed); + return -EPERM; + } + + return 0; +} + +#define check_feature(root, change_mask, flags, mask_base) \ +check_feature_bits(root, FEAT_##mask_base, change_mask, flags, \ + BTRFS_FEATURE_ ## mask_base ## _SUPP, \ + BTRFS_FEATURE_ ## mask_base ## _SAFE_SET, \ + BTRFS_FEATURE_ ## mask_base ## _SAFE_CLEAR) + +static int btrfs_ioctl_set_features(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_super_block *super_block = root->fs_info->super_copy; + struct btrfs_ioctl_feature_flags flags[2]; + struct btrfs_trans_handle *trans; + u64 newflags; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(flags, arg, sizeof(flags))) + return -EFAULT; + + /* Nothing to do */ + if (!flags[0].compat_flags && !flags[0].compat_ro_flags && + !flags[0].incompat_flags) + return 0; + + ret = check_feature(root, flags[0].compat_flags, + flags[1].compat_flags, COMPAT); + if (ret) + return ret; + + ret = check_feature(root, flags[0].compat_ro_flags, + flags[1].compat_ro_flags, COMPAT_RO); + if (ret) + return ret; + + ret = check_feature(root, flags[0].incompat_flags, + flags[1].incompat_flags, INCOMPAT); + if (ret) + return ret; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + spin_lock(&root->fs_info->super_lock); + newflags = btrfs_super_compat_flags(super_block); + newflags |= flags[0].compat_flags & flags[1].compat_flags; + newflags &= 
~(flags[0].compat_flags & ~flags[1].compat_flags); + btrfs_set_super_compat_flags(super_block, newflags); + + newflags = btrfs_super_compat_ro_flags(super_block); + newflags |= flags[0].compat_ro_flags & flags[1].compat_ro_flags; + newflags &= ~(flags[0].compat_ro_flags & ~flags[1].compat_ro_flags); + btrfs_set_super_compat_ro_flags(super_block, newflags); + + newflags = btrfs_super_incompat_flags(super_block); + newflags |= flags[0].incompat_flags & flags[1].incompat_flags; + newflags &= ~(flags[0].incompat_flags & ~flags[1].incompat_flags); + btrfs_set_super_incompat_flags(super_block, newflags); + spin_unlock(&root->fs_info->super_lock); + + return btrfs_commit_transaction(trans, root); +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -4576,6 +4796,12 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_set_fslabel(file, argp); case BTRFS_IOC_FILE_EXTENT_SAME: return btrfs_ioctl_file_extent_same(file, argp); + case BTRFS_IOC_GET_SUPPORTED_FEATURES: + return btrfs_ioctl_get_supported_features(file, argp); + case BTRFS_IOC_GET_FEATURES: + return btrfs_ioctl_get_features(file, argp); + case BTRFS_IOC_SET_FEATURES: + return btrfs_ioctl_set_features(file, argp); } return -ENOTTY; diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index b6a6f07c5ce2..b47f669aca75 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -141,7 +141,7 @@ static int lzo_compress_pages(struct list_head *ws, ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, &out_len, workspace->mem); if (ret != LZO_E_OK) { - printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", + printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n", ret); ret = -1; goto out; @@ -357,7 +357,7 @@ cont: if (need_unmap) kunmap(pages_in[page_in_index - 1]); if (ret != LZO_E_OK) { - printk(KERN_WARNING "btrfs decompress failed\n"); + printk(KERN_WARNING "BTRFS: decompress failed\n"); ret = -1; break; } @@ -401,7 +401,7 @@ static int lzo_decompress(struct list_head *ws, 
unsigned char *data_in, out_len = PAGE_CACHE_SIZE; ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); if (ret != LZO_E_OK) { - printk(KERN_WARNING "btrfs decompress failed!\n"); + printk(KERN_WARNING "BTRFS: decompress failed!\n"); ret = -1; goto out; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 69582d5b69d1..b16450b840e7 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -336,13 +336,14 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, entry->len); *file_offset = dec_end; if (dec_start > dec_end) { - printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n", - dec_start, dec_end); + btrfs_crit(BTRFS_I(inode)->root->fs_info, + "bad ordering dec_start %llu end %llu", dec_start, dec_end); } to_dec = dec_end - dec_start; if (to_dec > entry->bytes_left) { - printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", - entry->bytes_left, to_dec); + btrfs_crit(BTRFS_I(inode)->root->fs_info, + "bad ordered accounting left %llu size %llu", + entry->bytes_left, to_dec); } entry->bytes_left -= to_dec; if (!uptodate) @@ -401,7 +402,8 @@ have_entry: } if (io_size > entry->bytes_left) { - printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", + btrfs_crit(BTRFS_I(inode)->root->fs_info, + "bad ordered accounting left %llu size %llu", entry->bytes_left, io_size); } entry->bytes_left -= io_size; @@ -520,7 +522,8 @@ void btrfs_remove_ordered_extent(struct inode *inode, spin_lock_irq(&tree->lock); node = &entry->rb_node; rb_erase(node, &tree->tree); - tree->last = NULL; + if (tree->last == node) + tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); spin_unlock_irq(&tree->lock); diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c index 24cad1695af7..65793edb38ca 100644 --- a/fs/btrfs/orphan.c +++ b/fs/btrfs/orphan.c @@ -69,23 +69,3 @@ out: btrfs_free_path(path); return ret; } - -int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset) -{ - struct btrfs_path *path; - 
struct btrfs_key key; - int ret; - - key.objectid = BTRFS_ORPHAN_OBJECTID; - key.type = BTRFS_ORPHAN_ITEM_KEY; - key.offset = offset; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - - btrfs_free_path(path); - return ret; -} diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 417053b17181..6efd70d3b64f 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -154,7 +154,7 @@ static void print_uuid_item(struct extent_buffer *l, unsigned long offset, u32 item_size) { if (!IS_ALIGNED(item_size, sizeof(u64))) { - pr_warn("btrfs: uuid item with illegal size %lu!\n", + pr_warn("BTRFS: uuid item with illegal size %lu!\n", (unsigned long)item_size); return; } @@ -249,7 +249,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) BTRFS_FILE_EXTENT_INLINE) { printk(KERN_INFO "\t\tinline extent data " "size %u\n", - btrfs_file_extent_inline_len(l, fi)); + btrfs_file_extent_inline_len(l, i, fi)); break; } printk(KERN_INFO "\t\textent data disk bytenr %llu " diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c new file mode 100644 index 000000000000..129b1dd28527 --- /dev/null +++ b/fs/btrfs/props.c @@ -0,0 +1,427 @@ +/* + * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include <linux/hashtable.h> +#include "props.h" +#include "btrfs_inode.h" +#include "hash.h" +#include "transaction.h" +#include "xattr.h" + +#define BTRFS_PROP_HANDLERS_HT_BITS 8 +static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); + +struct prop_handler { + struct hlist_node node; + const char *xattr_name; + int (*validate)(const char *value, size_t len); + int (*apply)(struct inode *inode, const char *value, size_t len); + const char *(*extract)(struct inode *inode); + int inheritable; +}; + +static int prop_compression_validate(const char *value, size_t len); +static int prop_compression_apply(struct inode *inode, + const char *value, + size_t len); +static const char *prop_compression_extract(struct inode *inode); + +static struct prop_handler prop_handlers[] = { + { + .xattr_name = XATTR_BTRFS_PREFIX "compression", + .validate = prop_compression_validate, + .apply = prop_compression_apply, + .extract = prop_compression_extract, + .inheritable = 1 + }, + { + .xattr_name = NULL + } +}; + +void __init btrfs_props_init(void) +{ + struct prop_handler *p; + + hash_init(prop_handlers_ht); + + for (p = &prop_handlers[0]; p->xattr_name; p++) { + u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name)); + + hash_add(prop_handlers_ht, &p->node, h); + } +} + +static const struct hlist_head *find_prop_handlers_by_hash(const u64 hash) +{ + struct hlist_head *h; + + h = &prop_handlers_ht[hash_min(hash, BTRFS_PROP_HANDLERS_HT_BITS)]; + if (hlist_empty(h)) + return NULL; + + return h; +} + +static const struct prop_handler * +find_prop_handler(const char *name, + const struct hlist_head *handlers) +{ + struct prop_handler *h; + + if (!handlers) { + u64 hash = btrfs_name_hash(name, strlen(name)); + + handlers = find_prop_handlers_by_hash(hash); + if (!handlers) + return NULL; + } + + hlist_for_each_entry(h, handlers, node) + if (!strcmp(h->xattr_name, name)) + return h; + + return NULL; +} + +static int __btrfs_set_prop(struct 
btrfs_trans_handle *trans, + struct inode *inode, + const char *name, + const char *value, + size_t value_len, + int flags) +{ + const struct prop_handler *handler; + int ret; + + if (strlen(name) <= XATTR_BTRFS_PREFIX_LEN) + return -EINVAL; + + handler = find_prop_handler(name, NULL); + if (!handler) + return -EINVAL; + + if (value_len == 0) { + ret = __btrfs_setxattr(trans, inode, handler->xattr_name, + NULL, 0, flags); + if (ret) + return ret; + + ret = handler->apply(inode, NULL, 0); + ASSERT(ret == 0); + + return ret; + } + + ret = handler->validate(value, value_len); + if (ret) + return ret; + ret = __btrfs_setxattr(trans, inode, handler->xattr_name, + value, value_len, flags); + if (ret) + return ret; + ret = handler->apply(inode, value, value_len); + if (ret) { + __btrfs_setxattr(trans, inode, handler->xattr_name, + NULL, 0, flags); + return ret; + } + + set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags); + + return 0; +} + +int btrfs_set_prop(struct inode *inode, + const char *name, + const char *value, + size_t value_len, + int flags) +{ + return __btrfs_set_prop(NULL, inode, name, value, value_len, flags); +} + +static int iterate_object_props(struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid, + void (*iterator)(void *, + const struct prop_handler *, + const char *, + size_t), + void *ctx) +{ + int ret; + char *name_buf = NULL; + char *value_buf = NULL; + int name_buf_len = 0; + int value_buf_len = 0; + + while (1) { + struct btrfs_key key; + struct btrfs_dir_item *di; + struct extent_buffer *leaf; + u32 total_len, cur, this_len; + int slot; + const struct hlist_head *handlers; + + slot = path->slots[0]; + leaf = path->nodes[0]; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != objectid) + break; + if (key.type != BTRFS_XATTR_ITEM_KEY) + break; + + handlers 
= find_prop_handlers_by_hash(key.offset); + if (!handlers) + goto next_slot; + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + cur = 0; + total_len = btrfs_item_size_nr(leaf, slot); + + while (cur < total_len) { + u32 name_len = btrfs_dir_name_len(leaf, di); + u32 data_len = btrfs_dir_data_len(leaf, di); + unsigned long name_ptr, data_ptr; + const struct prop_handler *handler; + + this_len = sizeof(*di) + name_len + data_len; + name_ptr = (unsigned long)(di + 1); + data_ptr = name_ptr + name_len; + + if (name_len <= XATTR_BTRFS_PREFIX_LEN || + memcmp_extent_buffer(leaf, XATTR_BTRFS_PREFIX, + name_ptr, + XATTR_BTRFS_PREFIX_LEN)) + goto next_dir_item; + + if (name_len >= name_buf_len) { + kfree(name_buf); + name_buf_len = name_len + 1; + name_buf = kmalloc(name_buf_len, GFP_NOFS); + if (!name_buf) { + ret = -ENOMEM; + goto out; + } + } + read_extent_buffer(leaf, name_buf, name_ptr, name_len); + name_buf[name_len] = '\0'; + + handler = find_prop_handler(name_buf, handlers); + if (!handler) + goto next_dir_item; + + if (data_len > value_buf_len) { + kfree(value_buf); + value_buf_len = data_len; + value_buf = kmalloc(data_len, GFP_NOFS); + if (!value_buf) { + ret = -ENOMEM; + goto out; + } + } + read_extent_buffer(leaf, value_buf, data_ptr, data_len); + + iterator(ctx, handler, value_buf, data_len); +next_dir_item: + cur += this_len; + di = (struct btrfs_dir_item *)((char *) di + this_len); + } + +next_slot: + path->slots[0]++; + } + + ret = 0; +out: + btrfs_release_path(path); + kfree(name_buf); + kfree(value_buf); + + return ret; +} + +static void inode_prop_iterator(void *ctx, + const struct prop_handler *handler, + const char *value, + size_t len) +{ + struct inode *inode = ctx; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + ret = handler->apply(inode, value, len); + if (unlikely(ret)) + btrfs_warn(root->fs_info, + "error applying prop %s to ino %llu (root %llu): %d", + handler->xattr_name, btrfs_ino(inode), + root->root_key.objectid, 
ret); + else + set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags); +} + +int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 ino = btrfs_ino(inode); + int ret; + + ret = iterate_object_props(root, path, ino, inode_prop_iterator, inode); + + return ret; +} + +static int inherit_props(struct btrfs_trans_handle *trans, + struct inode *inode, + struct inode *parent) +{ + const struct prop_handler *h; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + if (!test_bit(BTRFS_INODE_HAS_PROPS, + &BTRFS_I(parent)->runtime_flags)) + return 0; + + for (h = &prop_handlers[0]; h->xattr_name; h++) { + const char *value; + u64 num_bytes; + + if (!h->inheritable) + continue; + + value = h->extract(parent); + if (!value) + continue; + + num_bytes = btrfs_calc_trans_metadata_size(root, 1); + ret = btrfs_block_rsv_add(root, trans->block_rsv, + num_bytes, BTRFS_RESERVE_NO_FLUSH); + if (ret) + goto out; + ret = __btrfs_set_prop(trans, inode, h->xattr_name, + value, strlen(value), 0); + btrfs_block_rsv_release(root, trans->block_rsv, num_bytes); + if (ret) + goto out; + } + ret = 0; +out: + return ret; +} + +int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, + struct inode *inode, + struct inode *dir) +{ + if (!dir) + return 0; + + return inherit_props(trans, inode, dir); +} + +int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *parent_root) +{ + struct btrfs_key key; + struct inode *parent_inode, *child_inode; + int ret; + + key.objectid = BTRFS_FIRST_FREE_OBJECTID; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + parent_inode = btrfs_iget(parent_root->fs_info->sb, &key, + parent_root, NULL); + if (IS_ERR(parent_inode)) + return PTR_ERR(parent_inode); + + child_inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); + if (IS_ERR(child_inode)) { + iput(parent_inode); + return PTR_ERR(child_inode); + } + 
+ ret = inherit_props(trans, child_inode, parent_inode); + iput(child_inode); + iput(parent_inode); + + return ret; +} + +static int prop_compression_validate(const char *value, size_t len) +{ + if (!strncmp("lzo", value, len)) + return 0; + else if (!strncmp("zlib", value, len)) + return 0; + + return -EINVAL; +} + +static int prop_compression_apply(struct inode *inode, + const char *value, + size_t len) +{ + int type; + + if (len == 0) { + BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; + BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE; + + return 0; + } + + if (!strncmp("lzo", value, len)) + type = BTRFS_COMPRESS_LZO; + else if (!strncmp("zlib", value, len)) + type = BTRFS_COMPRESS_ZLIB; + else + return -EINVAL; + + BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; + BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; + BTRFS_I(inode)->force_compress = type; + + return 0; +} + +static const char *prop_compression_extract(struct inode *inode) +{ + switch (BTRFS_I(inode)->force_compress) { + case BTRFS_COMPRESS_ZLIB: + return "zlib"; + case BTRFS_COMPRESS_LZO: + return "lzo"; + } + + return NULL; +} diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h new file mode 100644 index 000000000000..100f18829d50 --- /dev/null +++ b/fs/btrfs/props.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_PROPS_H +#define __BTRFS_PROPS_H + +#include "ctree.h" + +void __init btrfs_props_init(void); + +int btrfs_set_prop(struct inode *inode, + const char *name, + const char *value, + size_t value_len, + int flags); + +int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path); + +int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, + struct inode *inode, + struct inode *dir); + +int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *parent_root); + +#endif diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 4e6ef490619e..472302a2d745 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -301,16 +301,16 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) if (btrfs_qgroup_status_version(l, ptr) != BTRFS_QGROUP_STATUS_VERSION) { - printk(KERN_ERR - "btrfs: old qgroup version, quota disabled\n"); + btrfs_err(fs_info, + "old qgroup version, quota disabled"); goto out; } if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation) { flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; - printk(KERN_ERR - "btrfs: qgroup generation mismatch, " - "marked as inconsistent\n"); + btrfs_err(fs_info, + "qgroup generation mismatch, " + "marked as inconsistent"); } fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, ptr); @@ -325,7 +325,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) qgroup = find_qgroup_rb(fs_info, found_key.offset); if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) || (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) { - printk(KERN_ERR "btrfs: inconsitent qgroup config\n"); + btrfs_err(fs_info, "inconsitent qgroup config"); flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; } if (!qgroup) { @@ 
-396,8 +396,8 @@ next1: ret = add_relation_rb(fs_info, found_key.objectid, found_key.offset); if (ret == -ENOENT) { - printk(KERN_WARNING - "btrfs: orphan qgroup relation 0x%llx->0x%llx\n", + btrfs_warn(fs_info, + "orphan qgroup relation 0x%llx->0x%llx", found_key.objectid, found_key.offset); ret = 0; /* ignore the error */ } @@ -644,8 +644,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, l = path->nodes[0]; slot = path->slots[0]; - qgroup_limit = btrfs_item_ptr(l, path->slots[0], - struct btrfs_qgroup_limit_item); + qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item); btrfs_set_qgroup_limit_flags(l, qgroup_limit, flags); btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, max_rfer); btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, max_excl); @@ -687,8 +686,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, l = path->nodes[0]; slot = path->slots[0]; - qgroup_info = btrfs_item_ptr(l, path->slots[0], - struct btrfs_qgroup_info_item); + qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item); btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid); btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer); btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr); @@ -1161,7 +1159,7 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, limit->rsv_excl); if (ret) { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; - printk(KERN_INFO "unable to update quota limit for %llu\n", + btrfs_info(fs_info, "unable to update quota limit for %llu", qgroupid); } @@ -1349,7 +1347,6 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op) { - struct btrfs_key ins; struct btrfs_root *quota_root; u64 ref_root; struct btrfs_qgroup *qgroup; @@ -1363,10 +1360,6 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, BUG_ON(!fs_info->quota_root); - ins.objectid = node->bytenr; - 
ins.offset = node->num_bytes; - ins.type = BTRFS_EXTENT_ITEM_KEY; - if (node->type == BTRFS_TREE_BLOCK_REF_KEY || node->type == BTRFS_SHARED_BLOCK_REF_KEY) { struct btrfs_delayed_tree_ref *ref; @@ -1840,7 +1833,9 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) { if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq) return; - pr_err("btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %#x.%x\n", + btrfs_err(trans->root->fs_info, + "qgroups not uptodate in trans handle %p: list is%s empty, " + "seq is %#x.%x", trans, list_empty(&trans->qgroup_ref_list) ? "" : " not", (u32)(trans->delayed_ref_elem.seq >> 32), (u32)trans->delayed_ref_elem.seq); @@ -1902,9 +1897,17 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, mutex_unlock(&fs_info->qgroup_rescan_lock); for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { + u64 num_bytes; + btrfs_item_key_to_cpu(scratch_leaf, &found, slot); - if (found.type != BTRFS_EXTENT_ITEM_KEY) + if (found.type != BTRFS_EXTENT_ITEM_KEY && + found.type != BTRFS_METADATA_ITEM_KEY) continue; + if (found.type == BTRFS_METADATA_ITEM_KEY) + num_bytes = fs_info->extent_root->leafsize; + else + num_bytes = found.offset; + ret = btrfs_find_all_roots(trans, fs_info, found.objectid, tree_mod_seq_elem.seq, &roots); if (ret < 0) @@ -1949,12 +1952,12 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, struct btrfs_qgroup_list *glist; qg = (struct btrfs_qgroup *)(uintptr_t) unode->aux; - qg->rfer += found.offset; - qg->rfer_cmpr += found.offset; + qg->rfer += num_bytes; + qg->rfer_cmpr += num_bytes; WARN_ON(qg->tag >= seq); if (qg->refcnt - seq == roots->nnodes) { - qg->excl += found.offset; - qg->excl_cmpr += found.offset; + qg->excl += num_bytes; + qg->excl_cmpr += num_bytes; } qgroup_dirty(fs_info, qg); @@ -2037,10 +2040,10 @@ out: mutex_unlock(&fs_info->qgroup_rescan_lock); if (err >= 0) { - pr_info("btrfs: qgroup scan completed%s\n", + 
btrfs_info(fs_info, "qgroup scan completed%s", err == 2 ? " (inconsistency flag cleared)" : ""); } else { - pr_err("btrfs: qgroup scan failed with %d\n", err); + btrfs_err(fs_info, "qgroup scan failed with %d", err); } complete_all(&fs_info->qgroup_rescan_completion); @@ -2096,7 +2099,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, if (ret) { err: - pr_info("btrfs: qgroup_rescan_init failed with %d\n", ret); + btrfs_info(fs_info, "qgroup_rescan_init failed with %d", ret); return ret; } diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 24ac21840a9a..9af0b25d991a 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1032,8 +1032,8 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, /* see if we can add this page onto our existing bio */ if (last) { - last_end = (u64)last->bi_sector << 9; - last_end += last->bi_size; + last_end = (u64)last->bi_iter.bi_sector << 9; + last_end += last->bi_iter.bi_size; /* * we can't merge these if they are from different @@ -1053,9 +1053,9 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, if (!bio) return -ENOMEM; - bio->bi_size = 0; + bio->bi_iter.bi_size = 0; bio->bi_bdev = stripe->dev->bdev; - bio->bi_sector = disk_start >> 9; + bio->bi_iter.bi_sector = disk_start >> 9; set_bit(BIO_UPTODATE, &bio->bi_flags); bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); @@ -1111,7 +1111,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) spin_lock_irq(&rbio->bio_list_lock); bio_list_for_each(bio, &rbio->bio_list) { - start = (u64)bio->bi_sector << 9; + start = (u64)bio->bi_iter.bi_sector << 9; stripe_offset = start - rbio->raid_map[0]; page_index = stripe_offset >> PAGE_CACHE_SHIFT; @@ -1272,7 +1272,7 @@ cleanup: static int find_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio) { - u64 physical = bio->bi_sector; + u64 physical = bio->bi_iter.bi_sector; u64 stripe_start; int i; struct btrfs_bio_stripe *stripe; @@ -1298,7 +1298,7 @@ static int find_bio_stripe(struct 
btrfs_raid_bio *rbio, static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio) { - u64 logical = bio->bi_sector; + u64 logical = bio->bi_iter.bi_sector; u64 stripe_start; int i; @@ -1602,8 +1602,8 @@ static int plug_cmp(void *priv, struct list_head *a, struct list_head *b) plug_list); struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, plug_list); - u64 a_sector = ra->bio_list.head->bi_sector; - u64 b_sector = rb->bio_list.head->bi_sector; + u64 a_sector = ra->bio_list.head->bi_iter.bi_sector; + u64 b_sector = rb->bio_list.head->bi_iter.bi_sector; if (a_sector < b_sector) return -1; @@ -1691,7 +1691,7 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio, if (IS_ERR(rbio)) return PTR_ERR(rbio); bio_list_add(&rbio->bio_list, bio); - rbio->bio_list_bytes = bio->bi_size; + rbio->bio_list_bytes = bio->bi_iter.bi_size; /* * don't plug on full rbios, just get them out the door @@ -2044,7 +2044,7 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, rbio->read_rebuild = 1; bio_list_add(&rbio->bio_list, bio); - rbio->bio_list_bytes = bio->bi_size; + rbio->bio_list_bytes = bio->bi_iter.bi_size; rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 1031b69252c5..31c797c48c3e 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -189,8 +189,8 @@ static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, */ #ifdef DEBUG if (rec->generation != generation) { - printk(KERN_DEBUG "generation mismatch for " - "(%llu,%d,%llu) %llu != %llu\n", + btrfs_debug(root->fs_info, + "generation mismatch for (%llu,%d,%llu) %llu != %llu", key.objectid, key.type, key.offset, rec->generation, generation); } @@ -365,8 +365,9 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, goto error; if (bbio->num_stripes > BTRFS_MAX_MIRRORS) { - printk(KERN_ERR "btrfs readahead: more than %d copies not " - "supported", 
BTRFS_MAX_MIRRORS); + btrfs_err(root->fs_info, + "readahead: more than %d copies not supported", + BTRFS_MAX_MIRRORS); goto error; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 429c73c374b8..07b3b36f40ee 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -94,6 +94,7 @@ struct backref_edge { #define LOWER 0 #define UPPER 1 +#define RELOCATION_RESERVED_NODES 256 struct backref_cache { /* red black tree of all backref nodes in the cache */ @@ -176,6 +177,8 @@ struct reloc_control { u64 merging_rsv_size; /* size of relocated tree nodes */ u64 nodes_relocated; + /* reserved size for block group relocation*/ + u64 reserved_bytes; u64 search_start; u64 extents_found; @@ -184,7 +187,6 @@ struct reloc_control { unsigned int create_reloc_tree:1; unsigned int merge_reloc_tree:1; unsigned int found_file_extent:1; - unsigned int commit_transaction:1; }; /* stages of data relocation */ @@ -2309,9 +2311,6 @@ void free_reloc_roots(struct list_head *list) reloc_root = list_entry(list->next, struct btrfs_root, root_list); __del_reloc_root(reloc_root); - free_extent_buffer(reloc_root->node); - free_extent_buffer(reloc_root->commit_root); - kfree(reloc_root); } } @@ -2353,10 +2352,9 @@ again: ret = merge_reloc_root(rc, root); if (ret) { - __del_reloc_root(reloc_root); - free_extent_buffer(reloc_root->node); - free_extent_buffer(reloc_root->commit_root); - kfree(reloc_root); + if (list_empty(&reloc_root->root_list)) + list_add_tail(&reloc_root->root_list, + &reloc_roots); goto out; } } else { @@ -2452,7 +2450,7 @@ static noinline_for_stack struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, struct reloc_control *rc, struct backref_node *node, - struct backref_edge *edges[], int *nr) + struct backref_edge *edges[]) { struct backref_node *next; struct btrfs_root *root; @@ -2494,7 +2492,6 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, if (!root) return NULL; - *nr = index; next = node; /* setup backref node 
path for btrfs_reloc_cow_block */ while (1) { @@ -2590,28 +2587,36 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, struct btrfs_root *root = rc->extent_root; u64 num_bytes; int ret; + u64 tmp; num_bytes = calcu_metadata_size(rc, node, 1) * 2; trans->block_rsv = rc->block_rsv; - ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes, - BTRFS_RESERVE_FLUSH_ALL); + rc->reserved_bytes += num_bytes; + ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes, + BTRFS_RESERVE_FLUSH_ALL); if (ret) { - if (ret == -EAGAIN) - rc->commit_transaction = 1; + if (ret == -EAGAIN) { + tmp = rc->extent_root->nodesize * + RELOCATION_RESERVED_NODES; + while (tmp <= rc->reserved_bytes) + tmp <<= 1; + /* + * only one thread can access block_rsv at this point, + * so we don't need hold lock to protect block_rsv. + * we expand more reservation size here to allow enough + * space for relocation and we will return eailer in + * enospc case. + */ + rc->block_rsv->size = tmp + rc->extent_root->nodesize * + RELOCATION_RESERVED_NODES; + } return ret; } return 0; } -static void release_metadata_space(struct reloc_control *rc, - struct backref_node *node) -{ - u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2; - btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes); -} - /* * relocate a block tree, and then update pointers in upper level * blocks that reference the block to point to the new location. 
@@ -2633,7 +2638,6 @@ static int do_relocation(struct btrfs_trans_handle *trans, u32 blocksize; u64 bytenr; u64 generation; - int nr; int slot; int ret; int err = 0; @@ -2646,7 +2650,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, cond_resched(); upper = edge->node[UPPER]; - root = select_reloc_root(trans, rc, upper, edges, &nr); + root = select_reloc_root(trans, rc, upper, edges); BUG_ON(!root); if (upper->eb && !upper->locked) { @@ -2898,7 +2902,6 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, struct btrfs_path *path) { struct btrfs_root *root; - int release = 0; int ret = 0; if (!node) @@ -2915,7 +2918,6 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, ret = reserve_metadata_space(trans, rc, node); if (ret) goto out; - release = 1; } if (root) { @@ -2940,11 +2942,8 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, ret = do_relocation(trans, rc, node, key, path, 1); } out: - if (ret || node->level == 0 || node->cowonly) { - if (release) - release_metadata_space(rc, node); + if (ret || node->level == 0 || node->cowonly) remove_backref_node(&rc->backref_cache, node); - } return ret; } @@ -3867,29 +3866,20 @@ static noinline_for_stack int prepare_to_relocate(struct reloc_control *rc) { struct btrfs_trans_handle *trans; - int ret; rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root, BTRFS_BLOCK_RSV_TEMP); if (!rc->block_rsv) return -ENOMEM; - /* - * reserve some space for creating reloc trees. - * btrfs_init_reloc_root will use them when there - * is no reservation in transaction handle. 
- */ - ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv, - rc->extent_root->nodesize * 256, - BTRFS_RESERVE_FLUSH_ALL); - if (ret) - return ret; - memset(&rc->cluster, 0, sizeof(rc->cluster)); rc->search_start = rc->block_group->key.objectid; rc->extents_found = 0; rc->nodes_relocated = 0; rc->merging_rsv_size = 0; + rc->reserved_bytes = 0; + rc->block_rsv->size = rc->extent_root->nodesize * + RELOCATION_RESERVED_NODES; rc->create_reloc_tree = 1; set_reloc_control(rc); @@ -3933,6 +3923,14 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) } while (1) { + rc->reserved_bytes = 0; + ret = btrfs_block_rsv_refill(rc->extent_root, + rc->block_rsv, rc->block_rsv->size, + BTRFS_RESERVE_FLUSH_ALL); + if (ret) { + err = ret; + break; + } progress++; trans = btrfs_start_transaction(rc->extent_root, 0); if (IS_ERR(trans)) { @@ -4011,6 +4009,12 @@ restart: if (!RB_EMPTY_ROOT(&blocks)) { ret = relocate_tree_blocks(trans, rc, &blocks); if (ret < 0) { + /* + * if we fail to relocate tree blocks, force to update + * backref cache when committing transaction. 
+ */ + rc->backref_cache.last_trans = trans->transid - 1; + if (ret != -EAGAIN) { err = ret; break; @@ -4020,14 +4024,8 @@ restart: } } - if (rc->commit_transaction) { - rc->commit_transaction = 0; - ret = btrfs_commit_transaction(trans, rc->extent_root); - BUG_ON(ret); - } else { - btrfs_end_transaction_throttle(trans, rc->extent_root); - btrfs_btree_balance_dirty(rc->extent_root); - } + btrfs_end_transaction_throttle(trans, rc->extent_root); + btrfs_btree_balance_dirty(rc->extent_root); trans = NULL; if (rc->stage == MOVE_DATA_EXTENTS && @@ -4247,7 +4245,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) goto out; } - printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n", + btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu", rc->block_group->key.objectid, rc->block_group->flags); ret = btrfs_start_delalloc_roots(fs_info, 0); @@ -4269,7 +4267,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) if (rc->extents_found == 0) break; - printk(KERN_INFO "btrfs: found %llu extents\n", + btrfs_info(extent_root->fs_info, "found %llu extents", rc->extents_found); if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { @@ -4285,11 +4283,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) } } - filemap_write_and_wait_range(fs_info->btree_inode->i_mapping, - rc->block_group->key.objectid, - rc->block_group->key.objectid + - rc->block_group->key.offset - 1); - WARN_ON(rc->block_group->pinned > 0); WARN_ON(rc->block_group->reserved > 0); WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index ec71ea44d2b4..1389b69059de 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -44,7 +44,7 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, if (!need_reset && btrfs_root_generation(item) != btrfs_root_generation_v2(item)) { if 
(btrfs_root_generation_v2(item) != 0) { - printk(KERN_WARNING "btrfs: mismatching " + printk(KERN_WARNING "BTRFS: mismatching " "generation and generation_v2 " "found in root item. This root " "was probably mounted with an " @@ -154,7 +154,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root if (ret != 0) { btrfs_print_leaf(root, path->nodes[0]); - printk(KERN_CRIT "unable to update root key %llu %u %llu\n", + btrfs_crit(root->fs_info, "unable to update root key %llu %u %llu", key->objectid, key->type, key->offset); BUG_ON(1); } @@ -400,21 +400,6 @@ out: return err; } -int btrfs_find_root_ref(struct btrfs_root *tree_root, - struct btrfs_path *path, - u64 root_id, u64 ref_id) -{ - struct btrfs_key key; - int ret; - - key.objectid = root_id; - key.type = BTRFS_ROOT_REF_KEY; - key.offset = ref_id; - - ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); - return ret; -} - /* * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY * or BTRFS_ROOT_BACKREF_KEY. 
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 1fd3f33c330a..efba5d1282ee 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -256,6 +256,8 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, int mirror_num, u64 physical_for_dev_replace); static void copy_nocow_pages_worker(struct btrfs_work *work); +static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); +static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); static void scrub_pending_bio_inc(struct scrub_ctx *sctx) @@ -269,6 +271,29 @@ static void scrub_pending_bio_dec(struct scrub_ctx *sctx) wake_up(&sctx->list_wait); } +static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) +{ + while (atomic_read(&fs_info->scrub_pause_req)) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + atomic_read(&fs_info->scrub_pause_req) == 0); + mutex_lock(&fs_info->scrub_lock); + } +} + +static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) +{ + atomic_inc(&fs_info->scrubs_paused); + wake_up(&fs_info->scrub_pause_wait); + + mutex_lock(&fs_info->scrub_lock); + __scrub_blocked_if_needed(fs_info); + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + + wake_up(&fs_info->scrub_pause_wait); +} + /* * used for workers that require transaction commits (i.e., for the * NOCOW case) @@ -480,7 +505,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, * hold all of the paths here */ for (i = 0; i < ipath->fspath->elem_cnt; ++i) - printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev " + printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev " "%s, sector %llu, root %llu, inode %llu, offset %llu, " "length %llu, links %u (path: %s)\n", swarn->errstr, swarn->logical, rcu_str_deref(swarn->dev->name), @@ -492,7 +517,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, return 0; err: 
- printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev " + printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev " "%s, sector %llu, root %llu, inode %llu, offset %llu: path " "resolving failed with ret=%d\n", swarn->errstr, swarn->logical, rcu_str_deref(swarn->dev->name), @@ -555,7 +580,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) ret = tree_backref_for_extent(&ptr, eb, ei, item_size, &ref_root, &ref_level); printk_in_rcu(KERN_WARNING - "btrfs: %s at logical %llu on dev %s, " + "BTRFS: %s at logical %llu on dev %s, " "sector %llu: metadata %s (level %d) in tree " "%llu\n", errstr, swarn.logical, rcu_str_deref(dev->name), @@ -704,13 +729,11 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work) struct scrub_fixup_nodatasum *fixup; struct scrub_ctx *sctx; struct btrfs_trans_handle *trans = NULL; - struct btrfs_fs_info *fs_info; struct btrfs_path *path; int uncorrectable = 0; fixup = container_of(work, struct scrub_fixup_nodatasum, work); sctx = fixup->sctx; - fs_info = fixup->root->fs_info; path = btrfs_alloc_path(); if (!path) { @@ -759,8 +782,8 @@ out: btrfs_dev_replace_stats_inc( &sctx->dev_root->fs_info->dev_replace. 
num_uncorrectable_read_errors); - printk_ratelimited_in_rcu(KERN_ERR - "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", + printk_ratelimited_in_rcu(KERN_ERR "BTRFS: " + "unable to fixup (nodatasum) error at logical %llu on dev %s\n", fixup->logical, rcu_str_deref(fixup->dev->name)); } @@ -1161,7 +1184,7 @@ corrected_error: sctx->stat.corrected_errors++; spin_unlock(&sctx->stat_lock); printk_ratelimited_in_rcu(KERN_ERR - "btrfs: fixed up error at logical %llu on dev %s\n", + "BTRFS: fixed up error at logical %llu on dev %s\n", logical, rcu_str_deref(dev->name)); } } else { @@ -1170,7 +1193,7 @@ did_not_correct_error: sctx->stat.uncorrectable_errors++; spin_unlock(&sctx->stat_lock); printk_ratelimited_in_rcu(KERN_ERR - "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", + "BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n", logical, rcu_str_deref(dev->name)); } @@ -1308,7 +1331,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, continue; } bio->bi_bdev = page->dev->bdev; - bio->bi_sector = page->physical >> 9; + bio->bi_iter.bi_sector = page->physical >> 9; bio_add_page(bio, page->page, PAGE_SIZE, 0); if (btrfsic_submit_bio_wait(READ, bio)) @@ -1418,8 +1441,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, int ret; if (!page_bad->dev->bdev) { - printk_ratelimited(KERN_WARNING - "btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n"); + printk_ratelimited(KERN_WARNING "BTRFS: " + "scrub_repair_page_from_good_copy(bdev == NULL) " + "is unexpected!\n"); return -EIO; } @@ -1427,7 +1451,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, if (!bio) return -EIO; bio->bi_bdev = page_bad->dev->bdev; - bio->bi_sector = page_bad->physical >> 9; + bio->bi_iter.bi_sector = page_bad->physical >> 9; ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0); if (PAGE_SIZE != ret) { @@ -1520,7 +1544,7 @@ again: bio->bi_private = sbio; 
bio->bi_end_io = scrub_wr_bio_end_io; bio->bi_bdev = sbio->dev->bdev; - bio->bi_sector = sbio->physical >> 9; + bio->bi_iter.bi_sector = sbio->physical >> 9; sbio->err = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical_for_dev_replace || @@ -1877,7 +1901,7 @@ static void scrub_submit(struct scrub_ctx *sctx) * This case is handled correctly (but _very_ slowly). */ printk_ratelimited(KERN_WARNING - "btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n"); + "BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n"); bio_endio(sbio->bio, -EIO); } else { btrfsic_submit_bio(READ, sbio->bio); @@ -1926,7 +1950,7 @@ again: bio->bi_private = sbio; bio->bi_end_io = scrub_bio_end_io; bio->bi_bdev = sbio->dev->bdev; - bio->bi_sector = sbio->physical >> 9; + bio->bi_iter.bi_sector = sbio->physical >> 9; sbio->err = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical || @@ -2286,8 +2310,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); - atomic_inc(&fs_info->scrubs_paused); - wake_up(&fs_info->scrub_pause_wait); + scrub_blocked_if_needed(fs_info); /* FIXME it might be better to start readahead at commit root */ key_start.objectid = logical; @@ -2311,16 +2334,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, if (!IS_ERR(reada2)) btrfs_reada_wait(reada2); - mutex_lock(&fs_info->scrub_lock); - while (atomic_read(&fs_info->scrub_pause_req)) { - mutex_unlock(&fs_info->scrub_lock); - wait_event(fs_info->scrub_pause_wait, - atomic_read(&fs_info->scrub_pause_req) == 0); - mutex_lock(&fs_info->scrub_lock); - } - atomic_dec(&fs_info->scrubs_paused); - mutex_unlock(&fs_info->scrub_lock); - wake_up(&fs_info->scrub_pause_wait); /* * collect all data csums for the stripe to avoid seeking during @@ -2357,22 +2370,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, wait_event(sctx->list_wait, 
atomic_read(&sctx->bios_in_flight) == 0); atomic_set(&sctx->wr_ctx.flush_all_writes, 0); - atomic_inc(&fs_info->scrubs_paused); - wake_up(&fs_info->scrub_pause_wait); - mutex_lock(&fs_info->scrub_lock); - while (atomic_read(&fs_info->scrub_pause_req)) { - mutex_unlock(&fs_info->scrub_lock); - wait_event(fs_info->scrub_pause_wait, - atomic_read(&fs_info->scrub_pause_req) == 0); - mutex_lock(&fs_info->scrub_lock); - } - atomic_dec(&fs_info->scrubs_paused); - mutex_unlock(&fs_info->scrub_lock); - wake_up(&fs_info->scrub_pause_wait); + scrub_blocked_if_needed(fs_info); } + if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; key.objectid = logical; - key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = (u64)-1; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -2380,8 +2385,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, goto out; if (ret > 0) { - ret = btrfs_previous_item(root, path, 0, - BTRFS_EXTENT_ITEM_KEY); + ret = btrfs_previous_extent_item(root, path, 0); if (ret < 0) goto out; if (ret > 0) { @@ -2439,9 +2443,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, if (key.objectid < logical && (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) { - printk(KERN_ERR - "btrfs scrub: tree block %llu spanning " - "stripes, ignored. logical=%llu\n", + btrfs_err(fs_info, + "scrub: tree block %llu spanning " + "stripes, ignored. 
logical=%llu", key.objectid, logical); goto next; } @@ -2683,21 +2687,9 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); atomic_set(&sctx->wr_ctx.flush_all_writes, 0); - atomic_inc(&fs_info->scrubs_paused); - wake_up(&fs_info->scrub_pause_wait); wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0); - - mutex_lock(&fs_info->scrub_lock); - while (atomic_read(&fs_info->scrub_pause_req)) { - mutex_unlock(&fs_info->scrub_lock); - wait_event(fs_info->scrub_pause_wait, - atomic_read(&fs_info->scrub_pause_req) == 0); - mutex_lock(&fs_info->scrub_lock); - } - atomic_dec(&fs_info->scrubs_paused); - mutex_unlock(&fs_info->scrub_lock); - wake_up(&fs_info->scrub_pause_wait); + scrub_blocked_if_needed(fs_info); btrfs_put_block_group(cache); if (ret) @@ -2823,8 +2815,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, * check some assumptions */ if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) { - printk(KERN_ERR - "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n", + btrfs_err(fs_info, + "scrub: size assumption nodesize == leafsize (%d == %d) fails", fs_info->chunk_root->nodesize, fs_info->chunk_root->leafsize); return -EINVAL; @@ -2836,16 +2828,17 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, * the way scrub is implemented. Do not handle this * situation at all because it won't ever happen. 
*/ - printk(KERN_ERR - "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n", + btrfs_err(fs_info, + "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails", fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN); return -EINVAL; } if (fs_info->chunk_root->sectorsize != PAGE_SIZE) { /* not supported for data w/o checksums */ - printk(KERN_ERR - "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails\n", + btrfs_err(fs_info, + "scrub: size assumption sectorsize != PAGE_SIZE " + "(%d != %lu) fails", fs_info->chunk_root->sectorsize, PAGE_SIZE); return -EINVAL; } @@ -2858,7 +2851,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, * would exhaust the array bounds of pagev member in * struct scrub_block */ - pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n", + btrfs_err(fs_info, "scrub: size assumption nodesize and sectorsize " + "<= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails", fs_info->chunk_root->nodesize, SCRUB_MAX_PAGES_PER_BLOCK, fs_info->chunk_root->sectorsize, @@ -2908,7 +2902,13 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, } sctx->readonly = readonly; dev->scrub_device = sctx; + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + /* + * checking @scrub_pause_req here, we can avoid + * race between committing transaction and scrubbing. + */ + __scrub_blocked_if_needed(fs_info); atomic_inc(&fs_info->scrubs_running); mutex_unlock(&fs_info->scrub_lock); @@ -2917,9 +2917,10 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, * by holding device list mutex, we can * kick off writing super in log tree sync. 
*/ + mutex_lock(&fs_info->fs_devices->device_list_mutex); ret = scrub_supers(sctx, dev); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); } - mutex_unlock(&fs_info->fs_devices->device_list_mutex); if (!ret) ret = scrub_enumerate_chunks(sctx, dev, start, end, @@ -3167,7 +3168,8 @@ static void copy_nocow_pages_worker(struct btrfs_work *work) ret = iterate_inodes_from_logical(logical, fs_info, path, record_inode_for_nocow, nocow_ctx); if (ret != 0 && ret != -ENOENT) { - pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d\n", + btrfs_warn(fs_info, "iterate_inodes_from_logical() failed: log %llu, " + "phys %llu, len %llu, mir %u, ret %d", logical, physical_for_dev_replace, len, mirror_num, ret); not_written = 1; @@ -3289,7 +3291,7 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, again: page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); if (!page) { - pr_err("find_or_create_page() failed\n"); + btrfs_err(fs_info, "find_or_create_page() failed"); ret = -ENOMEM; goto out; } @@ -3361,7 +3363,7 @@ static int write_page_nocow(struct scrub_ctx *sctx, return -EIO; if (!dev->bdev) { printk_ratelimited(KERN_WARNING - "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n"); + "BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n"); return -EIO; } bio = btrfs_io_bio_alloc(GFP_NOFS, 1); @@ -3371,8 +3373,8 @@ static int write_page_nocow(struct scrub_ctx *sctx, spin_unlock(&sctx->stat_lock); return -ENOMEM; } - bio->bi_size = 0; - bio->bi_sector = physical_for_dev_replace >> 9; + bio->bi_iter.bi_size = 0; + bio->bi_iter.bi_sector = physical_for_dev_replace >> 9; bio->bi_bdev = dev->bdev; ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); if (ret != PAGE_CACHE_SIZE) { diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 945d1db98f26..9dde9717c1b9 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -24,12 +24,12 @@ #include <linux/xattr.h> #include <linux/posix_acl_xattr.h> #include 
<linux/radix-tree.h> -#include <linux/crc32c.h> #include <linux/vmalloc.h> #include <linux/string.h> #include "send.h" #include "backref.h" +#include "hash.h" #include "locking.h" #include "disk-io.h" #include "btrfs_inode.h" @@ -88,8 +88,6 @@ struct send_ctx { u64 cmd_send_size[BTRFS_SEND_C_MAX + 1]; u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */ - struct vfsmount *mnt; - struct btrfs_root *send_root; struct btrfs_root *parent_root; struct clone_root *clone_roots; @@ -111,6 +109,7 @@ struct send_ctx { int cur_inode_deleted; u64 cur_inode_size; u64 cur_inode_mode; + u64 cur_inode_last_extent; u64 send_progress; @@ -122,6 +121,74 @@ struct send_ctx { int name_cache_size; char *read_buf; + + /* + * We process inodes by their increasing order, so if before an + * incremental send we reverse the parent/child relationship of + * directories such that a directory with a lower inode number was + * the parent of a directory with a higher inode number, and the one + * becoming the new parent got renamed too, we can't rename/move the + * directory with lower inode number when we finish processing it - we + * must process the directory with higher inode number first, then + * rename/move it and then rename/move the directory with lower inode + * number. Example follows. + * + * Tree state when the first send was performed: + * + * . + * |-- a (ino 257) + * |-- b (ino 258) + * | + * | + * |-- c (ino 259) + * | |-- d (ino 260) + * | + * |-- c2 (ino 261) + * + * Tree state when the second (incremental) send is performed: + * + * . + * |-- a (ino 257) + * |-- b (ino 258) + * |-- c2 (ino 261) + * |-- d2 (ino 260) + * |-- cc (ino 259) + * + * The sequence of steps that lead to the second state was: + * + * mv /a/b/c/d /a/b/c2/d2 + * mv /a/b/c /a/b/c2/d2/cc + * + * "c" has lower inode number, but we can't move it (2nd mv operation) + * before we move "d", which has higher inode number. 
+ * + * So we just memorize which move/rename operations must be performed + * later when their respective parent is processed and moved/renamed. + */ + + /* Indexed by parent directory inode number. */ + struct rb_root pending_dir_moves; + + /* + * Reverse index, indexed by the inode number of a directory that + * is waiting for the move/rename of its immediate parent before its + * own move/rename can be performed. + */ + struct rb_root waiting_dir_moves; +}; + +struct pending_dir_move { + struct rb_node node; + struct list_head list; + u64 parent_ino; + u64 ino; + u64 gen; + struct list_head update_refs; +}; + +struct waiting_dir_move { + struct rb_node node; + u64 ino; }; struct name_cache_entry { @@ -145,6 +212,15 @@ struct name_cache_entry { char name[]; }; +static int is_waiting_for_move(struct send_ctx *sctx, u64 ino); + +static int need_send_hole(struct send_ctx *sctx) +{ + return (sctx->parent_root && !sctx->cur_inode_new && + !sctx->cur_inode_new_gen && !sctx->cur_inode_deleted && + S_ISREG(sctx->cur_inode_mode)); +} + static void fs_path_reset(struct fs_path *p) { if (p->reversed) { @@ -336,16 +412,6 @@ out: return ret; } -#if 0 -static void fs_path_remove(struct fs_path *p) -{ - BUG_ON(p->reversed); - while (p->start != p->end && *p->end != '/') - p->end--; - *p->end = 0; -} -#endif - static int fs_path_copy(struct fs_path *p, struct fs_path *from) { int ret; @@ -436,30 +502,15 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) return 0; } -#if 0 -static int tlv_put_u8(struct send_ctx *sctx, u16 attr, u8 value) -{ - return tlv_put(sctx, attr, &value, sizeof(value)); -} - -static int tlv_put_u16(struct send_ctx *sctx, u16 attr, u16 value) -{ - __le16 tmp = cpu_to_le16(value); - return tlv_put(sctx, attr, &tmp, sizeof(tmp)); -} - -static int tlv_put_u32(struct send_ctx *sctx, u16 attr, u32 value) -{ - __le32 tmp = cpu_to_le32(value); - return tlv_put(sctx, attr, &tmp, sizeof(tmp)); -} -#endif +#define 
TLV_PUT_DEFINE_INT(bits) \ + static int tlv_put_u##bits(struct send_ctx *sctx, \ + u##bits attr, u##bits value) \ + { \ + __le##bits __tmp = cpu_to_le##bits(value); \ + return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \ + } -static int tlv_put_u64(struct send_ctx *sctx, u16 attr, u64 value) -{ - __le64 tmp = cpu_to_le64(value); - return tlv_put(sctx, attr, &tmp, sizeof(tmp)); -} +TLV_PUT_DEFINE_INT(64) static int tlv_put_string(struct send_ctx *sctx, u16 attr, const char *str, int len) @@ -475,17 +526,6 @@ static int tlv_put_uuid(struct send_ctx *sctx, u16 attr, return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE); } -#if 0 -static int tlv_put_timespec(struct send_ctx *sctx, u16 attr, - struct timespec *ts) -{ - struct btrfs_timespec bts; - bts.sec = cpu_to_le64(ts->tv_sec); - bts.nsec = cpu_to_le32(ts->tv_nsec); - return tlv_put(sctx, attr, &bts, sizeof(bts)); -} -#endif - static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr, struct extent_buffer *eb, struct btrfs_timespec *ts) @@ -533,12 +573,6 @@ static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr, if (ret < 0) \ goto tlv_put_failure; \ } while (0) -#define TLV_PUT_TIMESPEC(sctx, attrtype, ts) \ - do { \ - ret = tlv_put_timespec(sctx, attrtype, ts); \ - if (ret < 0) \ - goto tlv_put_failure; \ - } while (0) #define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \ do { \ ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \ @@ -586,7 +620,7 @@ static int send_cmd(struct send_ctx *sctx) hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr)); hdr->crc = 0; - crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); + crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); hdr->crc = cpu_to_le32(crc); ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, @@ -1270,7 +1304,7 @@ static int find_extent_clone(struct send_ctx *sctx, if (!backref_ctx->found_itself) { /* found a bug in backref code? 
*/ ret = -EIO; - printk(KERN_ERR "btrfs: ERROR did not find backref in " + btrfs_err(sctx->send_root->fs_info, "did not find backref in " "send_root. inode=%llu, offset=%llu, " "disk_byte=%llu found extent=%llu\n", ino, data_offset, disk_byte, found_key.objectid); @@ -1298,6 +1332,16 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, " } if (cur_clone_root) { + if (compressed != BTRFS_COMPRESS_NONE) { + /* + * Offsets given by iterate_extent_inodes() are relative + * to the start of the extent, we need to add logical + * offset from the file extent item. + * (See why at backref.c:check_extent_in_eb()) + */ + cur_clone_root->offset += btrfs_file_extent_offset(eb, + fi); + } *found = cur_clone_root; ret = 0; } else { @@ -1343,7 +1387,7 @@ static int read_symlink(struct btrfs_root *root, BUG_ON(compression); off = btrfs_file_extent_inline_start(ei); - len = btrfs_file_extent_inline_len(path->nodes[0], ei); + len = btrfs_file_extent_inline_len(path->nodes[0], path->slots[0], ei); ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len); @@ -1372,7 +1416,7 @@ static int gen_unique_name(struct send_ctx *sctx, return -ENOMEM; while (1) { - len = snprintf(tmp, sizeof(tmp) - 1, "o%llu-%llu-%llu", + len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", ino, gen, idx); if (len >= sizeof(tmp)) { /* should really not happen */ @@ -1933,6 +1977,7 @@ static void name_cache_free(struct send_ctx *sctx) */ static int __get_cur_name_and_parent(struct send_ctx *sctx, u64 ino, u64 gen, + int skip_name_cache, u64 *parent_ino, u64 *parent_gen, struct fs_path *dest) @@ -1942,6 +1987,8 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, struct btrfs_path *path = NULL; struct name_cache_entry *nce = NULL; + if (skip_name_cache) + goto get_ref; /* * First check if we already did a call to this function with the same * ino/gen. If yes, check if the cache entry is still up-to-date. 
If yes @@ -1986,11 +2033,12 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, goto out_cache; } +get_ref: /* * Depending on whether the inode was already processed or not, use * send_root or parent_root for ref lookup. */ - if (ino < sctx->send_progress) + if (ino < sctx->send_progress && !skip_name_cache) ret = get_first_ref(sctx->send_root, ino, parent_ino, parent_gen, dest); else @@ -2014,6 +2062,8 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, goto out; ret = 1; } + if (skip_name_cache) + goto out; out_cache: /* @@ -2081,6 +2131,9 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, u64 parent_inode = 0; u64 parent_gen = 0; int stop = 0; + u64 start_ino = ino; + u64 start_gen = gen; + int skip_name_cache = 0; name = fs_path_alloc(); if (!name) { @@ -2088,19 +2141,32 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, goto out; } + if (is_waiting_for_move(sctx, ino)) + skip_name_cache = 1; + +again: dest->reversed = 1; fs_path_reset(dest); while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) { fs_path_reset(name); - ret = __get_cur_name_and_parent(sctx, ino, gen, + ret = __get_cur_name_and_parent(sctx, ino, gen, skip_name_cache, &parent_inode, &parent_gen, name); if (ret < 0) goto out; if (ret) stop = 1; + if (!skip_name_cache && + is_waiting_for_move(sctx, parent_inode)) { + ino = start_ino; + gen = start_gen; + stop = 0; + skip_name_cache = 1; + goto again; + } + ret = fs_path_add_path(dest, name); if (ret < 0) goto out; @@ -2131,7 +2197,7 @@ static int send_subvol_begin(struct send_ctx *sctx) char *name = NULL; int namelen; - path = alloc_path_for_send(); + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -2180,12 +2246,12 @@ static int send_subvol_begin(struct send_ctx *sctx) TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, sctx->send_root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID, - sctx->send_root->root_item.ctransid); + le64_to_cpu(sctx->send_root->root_item.ctransid)); if (parent_root) 
{ TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, sctx->parent_root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, - sctx->parent_root->root_item.ctransid); + le64_to_cpu(sctx->parent_root->root_item.ctransid)); } ret = send_cmd(sctx); @@ -2672,10 +2738,347 @@ out: return ret; } +static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) +{ + struct rb_node *n = sctx->waiting_dir_moves.rb_node; + struct waiting_dir_move *entry; + + while (n) { + entry = rb_entry(n, struct waiting_dir_move, node); + if (ino < entry->ino) + n = n->rb_left; + else if (ino > entry->ino) + n = n->rb_right; + else + return 1; + } + return 0; +} + +static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) +{ + struct rb_node **p = &sctx->waiting_dir_moves.rb_node; + struct rb_node *parent = NULL; + struct waiting_dir_move *entry, *dm; + + dm = kmalloc(sizeof(*dm), GFP_NOFS); + if (!dm) + return -ENOMEM; + dm->ino = ino; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct waiting_dir_move, node); + if (ino < entry->ino) { + p = &(*p)->rb_left; + } else if (ino > entry->ino) { + p = &(*p)->rb_right; + } else { + kfree(dm); + return -EEXIST; + } + } + + rb_link_node(&dm->node, parent, p); + rb_insert_color(&dm->node, &sctx->waiting_dir_moves); + return 0; +} + +static int del_waiting_dir_move(struct send_ctx *sctx, u64 ino) +{ + struct rb_node *n = sctx->waiting_dir_moves.rb_node; + struct waiting_dir_move *entry; + + while (n) { + entry = rb_entry(n, struct waiting_dir_move, node); + if (ino < entry->ino) { + n = n->rb_left; + } else if (ino > entry->ino) { + n = n->rb_right; + } else { + rb_erase(&entry->node, &sctx->waiting_dir_moves); + kfree(entry); + return 0; + } + } + return -ENOENT; +} + +static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino) +{ + struct rb_node **p = &sctx->pending_dir_moves.rb_node; + struct rb_node *parent = NULL; + struct pending_dir_move *entry, *pm; + struct recorded_ref *cur; + int exists = 0; + int ret; + 
+ pm = kmalloc(sizeof(*pm), GFP_NOFS); + if (!pm) + return -ENOMEM; + pm->parent_ino = parent_ino; + pm->ino = sctx->cur_ino; + pm->gen = sctx->cur_inode_gen; + INIT_LIST_HEAD(&pm->list); + INIT_LIST_HEAD(&pm->update_refs); + RB_CLEAR_NODE(&pm->node); + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct pending_dir_move, node); + if (parent_ino < entry->parent_ino) { + p = &(*p)->rb_left; + } else if (parent_ino > entry->parent_ino) { + p = &(*p)->rb_right; + } else { + exists = 1; + break; + } + } + + list_for_each_entry(cur, &sctx->deleted_refs, list) { + ret = dup_ref(cur, &pm->update_refs); + if (ret < 0) + goto out; + } + list_for_each_entry(cur, &sctx->new_refs, list) { + ret = dup_ref(cur, &pm->update_refs); + if (ret < 0) + goto out; + } + + ret = add_waiting_dir_move(sctx, pm->ino); + if (ret) + goto out; + + if (exists) { + list_add_tail(&pm->list, &entry->list); + } else { + rb_link_node(&pm->node, parent, p); + rb_insert_color(&pm->node, &sctx->pending_dir_moves); + } + ret = 0; +out: + if (ret) { + __free_recorded_refs(&pm->update_refs); + kfree(pm); + } + return ret; +} + +static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx, + u64 parent_ino) +{ + struct rb_node *n = sctx->pending_dir_moves.rb_node; + struct pending_dir_move *entry; + + while (n) { + entry = rb_entry(n, struct pending_dir_move, node); + if (parent_ino < entry->parent_ino) + n = n->rb_left; + else if (parent_ino > entry->parent_ino) + n = n->rb_right; + else + return entry; + } + return NULL; +} + +static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) +{ + struct fs_path *from_path = NULL; + struct fs_path *to_path = NULL; + u64 orig_progress = sctx->send_progress; + struct recorded_ref *cur; + int ret; + + from_path = fs_path_alloc(); + if (!from_path) + return -ENOMEM; + + sctx->send_progress = pm->ino; + ret = get_cur_path(sctx, pm->ino, pm->gen, from_path); + if (ret < 0) + goto out; + + to_path = fs_path_alloc(); + if 
(!to_path) { + ret = -ENOMEM; + goto out; + } + + sctx->send_progress = sctx->cur_ino + 1; + ret = del_waiting_dir_move(sctx, pm->ino); + ASSERT(ret == 0); + + ret = get_cur_path(sctx, pm->ino, pm->gen, to_path); + if (ret < 0) + goto out; + + ret = send_rename(sctx, from_path, to_path); + if (ret < 0) + goto out; + + ret = send_utimes(sctx, pm->ino, pm->gen); + if (ret < 0) + goto out; + + /* + * After rename/move, need to update the utimes of both new parent(s) + * and old parent(s). + */ + list_for_each_entry(cur, &pm->update_refs, list) { + ret = send_utimes(sctx, cur->dir, cur->dir_gen); + if (ret < 0) + goto out; + } + +out: + fs_path_free(from_path); + fs_path_free(to_path); + sctx->send_progress = orig_progress; + + return ret; +} + +static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m) +{ + if (!list_empty(&m->list)) + list_del(&m->list); + if (!RB_EMPTY_NODE(&m->node)) + rb_erase(&m->node, &sctx->pending_dir_moves); + __free_recorded_refs(&m->update_refs); + kfree(m); +} + +static void tail_append_pending_moves(struct pending_dir_move *moves, + struct list_head *stack) +{ + if (list_empty(&moves->list)) { + list_add_tail(&moves->list, stack); + } else { + LIST_HEAD(list); + list_splice_init(&moves->list, &list); + list_add_tail(&moves->list, stack); + list_splice_tail(&list, stack); + } +} + +static int apply_children_dir_moves(struct send_ctx *sctx) +{ + struct pending_dir_move *pm; + struct list_head stack; + u64 parent_ino = sctx->cur_ino; + int ret = 0; + + pm = get_pending_dir_moves(sctx, parent_ino); + if (!pm) + return 0; + + INIT_LIST_HEAD(&stack); + tail_append_pending_moves(pm, &stack); + + while (!list_empty(&stack)) { + pm = list_first_entry(&stack, struct pending_dir_move, list); + parent_ino = pm->ino; + ret = apply_dir_move(sctx, pm); + free_pending_move(sctx, pm); + if (ret) + goto out; + pm = get_pending_dir_moves(sctx, parent_ino); + if (pm) + tail_append_pending_moves(pm, &stack); + } + return 0; + +out: + 
while (!list_empty(&stack)) { + pm = list_first_entry(&stack, struct pending_dir_move, list); + free_pending_move(sctx, pm); + } + return ret; +} + +static int wait_for_parent_move(struct send_ctx *sctx, + struct recorded_ref *parent_ref) +{ + int ret; + u64 ino = parent_ref->dir; + u64 parent_ino_before, parent_ino_after; + u64 new_gen, old_gen; + struct fs_path *path_before = NULL; + struct fs_path *path_after = NULL; + int len1, len2; + + if (parent_ref->dir <= sctx->cur_ino) + return 0; + + if (is_waiting_for_move(sctx, ino)) + return 1; + + ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen, + NULL, NULL, NULL, NULL); + if (ret == -ENOENT) + return 0; + else if (ret < 0) + return ret; + + ret = get_inode_info(sctx->send_root, ino, NULL, &new_gen, + NULL, NULL, NULL, NULL); + if (ret < 0) + return ret; + + if (new_gen != old_gen) + return 0; + + path_before = fs_path_alloc(); + if (!path_before) + return -ENOMEM; + + ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before, + NULL, path_before); + if (ret == -ENOENT) { + ret = 0; + goto out; + } else if (ret < 0) { + goto out; + } + + path_after = fs_path_alloc(); + if (!path_after) { + ret = -ENOMEM; + goto out; + } + + ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, + NULL, path_after); + if (ret == -ENOENT) { + ret = 0; + goto out; + } else if (ret < 0) { + goto out; + } + + len1 = fs_path_len(path_before); + len2 = fs_path_len(path_after); + if ((parent_ino_before != parent_ino_after) && (len1 != len2 || + memcmp(path_before->start, path_after->start, len1))) { + ret = 1; + goto out; + } + ret = 0; + +out: + fs_path_free(path_before); + fs_path_free(path_after); + + return ret; +} + /* * This does all the move/link/unlink/rmdir magic. 
*/ -static int process_recorded_refs(struct send_ctx *sctx) +static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) { int ret = 0; struct recorded_ref *cur; @@ -2824,11 +3227,17 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); * dirs, we always have one new and one deleted * ref. The deleted ref is ignored later. */ - ret = send_rename(sctx, valid_path, - cur->full_path); - if (ret < 0) - goto out; - ret = fs_path_copy(valid_path, cur->full_path); + if (wait_for_parent_move(sctx, cur)) { + ret = add_pending_dir_move(sctx, + cur->dir); + *pending_move = 1; + } else { + ret = send_rename(sctx, valid_path, + cur->full_path); + if (!ret) + ret = fs_path_copy(valid_path, + cur->full_path); + } if (ret < 0) goto out; } else { @@ -3197,6 +3606,7 @@ static int process_all_refs(struct send_ctx *sctx, struct extent_buffer *eb; int slot; iterate_inode_ref_t cb; + int pending_move = 0; path = alloc_path_for_send(); if (!path) @@ -3240,7 +3650,9 @@ static int process_all_refs(struct send_ctx *sctx, } btrfs_release_path(path); - ret = process_recorded_refs(sctx); + ret = process_recorded_refs(sctx, &pending_move); + /* Only applicable to an incremental send. 
*/ + ASSERT(pending_move == 0); out: btrfs_free_path(path); @@ -3706,7 +4118,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, " TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, clone_root->root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, - clone_root->root->root_item.ctransid); + le64_to_cpu(clone_root->root->root_item.ctransid)); TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET, clone_root->offset); @@ -3752,6 +4164,39 @@ out: return ret; } +static int send_hole(struct send_ctx *sctx, u64 end) +{ + struct fs_path *p = NULL; + u64 offset = sctx->cur_inode_last_extent; + u64 len; + int ret = 0; + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE); + while (offset < end) { + len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE); + + ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); + if (ret < 0) + break; + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + break; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len); + ret = send_cmd(sctx); + if (ret < 0) + break; + offset += len; + } +tlv_put_failure: + fs_path_free(p); + return ret; +} + static int send_write_or_clone(struct send_ctx *sctx, struct btrfs_path *path, struct btrfs_key *key, @@ -3764,12 +4209,14 @@ static int send_write_or_clone(struct send_ctx *sctx, u64 len; u32 l; u8 type; + u64 bs = sctx->send_root->fs_info->sb->s_blocksize; ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_file_extent_item); type = btrfs_file_extent_type(path->nodes[0], ei); if (type == BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_inline_len(path->nodes[0], ei); + len = btrfs_file_extent_inline_len(path->nodes[0], + path->slots[0], ei); /* * it is possible the inline item won't cover the whole page, * but there may be items after this page. 
Make @@ -3787,7 +4234,7 @@ static int send_write_or_clone(struct send_ctx *sctx, goto out; } - if (clone_root) { + if (clone_root && IS_ALIGNED(offset + len, bs)) { ret = send_clone(sctx, offset, len, clone_root); } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) { ret = send_update_extent(sctx, offset, len); @@ -3979,6 +4426,101 @@ out: return ret; } +static int get_last_extent(struct send_ctx *sctx, u64 offset) +{ + struct btrfs_path *path; + struct btrfs_root *root = sctx->send_root; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 extent_end; + u8 type; + int ret; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + sctx->cur_inode_last_extent = 0; + + key.objectid = sctx->cur_ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = offset; + ret = btrfs_search_slot_for_read(root, &key, path, 0, 1); + if (ret < 0) + goto out; + ret = 0; + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY) + goto out; + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + type = btrfs_file_extent_type(path->nodes[0], fi); + if (type == BTRFS_FILE_EXTENT_INLINE) { + u64 size = btrfs_file_extent_inline_len(path->nodes[0], + path->slots[0], fi); + extent_end = ALIGN(key.offset + size, + sctx->send_root->sectorsize); + } else { + extent_end = key.offset + + btrfs_file_extent_num_bytes(path->nodes[0], fi); + } + sctx->cur_inode_last_extent = extent_end; +out: + btrfs_free_path(path); + return ret; +} + +static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, + struct btrfs_key *key) +{ + struct btrfs_file_extent_item *fi; + u64 extent_end; + u8 type; + int ret = 0; + + if (sctx->cur_ino != key->objectid || !need_send_hole(sctx)) + return 0; + + if (sctx->cur_inode_last_extent == (u64)-1) { + ret = get_last_extent(sctx, key->offset - 1); + if (ret) + return ret; + } + + fi = btrfs_item_ptr(path->nodes[0], 
path->slots[0], + struct btrfs_file_extent_item); + type = btrfs_file_extent_type(path->nodes[0], fi); + if (type == BTRFS_FILE_EXTENT_INLINE) { + u64 size = btrfs_file_extent_inline_len(path->nodes[0], + path->slots[0], fi); + extent_end = ALIGN(key->offset + size, + sctx->send_root->sectorsize); + } else { + extent_end = key->offset + + btrfs_file_extent_num_bytes(path->nodes[0], fi); + } + + if (path->slots[0] == 0 && + sctx->cur_inode_last_extent < key->offset) { + /* + * We might have skipped entire leafs that contained only + * file extent items for our current inode. These leafs have + * a generation number smaller (older) than the one in the + * current leaf and the leaf our last extent came from, and + * are located between these 2 leafs. + */ + ret = get_last_extent(sctx, key->offset - 1); + if (ret) + return ret; + } + + if (sctx->cur_inode_last_extent < key->offset) + ret = send_hole(sctx, key->offset); + sctx->cur_inode_last_extent = extent_end; + return ret; +} + static int process_extent(struct send_ctx *sctx, struct btrfs_path *path, struct btrfs_key *key) @@ -3995,7 +4537,7 @@ static int process_extent(struct send_ctx *sctx, goto out; if (ret) { ret = 0; - goto out; + goto out_hole; } } else { struct btrfs_file_extent_item *ei; @@ -4031,7 +4573,10 @@ static int process_extent(struct send_ctx *sctx, goto out; ret = send_write_or_clone(sctx, path, key, found_clone); - + if (ret) + goto out; +out_hole: + ret = maybe_send_hole(sctx, path, key); out: return ret; } @@ -4054,17 +4599,25 @@ static int process_all_extents(struct send_ctx *sctx) key.objectid = sctx->cmp_key->objectid; key.type = BTRFS_EXTENT_DATA_KEY; key.offset = 0; - while (1) { - ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); - if (ret < 0) - goto out; - if (ret) { - ret = 0; - goto out; - } + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + while (1) { eb = path->nodes[0]; slot = path->slots[0]; + + if (slot >= btrfs_header_nritems(eb)) { + 
ret = btrfs_next_leaf(root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + break; + } + continue; + } + btrfs_item_key_to_cpu(eb, &found_key, slot); if (found_key.objectid != key.objectid || @@ -4077,8 +4630,7 @@ static int process_all_extents(struct send_ctx *sctx) if (ret < 0) goto out; - btrfs_release_path(path); - key.offset = found_key.offset + 1; + path->slots[0]++; } out: @@ -4086,7 +4638,9 @@ out: return ret; } -static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end) +static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end, + int *pending_move, + int *refs_processed) { int ret = 0; @@ -4098,17 +4652,11 @@ static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end) if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs)) goto out; - ret = process_recorded_refs(sctx); + ret = process_recorded_refs(sctx, pending_move); if (ret < 0) goto out; - /* - * We have processed the refs and thus need to advance send_progress. - * Now, calls to get_cur_xxx will take the updated refs of the current - * inode into account. - */ - sctx->send_progress = sctx->cur_ino + 1; - + *refs_processed = 1; out: return ret; } @@ -4124,11 +4672,29 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) u64 right_gid; int need_chmod = 0; int need_chown = 0; + int pending_move = 0; + int refs_processed = 0; - ret = process_recorded_refs_if_needed(sctx, at_end); + ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move, + &refs_processed); if (ret < 0) goto out; + /* + * We have processed the refs and thus need to advance send_progress. + * Now, calls to get_cur_xxx will take the updated refs of the current + * inode into account. 
+ * + * On the other hand, if our current inode is a directory and couldn't + * be moved/renamed because its parent was renamed/moved too and it has + * a higher inode number, we can only move/rename our current inode + * after we moved/renamed its parent. Therefore in this case operate on + * the old path (pre move/rename) of our current inode, and the + * move/rename will be performed later. + */ + if (refs_processed && !pending_move) + sctx->send_progress = sctx->cur_ino + 1; + if (sctx->cur_ino == 0 || sctx->cur_inode_deleted) goto out; if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino) @@ -4157,6 +4723,19 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) } if (S_ISREG(sctx->cur_inode_mode)) { + if (need_send_hole(sctx)) { + if (sctx->cur_inode_last_extent == (u64)-1) { + ret = get_last_extent(sctx, (u64)-1); + if (ret) + goto out; + } + if (sctx->cur_inode_last_extent < + sctx->cur_inode_size) { + ret = send_hole(sctx, sctx->cur_inode_size); + if (ret) + goto out; + } + } ret = send_truncate(sctx, sctx->cur_ino, sctx->cur_inode_gen, sctx->cur_inode_size); if (ret < 0) @@ -4177,9 +4756,21 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) } /* - * Need to send that every time, no matter if it actually changed - * between the two trees as we have done changes to the inode before. + * If other directory inodes depended on our current directory + * inode's move/rename, now do their move/rename operations. + */ + if (!is_waiting_for_move(sctx, sctx->cur_ino)) { + ret = apply_children_dir_moves(sctx); + if (ret) + goto out; + } + + /* + * Need to send that every time, no matter if it actually + * changed between the two trees as we have done changes to + * the inode before. 
*/ + sctx->send_progress = sctx->cur_ino + 1; ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); if (ret < 0) goto out; @@ -4200,6 +4791,7 @@ static int changed_inode(struct send_ctx *sctx, sctx->cur_ino = key->objectid; sctx->cur_inode_new_gen = 0; + sctx->cur_inode_last_extent = (u64)-1; /* * Set send_progress to current inode. This will tell all get_cur_xxx @@ -4480,14 +5072,18 @@ static int changed_cb(struct btrfs_root *left_root, struct send_ctx *sctx = ctx; if (result == BTRFS_COMPARE_TREE_SAME) { - if (key->type != BTRFS_INODE_REF_KEY && - key->type != BTRFS_INODE_EXTREF_KEY) - return 0; - ret = compare_refs(sctx, left_path, key); - if (!ret) + if (key->type == BTRFS_INODE_REF_KEY || + key->type == BTRFS_INODE_EXTREF_KEY) { + ret = compare_refs(sctx, left_path, key); + if (!ret) + return 0; + if (ret < 0) + return ret; + } else if (key->type == BTRFS_EXTENT_DATA_KEY) { + return maybe_send_hole(sctx, left_path, key); + } else { return 0; - if (ret < 0) - return ret; + } result = BTRFS_COMPARE_TREE_CHANGED; ret = 0; } @@ -4522,7 +5118,6 @@ out: static int full_send_tree(struct send_ctx *sctx) { int ret; - struct btrfs_trans_handle *trans = NULL; struct btrfs_root *send_root = sctx->send_root; struct btrfs_key key; struct btrfs_key found_key; @@ -4544,19 +5139,6 @@ static int full_send_tree(struct send_ctx *sctx) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; -join_trans: - /* - * We need to make sure the transaction does not get committed - * while we do anything on commit roots. Join a transaction to prevent - * this. - */ - trans = btrfs_join_transaction(send_root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto out; - } - /* * Make sure the tree has not changed after re-joining. We detect this * by comparing start_ctransid and ctransid. They should always match. 
@@ -4566,7 +5148,7 @@ join_trans: spin_unlock(&send_root->root_item_lock); if (ctransid != start_ctransid) { - WARN(1, KERN_WARNING "btrfs: the root that you're trying to " + WARN(1, KERN_WARNING "BTRFS: the root that you're trying to " "send was modified in between. This is " "probably a bug.\n"); ret = -EIO; @@ -4580,19 +5162,6 @@ join_trans: goto out_finish; while (1) { - /* - * When someone want to commit while we iterate, end the - * joined transaction and rejoin. - */ - if (btrfs_should_end_transaction(trans, send_root)) { - ret = btrfs_end_transaction(trans, send_root); - trans = NULL; - if (ret < 0) - goto out; - btrfs_release_path(path); - goto join_trans; - } - eb = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(eb, &found_key, slot); @@ -4620,12 +5189,6 @@ out_finish: out: btrfs_free_path(path); - if (trans) { - if (!ret) - ret = btrfs_end_transaction(trans, send_root); - else - btrfs_end_transaction(trans, send_root); - } return ret; } @@ -4662,6 +5225,21 @@ out: return ret; } +static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) +{ + spin_lock(&root->root_item_lock); + root->send_in_progress--; + /* + * Not much left to do, we don't know why it's unbalanced and + * can't blindly reset it to 0. 
+ */ + if (root->send_in_progress < 0) + btrfs_err(root->fs_info, + "send_in_progres unbalanced %d root %llu\n", + root->send_in_progress, root->root_key.objectid); + spin_unlock(&root->root_item_lock); +} + long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) { int ret = 0; @@ -4673,6 +5251,9 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) struct send_ctx *sctx = NULL; u32 i; u64 *clone_sources_tmp = NULL; + int clone_sources_to_rollback = 0; + int sort_clone_roots = 0; + int index; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4681,38 +5262,26 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) fs_info = send_root->fs_info; /* + * The subvolume must remain read-only during send, protect against + * making it RW. + */ + spin_lock(&send_root->root_item_lock); + send_root->send_in_progress++; + spin_unlock(&send_root->root_item_lock); + + /* * This is done when we lookup the root, it should already be complete * by the time we get here. */ WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE); /* - * If we just created this root we need to make sure that the orphan - * cleanup has been done and committed since we search the commit root, - * so check its commit root transid with our otransid and if they match - * commit the transaction to make sure everything is updated. + * Userspace tools do the checks and warn the user if it's + * not RO. 
*/ - down_read(&send_root->fs_info->extent_commit_sem); - if (btrfs_header_generation(send_root->commit_root) == - btrfs_root_otransid(&send_root->root_item)) { - struct btrfs_trans_handle *trans; - - up_read(&send_root->fs_info->extent_commit_sem); - - trans = btrfs_attach_transaction_barrier(send_root); - if (IS_ERR(trans)) { - if (PTR_ERR(trans) != -ENOENT) { - ret = PTR_ERR(trans); - goto out; - } - /* ENOENT means theres no transaction */ - } else { - ret = btrfs_commit_transaction(trans, send_root); - if (ret) - goto out; - } - } else { - up_read(&send_root->fs_info->extent_commit_sem); + if (!btrfs_root_readonly(send_root)) { + ret = -EPERM; + goto out; } arg = memdup_user(arg_, sizeof(*arg)); @@ -4753,8 +5322,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) goto out; } - sctx->mnt = mnt_file->f_path.mnt; - sctx->send_root = send_root; sctx->clone_roots_cnt = arg->clone_sources_count; @@ -4771,6 +5338,9 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) goto out; } + sctx->pending_dir_moves = RB_ROOT; + sctx->waiting_dir_moves = RB_ROOT; + sctx->clone_roots = vzalloc(sizeof(struct clone_root) * (arg->clone_sources_count + 1)); if (!sctx->clone_roots) { @@ -4798,11 +5368,27 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) key.objectid = clone_sources_tmp[i]; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + clone_root = btrfs_read_fs_root_no_name(fs_info, &key); if (IS_ERR(clone_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); ret = PTR_ERR(clone_root); goto out; } + clone_sources_to_rollback = i + 1; + spin_lock(&clone_root->root_item_lock); + clone_root->send_in_progress++; + if (!btrfs_root_readonly(clone_root)) { + spin_unlock(&clone_root->root_item_lock); + srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = -EPERM; + goto out; + } + spin_unlock(&clone_root->root_item_lock); + srcu_read_unlock(&fs_info->subvol_srcu, index); + 
sctx->clone_roots[i].root = clone_root; } vfree(clone_sources_tmp); @@ -4813,11 +5399,27 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) key.objectid = arg->parent_root; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key); if (IS_ERR(sctx->parent_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); ret = PTR_ERR(sctx->parent_root); goto out; } + + spin_lock(&sctx->parent_root->root_item_lock); + sctx->parent_root->send_in_progress++; + if (!btrfs_root_readonly(sctx->parent_root)) { + spin_unlock(&sctx->parent_root->root_item_lock); + srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = -EPERM; + goto out; + } + spin_unlock(&sctx->parent_root->root_item_lock); + + srcu_read_unlock(&fs_info->subvol_srcu, index); } /* @@ -4831,6 +5433,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) sort(sctx->clone_roots, sctx->clone_roots_cnt, sizeof(*sctx->clone_roots), __clone_root_cmp_sort, NULL); + sort_clone_roots = 1; ret = send_subvol(sctx); if (ret < 0) @@ -4846,6 +5449,48 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) } out: + WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)); + while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) { + struct rb_node *n; + struct pending_dir_move *pm; + + n = rb_first(&sctx->pending_dir_moves); + pm = rb_entry(n, struct pending_dir_move, node); + while (!list_empty(&pm->list)) { + struct pending_dir_move *pm2; + + pm2 = list_first_entry(&pm->list, + struct pending_dir_move, list); + free_pending_move(sctx, pm2); + } + free_pending_move(sctx, pm); + } + + WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)); + while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) { + struct rb_node *n; + struct waiting_dir_move *dm; + + n = rb_first(&sctx->waiting_dir_moves); + dm = rb_entry(n, struct waiting_dir_move, node); + 
rb_erase(&dm->node, &sctx->waiting_dir_moves); + kfree(dm); + } + + if (sort_clone_roots) { + for (i = 0; i < sctx->clone_roots_cnt; i++) + btrfs_root_dec_send_in_progress( + sctx->clone_roots[i].root); + } else { + for (i = 0; sctx && i < clone_sources_to_rollback; i++) + btrfs_root_dec_send_in_progress( + sctx->clone_roots[i].root); + + btrfs_root_dec_send_in_progress(send_root); + } + if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) + btrfs_root_dec_send_in_progress(sctx->parent_root); + kfree(arg); vfree(clone_sources_tmp); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d71a11d13dfa..d04db817be5c 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -48,6 +48,8 @@ #include "transaction.h" #include "btrfs_inode.h" #include "print-tree.h" +#include "hash.h" +#include "props.h" #include "xattr.h" #include "volumes.h" #include "export.h" @@ -152,11 +154,12 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, vaf.fmt = fmt; vaf.va = &args; - printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: errno=%d %s (%pV)\n", + printk(KERN_CRIT + "BTRFS: error (device %s) in %s:%d: errno=%d %s (%pV)\n", sb->s_id, function, line, errno, errstr, &vaf); va_end(args); } else { - printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: errno=%d %s\n", + printk(KERN_CRIT "BTRFS: error (device %s) in %s:%d: errno=%d %s\n", sb->s_id, function, line, errno, errstr); } @@ -250,7 +253,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, */ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, &root->fs_info->fs_state)) { - WARN(1, KERN_DEBUG "btrfs: Transaction aborted (error %d)\n", + WARN(1, KERN_DEBUG "BTRFS: Transaction aborted (error %d)\n", errno); } trans->aborted = errno; @@ -294,8 +297,8 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", s_id, function, line, &vaf, errno, errstr); - printk(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV 
(errno=%d %s)\n", - s_id, function, line, &vaf, errno, errstr); + btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)", + function, line, &vaf, errno, errstr); va_end(args); /* Caller calls BUG() */ } @@ -322,7 +325,9 @@ enum { Opt_no_space_cache, Opt_recovery, Opt_skip_balance, Opt_check_integrity, Opt_check_integrity_including_extent_data, Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree, - Opt_commit_interval, + Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard, + Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow, + Opt_datasum, Opt_treelog, Opt_noinode_cache, Opt_err, }; @@ -332,8 +337,11 @@ static match_table_t tokens = { {Opt_subvolid, "subvolid=%s"}, {Opt_device, "device=%s"}, {Opt_nodatasum, "nodatasum"}, + {Opt_datasum, "datasum"}, {Opt_nodatacow, "nodatacow"}, + {Opt_datacow, "datacow"}, {Opt_nobarrier, "nobarrier"}, + {Opt_barrier, "barrier"}, {Opt_max_inline, "max_inline=%s"}, {Opt_alloc_start, "alloc_start=%s"}, {Opt_thread_pool, "thread_pool=%d"}, @@ -344,18 +352,25 @@ static match_table_t tokens = { {Opt_ssd, "ssd"}, {Opt_ssd_spread, "ssd_spread"}, {Opt_nossd, "nossd"}, + {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, {Opt_notreelog, "notreelog"}, + {Opt_treelog, "treelog"}, {Opt_flushoncommit, "flushoncommit"}, + {Opt_noflushoncommit, "noflushoncommit"}, {Opt_ratio, "metadata_ratio=%d"}, {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, {Opt_space_cache, "space_cache"}, {Opt_clear_cache, "clear_cache"}, {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, {Opt_enospc_debug, "enospc_debug"}, + {Opt_noenospc_debug, "noenospc_debug"}, {Opt_subvolrootid, "subvolrootid=%d"}, {Opt_defrag, "autodefrag"}, + {Opt_nodefrag, "noautodefrag"}, {Opt_inode_cache, "inode_cache"}, + {Opt_noinode_cache, "noinode_cache"}, {Opt_no_space_cache, "nospace_cache"}, {Opt_recovery, "recovery"}, {Opt_skip_balance, "skip_balance"}, @@ -368,6 +383,20 @@ static match_table_t tokens = { {Opt_err, NULL}, }; +#define 
btrfs_set_and_info(root, opt, fmt, args...) \ +{ \ + if (!btrfs_test_opt(root, opt)) \ + btrfs_info(root->fs_info, fmt, ##args); \ + btrfs_set_opt(root->fs_info->mount_opt, opt); \ +} + +#define btrfs_clear_and_info(root, opt, fmt, args...) \ +{ \ + if (btrfs_test_opt(root, opt)) \ + btrfs_info(root->fs_info, fmt, ##args); \ + btrfs_clear_opt(root->fs_info->mount_opt, opt); \ +} + /* * Regular mount options parser. Everything that is needed only when * reading in a new superblock is parsed here. @@ -383,6 +412,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) int ret = 0; char *compress_type; bool compress_force = false; + bool compress = false; cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); if (cache_gen) @@ -409,7 +439,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) token = match_token(p, tokens, args); switch (token) { case Opt_degraded: - printk(KERN_INFO "btrfs: allowing degraded mounts\n"); + btrfs_info(root->fs_info, "allowing degraded mounts"); btrfs_set_opt(info->mount_opt, DEGRADED); break; case Opt_subvol: @@ -422,27 +452,45 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) */ break; case Opt_nodatasum: - printk(KERN_INFO "btrfs: setting nodatasum\n"); - btrfs_set_opt(info->mount_opt, NODATASUM); + btrfs_set_and_info(root, NODATASUM, + "setting nodatasum"); + break; + case Opt_datasum: + if (btrfs_test_opt(root, NODATASUM)) { + if (btrfs_test_opt(root, NODATACOW)) + btrfs_info(root->fs_info, "setting datasum, datacow enabled"); + else + btrfs_info(root->fs_info, "setting datasum"); + } + btrfs_clear_opt(info->mount_opt, NODATACOW); + btrfs_clear_opt(info->mount_opt, NODATASUM); break; case Opt_nodatacow: - if (!btrfs_test_opt(root, COMPRESS) || - !btrfs_test_opt(root, FORCE_COMPRESS)) { - printk(KERN_INFO "btrfs: setting nodatacow, compression disabled\n"); - } else { - printk(KERN_INFO "btrfs: setting nodatacow\n"); + if (!btrfs_test_opt(root, NODATACOW)) { + if 
(!btrfs_test_opt(root, COMPRESS) || + !btrfs_test_opt(root, FORCE_COMPRESS)) { + btrfs_info(root->fs_info, + "setting nodatacow, compression disabled"); + } else { + btrfs_info(root->fs_info, "setting nodatacow"); + } } btrfs_clear_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); btrfs_set_opt(info->mount_opt, NODATACOW); btrfs_set_opt(info->mount_opt, NODATASUM); break; + case Opt_datacow: + btrfs_clear_and_info(root, NODATACOW, + "setting datacow"); + break; case Opt_compress_force: case Opt_compress_force_type: compress_force = true; /* Fallthrough */ case Opt_compress: case Opt_compress_type: + compress = true; if (token == Opt_compress || token == Opt_compress_force || strcmp(args[0].from, "zlib") == 0) { @@ -469,34 +517,36 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) } if (compress_force) { - btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); - pr_info("btrfs: force %s compression\n", - compress_type); - } else if (btrfs_test_opt(root, COMPRESS)) { - pr_info("btrfs: use %s compression\n", - compress_type); + btrfs_set_and_info(root, FORCE_COMPRESS, + "force %s compression", + compress_type); + } else if (compress) { + if (!btrfs_test_opt(root, COMPRESS)) + btrfs_info(root->fs_info, + "btrfs: use %s compression\n", + compress_type); } break; case Opt_ssd: - printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); - btrfs_set_opt(info->mount_opt, SSD); + btrfs_set_and_info(root, SSD, + "use ssd allocation scheme"); break; case Opt_ssd_spread: - printk(KERN_INFO "btrfs: use spread ssd " - "allocation scheme\n"); - btrfs_set_opt(info->mount_opt, SSD); - btrfs_set_opt(info->mount_opt, SSD_SPREAD); + btrfs_set_and_info(root, SSD_SPREAD, + "use spread ssd allocation scheme"); break; case Opt_nossd: - printk(KERN_INFO "btrfs: not using ssd allocation " - "scheme\n"); - btrfs_set_opt(info->mount_opt, NOSSD); + btrfs_clear_and_info(root, NOSSD, + "not using ssd allocation scheme"); btrfs_clear_opt(info->mount_opt, 
SSD); - btrfs_clear_opt(info->mount_opt, SSD_SPREAD); + break; + case Opt_barrier: + btrfs_clear_and_info(root, NOBARRIER, + "turning on barriers"); break; case Opt_nobarrier: - printk(KERN_INFO "btrfs: turning off barriers\n"); - btrfs_set_opt(info->mount_opt, NOBARRIER); + btrfs_set_and_info(root, NOBARRIER, + "turning off barriers"); break; case Opt_thread_pool: ret = match_int(&args[0], &intarg); @@ -516,11 +566,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) kfree(num); if (info->max_inline) { - info->max_inline = max_t(u64, + info->max_inline = min_t(u64, info->max_inline, root->sectorsize); } - printk(KERN_INFO "btrfs: max_inline at %llu\n", + btrfs_info(root->fs_info, "max_inline at %llu", info->max_inline); } else { ret = -ENOMEM; @@ -534,24 +584,34 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) info->alloc_start = memparse(num, NULL); mutex_unlock(&info->chunk_mutex); kfree(num); - printk(KERN_INFO - "btrfs: allocations start at %llu\n", + btrfs_info(root->fs_info, "allocations start at %llu", info->alloc_start); } else { ret = -ENOMEM; goto out; } break; + case Opt_acl: + root->fs_info->sb->s_flags |= MS_POSIXACL; + break; case Opt_noacl: root->fs_info->sb->s_flags &= ~MS_POSIXACL; break; case Opt_notreelog: - printk(KERN_INFO "btrfs: disabling tree log\n"); - btrfs_set_opt(info->mount_opt, NOTREELOG); + btrfs_set_and_info(root, NOTREELOG, + "disabling tree log"); + break; + case Opt_treelog: + btrfs_clear_and_info(root, NOTREELOG, + "enabling tree log"); break; case Opt_flushoncommit: - printk(KERN_INFO "btrfs: turning on flush-on-commit\n"); - btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT); + btrfs_set_and_info(root, FLUSHONCOMMIT, + "turning on flush-on-commit"); + break; + case Opt_noflushoncommit: + btrfs_clear_and_info(root, FLUSHONCOMMIT, + "turning off flush-on-commit"); break; case Opt_ratio: ret = match_int(&args[0], &intarg); @@ -559,7 +619,7 @@ int btrfs_parse_options(struct btrfs_root *root, char 
*options) goto out; } else if (intarg >= 0) { info->metadata_ratio = intarg; - printk(KERN_INFO "btrfs: metadata ratio %d\n", + btrfs_info(root->fs_info, "metadata ratio %d", info->metadata_ratio); } else { ret = -EINVAL; @@ -567,25 +627,35 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) } break; case Opt_discard: - btrfs_set_opt(info->mount_opt, DISCARD); + btrfs_set_and_info(root, DISCARD, + "turning on discard"); + break; + case Opt_nodiscard: + btrfs_clear_and_info(root, DISCARD, + "turning off discard"); break; case Opt_space_cache: - btrfs_set_opt(info->mount_opt, SPACE_CACHE); + btrfs_set_and_info(root, SPACE_CACHE, + "enabling disk space caching"); break; case Opt_rescan_uuid_tree: btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE); break; case Opt_no_space_cache: - printk(KERN_INFO "btrfs: disabling disk space caching\n"); - btrfs_clear_opt(info->mount_opt, SPACE_CACHE); + btrfs_clear_and_info(root, SPACE_CACHE, + "disabling disk space caching"); break; case Opt_inode_cache: - printk(KERN_INFO "btrfs: enabling inode map caching\n"); - btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); + btrfs_set_and_info(root, CHANGE_INODE_CACHE, + "enabling inode map caching"); + break; + case Opt_noinode_cache: + btrfs_clear_and_info(root, CHANGE_INODE_CACHE, + "disabling inode map caching"); break; case Opt_clear_cache: - printk(KERN_INFO "btrfs: force clearing of disk cache\n"); - btrfs_set_opt(info->mount_opt, CLEAR_CACHE); + btrfs_set_and_info(root, CLEAR_CACHE, + "force clearing of disk cache"); break; case Opt_user_subvol_rm_allowed: btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED); @@ -593,12 +663,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_enospc_debug: btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG); break; + case Opt_noenospc_debug: + btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG); + break; case Opt_defrag: - printk(KERN_INFO "btrfs: enabling auto defrag\n"); - btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); + 
btrfs_set_and_info(root, AUTO_DEFRAG, + "enabling auto defrag"); + break; + case Opt_nodefrag: + btrfs_clear_and_info(root, AUTO_DEFRAG, + "disabling auto defrag"); break; case Opt_recovery: - printk(KERN_INFO "btrfs: enabling auto recovery\n"); + btrfs_info(root->fs_info, "enabling auto recovery"); btrfs_set_opt(info->mount_opt, RECOVERY); break; case Opt_skip_balance: @@ -606,14 +683,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) break; #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY case Opt_check_integrity_including_extent_data: - printk(KERN_INFO "btrfs: enabling check integrity" - " including extent data\n"); + btrfs_info(root->fs_info, + "enabling check integrity including extent data"); btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA); btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); break; case Opt_check_integrity: - printk(KERN_INFO "btrfs: enabling check integrity\n"); + btrfs_info(root->fs_info, "enabling check integrity"); btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); break; case Opt_check_integrity_print_mask: @@ -622,8 +699,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) goto out; } else if (intarg >= 0) { info->check_integrity_print_mask = intarg; - printk(KERN_INFO "btrfs:" - " check_integrity_print_mask 0x%x\n", + btrfs_info(root->fs_info, "check_integrity_print_mask 0x%x", info->check_integrity_print_mask); } else { ret = -EINVAL; @@ -634,8 +710,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_check_integrity_including_extent_data: case Opt_check_integrity: case Opt_check_integrity_print_mask: - printk(KERN_ERR "btrfs: support for check_integrity*" - " not compiled in!\n"); + btrfs_err(root->fs_info, + "support for check_integrity* not compiled in!"); ret = -EINVAL; goto out; #endif @@ -655,28 +731,24 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) intarg = 0; ret = match_int(&args[0], &intarg); if (ret < 0) { - printk(KERN_ERR - "btrfs: 
invalid commit interval\n"); + btrfs_err(root->fs_info, "invalid commit interval"); ret = -EINVAL; goto out; } if (intarg > 0) { if (intarg > 300) { - printk(KERN_WARNING - "btrfs: excessive commit interval %d\n", + btrfs_warn(root->fs_info, "excessive commit interval %d", intarg); } info->commit_interval = intarg; } else { - printk(KERN_INFO - "btrfs: using default commit interval %ds\n", + btrfs_info(root->fs_info, "using default commit interval %ds", BTRFS_DEFAULT_COMMIT_INTERVAL); info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; } break; case Opt_err: - printk(KERN_INFO "btrfs: unrecognized mount option " - "'%s'\n", p); + btrfs_info(root->fs_info, "unrecognized mount option '%s'", p); ret = -EINVAL; goto out; default: @@ -685,7 +757,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) } out: if (!ret && btrfs_test_opt(root, SPACE_CACHE)) - printk(KERN_INFO "btrfs: disk space caching is enabled\n"); + btrfs_info(root->fs_info, "disk space caching is enabled"); kfree(orig); return ret; } @@ -748,7 +820,8 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, break; case Opt_subvolrootid: printk(KERN_WARNING - "btrfs: 'subvolrootid' mount option is deprecated and has no effect\n"); + "BTRFS: 'subvolrootid' mount option is deprecated and has " + "no effect\n"); break; case Opt_device: device_name = match_strdup(&args[0]); @@ -782,6 +855,7 @@ static struct dentry *get_default_root(struct super_block *sb, struct btrfs_path *path; struct btrfs_key location; struct inode *inode; + struct dentry *dentry; u64 dir_id; int new = 0; @@ -852,7 +926,13 @@ setup_root: return dget(sb->s_root); } - return d_obtain_alias(inode); + dentry = d_obtain_alias(inode); + if (!IS_ERR(dentry)) { + spin_lock(&dentry->d_lock); + dentry->d_flags &= ~DCACHE_DISCONNECTED; + spin_unlock(&dentry->d_lock); + } + return dentry; } static int btrfs_fill_super(struct super_block *sb, @@ -877,7 +957,7 @@ static int btrfs_fill_super(struct super_block *sb, 
sb->s_flags |= MS_I_VERSION; err = open_ctree(sb, fs_devices, (char *)data); if (err) { - printk("btrfs: open_ctree failed\n"); + printk(KERN_ERR "BTRFS: open_ctree failed\n"); return err; } @@ -1115,7 +1195,7 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags, dput(root); root = ERR_PTR(-EINVAL); deactivate_locked_super(s); - printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n", + printk(KERN_ERR "BTRFS: '%s' is not a valid subvolume\n", subvol_name); } @@ -1240,7 +1320,7 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, fs_info->thread_pool_size = new_pool_size; - printk(KERN_INFO "btrfs: resize thread pool %d -> %d\n", + btrfs_info(fs_info, "resize thread pool %d -> %d", old_pool_size, new_pool_size); btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size); @@ -1346,7 +1426,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) } else { if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) { btrfs_err(fs_info, - "Remounting read-write after error is not allowed\n"); + "Remounting read-write after error is not allowed"); ret = -EINVAL; goto restore; } @@ -1358,8 +1438,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (fs_info->fs_devices->missing_devices > fs_info->num_tolerated_disk_barrier_failures && !(*flags & MS_RDONLY)) { - printk(KERN_WARNING - "Btrfs: too many missing devices, writeable remount is not allowed\n"); + btrfs_warn(fs_info, + "too many missing devices, writeable remount is not allowed"); ret = -EACCES; goto restore; } @@ -1384,16 +1464,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) ret = btrfs_resume_dev_replace_async(fs_info); if (ret) { - pr_warn("btrfs: failed to resume dev_replace\n"); + btrfs_warn(fs_info, "failed to resume dev_replace"); goto restore; } if (!fs_info->uuid_root) { - pr_info("btrfs: creating UUID tree\n"); + btrfs_info(fs_info, "creating UUID tree"); ret = 
btrfs_create_uuid_tree(fs_info); if (ret) { - pr_warn("btrfs: failed to create the uuid tree" - "%d\n", ret); + btrfs_warn(fs_info, "failed to create the UUID tree %d", ret); goto restore; } } @@ -1773,7 +1852,7 @@ static int btrfs_interface_init(void) static void btrfs_interface_exit(void) { if (misc_deregister(&btrfs_misc) < 0) - printk(KERN_INFO "btrfs: misc_deregister failed for control device\n"); + printk(KERN_INFO "BTRFS: misc_deregister failed for control device\n"); } static void btrfs_print_info(void) @@ -1818,10 +1897,16 @@ static int __init init_btrfs_fs(void) { int err; - err = btrfs_init_sysfs(); + err = btrfs_hash_init(); if (err) return err; + btrfs_props_init(); + + err = btrfs_init_sysfs(); + if (err) + goto free_hash; + btrfs_init_compress(); err = btrfs_init_cachep(); @@ -1895,6 +1980,8 @@ free_cachep: free_compress: btrfs_exit_compress(); btrfs_exit_sysfs(); +free_hash: + btrfs_hash_exit(); return err; } @@ -1913,9 +2000,10 @@ static void __exit exit_btrfs_fs(void) btrfs_exit_sysfs(); btrfs_cleanup_fs_uuids(); btrfs_exit_compress(); + btrfs_hash_exit(); } -module_init(init_btrfs_fs) +late_initcall(init_btrfs_fs); module_exit(exit_btrfs_fs) MODULE_LICENSE("GPL"); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 5b326cd60a4a..865f4cf9a769 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -22,24 +22,647 @@ #include <linux/completion.h> #include <linux/buffer_head.h> #include <linux/kobject.h> +#include <linux/bug.h> +#include <linux/genhd.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" +#include "sysfs.h" +#include "volumes.h" + +static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); + +static u64 get_features(struct btrfs_fs_info *fs_info, + enum btrfs_feature_set set) +{ + struct btrfs_super_block *disk_super = fs_info->super_copy; + if (set == FEAT_COMPAT) + return btrfs_super_compat_flags(disk_super); + else if (set == FEAT_COMPAT_RO) + return btrfs_super_compat_ro_flags(disk_super); + else + 
return btrfs_super_incompat_flags(disk_super); +} + +static void set_features(struct btrfs_fs_info *fs_info, + enum btrfs_feature_set set, u64 features) +{ + struct btrfs_super_block *disk_super = fs_info->super_copy; + if (set == FEAT_COMPAT) + btrfs_set_super_compat_flags(disk_super, features); + else if (set == FEAT_COMPAT_RO) + btrfs_set_super_compat_ro_flags(disk_super, features); + else + btrfs_set_super_incompat_flags(disk_super, features); +} + +static int can_modify_feature(struct btrfs_feature_attr *fa) +{ + int val = 0; + u64 set, clear; + switch (fa->feature_set) { + case FEAT_COMPAT: + set = BTRFS_FEATURE_COMPAT_SAFE_SET; + clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR; + break; + case FEAT_COMPAT_RO: + set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET; + clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR; + break; + case FEAT_INCOMPAT: + set = BTRFS_FEATURE_INCOMPAT_SAFE_SET; + clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR; + break; + default: + printk(KERN_WARNING "btrfs: sysfs: unknown feature set %d\n", + fa->feature_set); + return 0; + } + + if (set & fa->feature_bit) + val |= 1; + if (clear & fa->feature_bit) + val |= 2; + + return val; +} + +static ssize_t btrfs_feature_attr_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + int val = 0; + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a); + if (fs_info) { + u64 features = get_features(fs_info, fa->feature_set); + if (features & fa->feature_bit) + val = 1; + } else + val = can_modify_feature(fa); + + return snprintf(buf, PAGE_SIZE, "%d\n", val); +} + +static ssize_t btrfs_feature_attr_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t count) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a); + struct btrfs_trans_handle *trans; + u64 features, set, clear; + unsigned long val; + int ret; + + fs_info = to_fs_info(kobj); + if (!fs_info) + return -EPERM; + + ret = 
kstrtoul(skip_spaces(buf), 0, &val); + if (ret) + return ret; + + if (fa->feature_set == FEAT_COMPAT) { + set = BTRFS_FEATURE_COMPAT_SAFE_SET; + clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR; + } else if (fa->feature_set == FEAT_COMPAT_RO) { + set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET; + clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR; + } else { + set = BTRFS_FEATURE_INCOMPAT_SAFE_SET; + clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR; + } + + features = get_features(fs_info, fa->feature_set); + + /* Nothing to do */ + if ((val && (features & fa->feature_bit)) || + (!val && !(features & fa->feature_bit))) + return count; + + if ((val && !(set & fa->feature_bit)) || + (!val && !(clear & fa->feature_bit))) { + btrfs_info(fs_info, + "%sabling feature %s on mounted fs is not supported.", + val ? "En" : "Dis", fa->kobj_attr.attr.name); + return -EPERM; + } + + btrfs_info(fs_info, "%s %s feature flag", + val ? "Setting" : "Clearing", fa->kobj_attr.attr.name); + + trans = btrfs_start_transaction(fs_info->fs_root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + spin_lock(&fs_info->super_lock); + features = get_features(fs_info, fa->feature_set); + if (val) + features |= fa->feature_bit; + else + features &= ~fa->feature_bit; + set_features(fs_info, fa->feature_set, features); + spin_unlock(&fs_info->super_lock); + + ret = btrfs_commit_transaction(trans, fs_info->fs_root); + if (ret) + return ret; + + return count; +} + +static umode_t btrfs_feature_visible(struct kobject *kobj, + struct attribute *attr, int unused) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + umode_t mode = attr->mode; + + if (fs_info) { + struct btrfs_feature_attr *fa; + u64 features; + + fa = attr_to_btrfs_feature_attr(attr); + features = get_features(fs_info, fa->feature_set); + + if (can_modify_feature(fa)) + mode |= S_IWUSR; + else if (!(features & fa->feature_bit)) + mode = 0; + } + + return mode; +} + +BTRFS_FEAT_ATTR_INCOMPAT(mixed_backref, MIXED_BACKREF); +BTRFS_FEAT_ATTR_INCOMPAT(default_subvol, 
DEFAULT_SUBVOL); +BTRFS_FEAT_ATTR_INCOMPAT(mixed_groups, MIXED_GROUPS); +BTRFS_FEAT_ATTR_INCOMPAT(compress_lzo, COMPRESS_LZO); +BTRFS_FEAT_ATTR_INCOMPAT(big_metadata, BIG_METADATA); +BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF); +BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56); +BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA); +BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); + +static struct attribute *btrfs_supported_feature_attrs[] = { + BTRFS_FEAT_ATTR_PTR(mixed_backref), + BTRFS_FEAT_ATTR_PTR(default_subvol), + BTRFS_FEAT_ATTR_PTR(mixed_groups), + BTRFS_FEAT_ATTR_PTR(compress_lzo), + BTRFS_FEAT_ATTR_PTR(big_metadata), + BTRFS_FEAT_ATTR_PTR(extended_iref), + BTRFS_FEAT_ATTR_PTR(raid56), + BTRFS_FEAT_ATTR_PTR(skinny_metadata), + BTRFS_FEAT_ATTR_PTR(no_holes), + NULL +}; + +static const struct attribute_group btrfs_feature_attr_group = { + .name = "features", + .is_visible = btrfs_feature_visible, + .attrs = btrfs_supported_feature_attrs, +}; + +static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf) +{ + u64 val; + if (lock) + spin_lock(lock); + val = *value_ptr; + if (lock) + spin_unlock(lock); + return snprintf(buf, PAGE_SIZE, "%llu\n", val); +} + +static ssize_t global_rsv_size_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent); + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; + return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf); +} +BTRFS_ATTR(global_rsv_size, 0444, global_rsv_size_show); + +static ssize_t global_rsv_reserved_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent); + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; + return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf); +} +BTRFS_ATTR(global_rsv_reserved, 0444, global_rsv_reserved_show); + +#define to_space_info(_kobj) container_of(_kobj, struct 
btrfs_space_info, kobj) + +static ssize_t raid_bytes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf); +BTRFS_RAID_ATTR(total_bytes, raid_bytes_show); +BTRFS_RAID_ATTR(used_bytes, raid_bytes_show); + +static ssize_t raid_bytes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) + +{ + struct btrfs_space_info *sinfo = to_space_info(kobj->parent); + struct btrfs_block_group_cache *block_group; + int index = kobj - sinfo->block_group_kobjs; + u64 val = 0; + + down_read(&sinfo->groups_sem); + list_for_each_entry(block_group, &sinfo->block_groups[index], list) { + if (&attr->attr == BTRFS_RAID_ATTR_PTR(total_bytes)) + val += block_group->key.offset; + else + val += btrfs_block_group_used(&block_group->item); + } + up_read(&sinfo->groups_sem); + return snprintf(buf, PAGE_SIZE, "%llu\n", val); +} + +static struct attribute *raid_attributes[] = { + BTRFS_RAID_ATTR_PTR(total_bytes), + BTRFS_RAID_ATTR_PTR(used_bytes), + NULL +}; + +static void release_raid_kobj(struct kobject *kobj) +{ + kobject_put(kobj->parent); +} + +struct kobj_type btrfs_raid_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = release_raid_kobj, + .default_attrs = raid_attributes, +}; + +#define SPACE_INFO_ATTR(field) \ +static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \ + struct kobj_attribute *a, \ + char *buf) \ +{ \ + struct btrfs_space_info *sinfo = to_space_info(kobj); \ + return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \ +} \ +BTRFS_ATTR(field, 0444, btrfs_space_info_show_##field) + +static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_space_info *sinfo = to_space_info(kobj); + s64 val = percpu_counter_sum(&sinfo->total_bytes_pinned); + return snprintf(buf, PAGE_SIZE, "%lld\n", val); +} + +SPACE_INFO_ATTR(flags); +SPACE_INFO_ATTR(total_bytes); +SPACE_INFO_ATTR(bytes_used); +SPACE_INFO_ATTR(bytes_pinned); +SPACE_INFO_ATTR(bytes_reserved); 
+SPACE_INFO_ATTR(bytes_may_use); +SPACE_INFO_ATTR(disk_used); +SPACE_INFO_ATTR(disk_total); +BTRFS_ATTR(total_bytes_pinned, 0444, btrfs_space_info_show_total_bytes_pinned); + +static struct attribute *space_info_attrs[] = { + BTRFS_ATTR_PTR(flags), + BTRFS_ATTR_PTR(total_bytes), + BTRFS_ATTR_PTR(bytes_used), + BTRFS_ATTR_PTR(bytes_pinned), + BTRFS_ATTR_PTR(bytes_reserved), + BTRFS_ATTR_PTR(bytes_may_use), + BTRFS_ATTR_PTR(disk_used), + BTRFS_ATTR_PTR(disk_total), + BTRFS_ATTR_PTR(total_bytes_pinned), + NULL, +}; + +static void space_info_release(struct kobject *kobj) +{ + struct btrfs_space_info *sinfo = to_space_info(kobj); + percpu_counter_destroy(&sinfo->total_bytes_pinned); + kfree(sinfo); +} + +struct kobj_type space_info_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = space_info_release, + .default_attrs = space_info_attrs, +}; + +static const struct attribute *allocation_attrs[] = { + BTRFS_ATTR_PTR(global_rsv_reserved), + BTRFS_ATTR_PTR(global_rsv_size), + NULL, +}; + +static ssize_t btrfs_label_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + return snprintf(buf, PAGE_SIZE, "%s\n", fs_info->super_copy->label); +} + +static ssize_t btrfs_label_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + struct btrfs_trans_handle *trans; + struct btrfs_root *root = fs_info->fs_root; + int ret; + + if (len >= BTRFS_LABEL_SIZE) { + pr_err("BTRFS: unable to set label with more than %d bytes\n", + BTRFS_LABEL_SIZE - 1); + return -EINVAL; + } + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + spin_lock(&root->fs_info->super_lock); + strcpy(fs_info->super_copy->label, buf); + spin_unlock(&root->fs_info->super_lock); + ret = btrfs_commit_transaction(trans, root); + + if (!ret) + return len; + + return ret; +} +BTRFS_ATTR_RW(label, 0644, btrfs_label_show, 
btrfs_label_store); + +static struct attribute *btrfs_attrs[] = { + BTRFS_ATTR_PTR(label), + NULL, +}; + +static void btrfs_release_super_kobj(struct kobject *kobj) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + complete(&fs_info->kobj_unregister); +} + +static struct kobj_type btrfs_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = btrfs_release_super_kobj, + .default_attrs = btrfs_attrs, +}; + +static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) +{ + if (kobj->ktype != &btrfs_ktype) + return NULL; + return container_of(kobj, struct btrfs_fs_info, super_kobj); +} + +#define NUM_FEATURE_BITS 64 +static char btrfs_unknown_feature_names[3][NUM_FEATURE_BITS][13]; +static struct btrfs_feature_attr btrfs_feature_attrs[3][NUM_FEATURE_BITS]; + +static u64 supported_feature_masks[3] = { + [FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP, + [FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP, + [FEAT_INCOMPAT] = BTRFS_FEATURE_INCOMPAT_SUPP, +}; + +static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) +{ + int set; + + for (set = 0; set < FEAT_MAX; set++) { + int i; + struct attribute *attrs[2]; + struct attribute_group agroup = { + .name = "features", + .attrs = attrs, + }; + u64 features = get_features(fs_info, set); + features &= ~supported_feature_masks[set]; + + if (!features) + continue; + + attrs[1] = NULL; + for (i = 0; i < NUM_FEATURE_BITS; i++) { + struct btrfs_feature_attr *fa; + + if (!(features & (1ULL << i))) + continue; + + fa = &btrfs_feature_attrs[set][i]; + attrs[0] = &fa->kobj_attr.attr; + if (add) { + int ret; + ret = sysfs_merge_group(&fs_info->super_kobj, + &agroup); + if (ret) + return ret; + } else + sysfs_unmerge_group(&fs_info->super_kobj, + &agroup); + } + + } + return 0; +} + +static void __btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) +{ + kobject_del(&fs_info->super_kobj); + kobject_put(&fs_info->super_kobj); + wait_for_completion(&fs_info->kobj_unregister); +} + +void 
btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) +{ + if (fs_info->space_info_kobj) { + sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs); + kobject_del(fs_info->space_info_kobj); + kobject_put(fs_info->space_info_kobj); + } + kobject_del(fs_info->device_dir_kobj); + kobject_put(fs_info->device_dir_kobj); + addrm_unknown_feature_attrs(fs_info, false); + sysfs_remove_group(&fs_info->super_kobj, &btrfs_feature_attr_group); + __btrfs_sysfs_remove_one(fs_info); +} + +const char * const btrfs_feature_set_names[3] = { + [FEAT_COMPAT] = "compat", + [FEAT_COMPAT_RO] = "compat_ro", + [FEAT_INCOMPAT] = "incompat", +}; + +char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags) +{ + size_t bufsize = 4096; /* safe max, 64 names * 64 bytes */ + int len = 0; + int i; + char *str; + + str = kmalloc(bufsize, GFP_KERNEL); + if (!str) + return str; + + for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) { + const char *name; + + if (!(flags & (1ULL << i))) + continue; + + name = btrfs_feature_attrs[set][i].kobj_attr.attr.name; + len += snprintf(str + len, bufsize - len, "%s%s", + len ? 
"," : "", name); + } + + return str; +} + +static void init_feature_attrs(void) +{ + struct btrfs_feature_attr *fa; + int set, i; + + BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names) != + ARRAY_SIZE(btrfs_feature_attrs)); + BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names[0]) != + ARRAY_SIZE(btrfs_feature_attrs[0])); + + memset(btrfs_feature_attrs, 0, sizeof(btrfs_feature_attrs)); + memset(btrfs_unknown_feature_names, 0, + sizeof(btrfs_unknown_feature_names)); + + for (i = 0; btrfs_supported_feature_attrs[i]; i++) { + struct btrfs_feature_attr *sfa; + struct attribute *a = btrfs_supported_feature_attrs[i]; + int bit; + sfa = attr_to_btrfs_feature_attr(a); + bit = ilog2(sfa->feature_bit); + fa = &btrfs_feature_attrs[sfa->feature_set][bit]; + + fa->kobj_attr.attr.name = sfa->kobj_attr.attr.name; + } + + for (set = 0; set < FEAT_MAX; set++) { + for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) { + char *name = btrfs_unknown_feature_names[set][i]; + fa = &btrfs_feature_attrs[set][i]; + + if (fa->kobj_attr.attr.name) + continue; + + snprintf(name, 13, "%s:%u", + btrfs_feature_set_names[set], i); + + fa->kobj_attr.attr.name = name; + fa->kobj_attr.attr.mode = S_IRUGO; + fa->feature_set = set; + fa->feature_bit = 1ULL << i; + } + } +} + +static int add_device_membership(struct btrfs_fs_info *fs_info) +{ + int error = 0; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *dev; + + fs_info->device_dir_kobj = kobject_create_and_add("devices", + &fs_info->super_kobj); + if (!fs_info->device_dir_kobj) + return -ENOMEM; + + list_for_each_entry(dev, &fs_devices->devices, dev_list) { + struct hd_struct *disk; + struct kobject *disk_kobj; + + if (!dev->bdev) + continue; + + disk = dev->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; + + error = sysfs_create_link(fs_info->device_dir_kobj, + disk_kobj, disk_kobj->name); + if (error) + break; + } + + return error; +} /* /sys/fs/btrfs/ entry */ static struct kset *btrfs_kset; +int 
btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) +{ + int error; + + init_completion(&fs_info->kobj_unregister); + fs_info->super_kobj.kset = btrfs_kset; + error = kobject_init_and_add(&fs_info->super_kobj, &btrfs_ktype, NULL, + "%pU", fs_info->fsid); + if (error) + return error; + + error = sysfs_create_group(&fs_info->super_kobj, + &btrfs_feature_attr_group); + if (error) { + __btrfs_sysfs_remove_one(fs_info); + return error; + } + + error = addrm_unknown_feature_attrs(fs_info, true); + if (error) + goto failure; + + error = add_device_membership(fs_info); + if (error) + goto failure; + + fs_info->space_info_kobj = kobject_create_and_add("allocation", + &fs_info->super_kobj); + if (!fs_info->space_info_kobj) { + error = -ENOMEM; + goto failure; + } + + error = sysfs_create_files(fs_info->space_info_kobj, allocation_attrs); + if (error) + goto failure; + + return 0; +failure: + btrfs_sysfs_remove_one(fs_info); + return error; +} + int btrfs_init_sysfs(void) { + int ret; btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj); if (!btrfs_kset) return -ENOMEM; + + init_feature_attrs(); + + ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); + if (ret) { + kset_unregister(btrfs_kset); + return ret; + } + return 0; } void btrfs_exit_sysfs(void) { + sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); kset_unregister(btrfs_kset); } diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h new file mode 100644 index 000000000000..f3cea3710d44 --- /dev/null +++ b/fs/btrfs/sysfs.h @@ -0,0 +1,64 @@ +#ifndef _BTRFS_SYSFS_H_ +#define _BTRFS_SYSFS_H_ + +enum btrfs_feature_set { + FEAT_COMPAT, + FEAT_COMPAT_RO, + FEAT_INCOMPAT, + FEAT_MAX +}; + +#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \ +{ \ + .attr = { .name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ +} + +#define BTRFS_ATTR_RW(_name, _mode, _show, _store) \ +static struct kobj_attribute btrfs_attr_##_name = \ + __INIT_KOBJ_ATTR(_name, _mode, 
_show, _store) +#define BTRFS_ATTR(_name, _mode, _show) \ + BTRFS_ATTR_RW(_name, _mode, _show, NULL) +#define BTRFS_ATTR_PTR(_name) (&btrfs_attr_##_name.attr) + +#define BTRFS_RAID_ATTR(_name, _show) \ +static struct kobj_attribute btrfs_raid_attr_##_name = \ + __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) +#define BTRFS_RAID_ATTR_PTR(_name) (&btrfs_raid_attr_##_name.attr) + + +struct btrfs_feature_attr { + struct kobj_attribute kobj_attr; + enum btrfs_feature_set feature_set; + u64 feature_bit; +}; + +#define BTRFS_FEAT_ATTR(_name, _feature_set, _prefix, _feature_bit) \ +static struct btrfs_feature_attr btrfs_attr_##_name = { \ + .kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, \ + btrfs_feature_attr_show, \ + btrfs_feature_attr_store), \ + .feature_set = _feature_set, \ + .feature_bit = _prefix ##_## _feature_bit, \ +} +#define BTRFS_FEAT_ATTR_PTR(_name) (&btrfs_attr_##_name.kobj_attr.attr) + +#define BTRFS_FEAT_ATTR_COMPAT(name, feature) \ + BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature) +#define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \ + BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT, feature) +#define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \ + BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature) + +/* convert from attribute */ +#define to_btrfs_feature_attr(a) \ + container_of(a, struct btrfs_feature_attr, kobj_attr) +#define attr_to_btrfs_attr(a) container_of(a, struct kobj_attribute, attr) +#define attr_to_btrfs_feature_attr(a) \ + to_btrfs_feature_attr(attr_to_btrfs_attr(a)) +char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); +extern const char * const btrfs_feature_set_names[3]; +extern struct kobj_type space_info_ktype; +extern struct kobj_type btrfs_raid_ktype; +#endif /* _BTRFS_SYSFS_H_ */ diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index b353bc806ca0..312560a9123d 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -21,7 +21,7 @@ 
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -#define test_msg(fmt, ...) pr_info("btrfs: selftest: " fmt, ##__VA_ARGS__) +#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__) int btrfs_test_free_space_cache(void); int btrfs_test_extent_buffer_operations(void); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c6a872a8a468..34cd83184c4a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -62,7 +62,7 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) WARN_ON(atomic_read(&transaction->use_count) == 0); if (atomic_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); - WARN_ON(transaction->delayed_refs.root.rb_node); + WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root)); while (!list_empty(&transaction->pending_chunks)) { struct extent_map *em; @@ -183,8 +183,8 @@ loop: atomic_set(&cur_trans->use_count, 2); cur_trans->start_time = get_seconds(); - cur_trans->delayed_refs.root = RB_ROOT; - cur_trans->delayed_refs.num_entries = 0; + cur_trans->delayed_refs.href_root = RB_ROOT; + atomic_set(&cur_trans->delayed_refs.num_entries, 0); cur_trans->delayed_refs.num_heads_ready = 0; cur_trans->delayed_refs.num_heads = 0; cur_trans->delayed_refs.flushing = 0; @@ -196,17 +196,14 @@ loop: */ smp_mb(); if (!list_empty(&fs_info->tree_mod_seq_list)) - WARN(1, KERN_ERR "btrfs: tree_mod_seq_list not empty when " + WARN(1, KERN_ERR "BTRFS: tree_mod_seq_list not empty when " "creating a fresh transaction\n"); if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) - WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when " + WARN(1, KERN_ERR "BTRFS: tree_mod_log rb tree not empty when " "creating a fresh transaction\n"); atomic64_set(&fs_info->tree_mod_seq, 0); spin_lock_init(&cur_trans->delayed_refs.lock); - atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0); - atomic_set(&cur_trans->delayed_refs.ref_seq, 0); - init_waitqueue_head(&cur_trans->delayed_refs.wait); 
INIT_LIST_HEAD(&cur_trans->pending_snapshots); INIT_LIST_HEAD(&cur_trans->ordered_operations); @@ -472,6 +469,7 @@ again: h->type = type; h->allocating_chunk = false; h->reloc_reserved = false; + h->sync = false; INIT_LIST_HEAD(&h->qgroup_ref_list); INIT_LIST_HEAD(&h->new_bgs); @@ -647,7 +645,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { if (root->fs_info->global_block_rsv.space_info->full && - btrfs_should_throttle_delayed_refs(trans, root)) + btrfs_check_space_for_delayed_refs(trans, root)) return 1; return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5); @@ -711,8 +709,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_create_pending_block_groups(trans, root); trans->delayed_ref_updates = 0; - if (btrfs_should_throttle_delayed_refs(trans, root)) { - cur = max_t(unsigned long, cur, 1); + if (!trans->sync && btrfs_should_throttle_delayed_refs(trans, root)) { + cur = max_t(unsigned long, cur, 32); trans->delayed_ref_updates = 0; btrfs_run_delayed_refs(trans, root, cur); } @@ -788,12 +786,6 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, return __btrfs_end_transaction(trans, root, 1); } -int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - return __btrfs_end_transaction(trans, root, 1); -} - /* * when btree blocks are allocated, they have some corresponding bits set for * them in one of two extent_io trees. 
This is used to make sure all of @@ -1105,7 +1097,7 @@ int btrfs_defrag_root(struct btrfs_root *root) break; if (btrfs_defrag_cancelled(root->fs_info)) { - printk(KERN_DEBUG "btrfs: defrag_root cancelled\n"); + pr_debug("BTRFS: defrag_root cancelled\n"); ret = -EAGAIN; break; } @@ -1746,6 +1738,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, goto cleanup_transaction; btrfs_wait_delalloc_flush(root->fs_info); + + btrfs_scrub_pause(root); /* * Ok now we need to make sure to block out any other joins while we * commit the transaction. We could have started a join before setting @@ -1810,7 +1804,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, WARN_ON(cur_trans != trans->transaction); - btrfs_scrub_pause(root); /* btrfs_commit_tree_roots is responsible for getting the * various roots consistent with each other. Every pointer * in the tree of tree roots has to point to the most up to date @@ -1833,6 +1826,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, goto cleanup_transaction; } + /* + * Since the transaction is done, we should set the inode map cache flag + * before any other comming transaction. + */ + if (btrfs_test_opt(root, CHANGE_INODE_CACHE)) + btrfs_set_opt(root->fs_info->mount_opt, INODE_MAP_CACHE); + else + btrfs_clear_opt(root->fs_info->mount_opt, INODE_MAP_CACHE); + /* commit_fs_roots gets rid of all the tree log roots, it is now * safe to free the root of tree log roots */ @@ -1975,10 +1977,23 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) } root = list_first_entry(&fs_info->dead_roots, struct btrfs_root, root_list); + /* + * Make sure root is not involved in send, + * if we fail with first root, we return + * directly rather than continue. 
+ */ + spin_lock(&root->root_item_lock); + if (root->send_in_progress) { + spin_unlock(&fs_info->trans_lock); + spin_unlock(&root->root_item_lock); + return 0; + } + spin_unlock(&root->root_item_lock); + list_del_init(&root->root_list); spin_unlock(&fs_info->trans_lock); - pr_debug("btrfs: cleaner removing %llu\n", root->objectid); + pr_debug("BTRFS: cleaner removing %llu\n", root->objectid); btrfs_kill_all_delayed_nodes(root); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 7657d115067d..6ac037e9f9f0 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -93,6 +93,7 @@ struct btrfs_trans_handle { short adding_csums; bool allocating_chunk; bool reloc_reserved; + bool sync; unsigned int type; /* * this root is only needed to validate that the root passed to @@ -154,8 +155,6 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, int wait_for_unblock); int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans, - struct btrfs_root *root); int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_throttle(struct btrfs_root *root); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9f7fc51ca334..39d83da03e03 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -570,7 +570,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, if (btrfs_file_extent_disk_bytenr(eb, item) == 0) nbytes = 0; } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - size = btrfs_file_extent_inline_len(eb, item); + size = btrfs_file_extent_inline_len(eb, slot, item); nbytes = btrfs_file_extent_ram_bytes(eb, item); extent_end = ALIGN(start + size, root->sectorsize); } else { @@ -1238,7 +1238,8 @@ static int insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset) { int ret; - ret = btrfs_find_orphan_item(root, offset); + ret = 
btrfs_find_item(root, NULL, BTRFS_ORPHAN_OBJECTID, + offset, BTRFS_ORPHAN_ITEM_KEY, NULL); if (ret > 0) ret = btrfs_insert_orphan_item(trans, root, offset); return ret; @@ -3194,7 +3195,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans, static noinline int copy_items(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_path *dst_path, - struct extent_buffer *src, + struct btrfs_path *src_path, u64 *last_extent, int start_slot, int nr, int inode_only) { unsigned long src_offset; @@ -3202,6 +3203,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_root *log = BTRFS_I(inode)->root->log_root; struct btrfs_file_extent_item *extent; struct btrfs_inode_item *inode_item; + struct extent_buffer *src = src_path->nodes[0]; + struct btrfs_key first_key, last_key, key; int ret; struct btrfs_key *ins_keys; u32 *ins_sizes; @@ -3209,6 +3212,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, int i; struct list_head ordered_sums; int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + bool has_extents = false; + bool need_find_last_extent = (*last_extent == 0); + bool done = false; INIT_LIST_HEAD(&ordered_sums); @@ -3217,6 +3223,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, if (!ins_data) return -ENOMEM; + first_key.objectid = (u64)-1; + ins_sizes = (u32 *)ins_data; ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); @@ -3237,6 +3245,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, src_offset = btrfs_item_ptr_offset(src, start_slot + i); + if ((i == (nr - 1))) + last_key = ins_keys[i]; + if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_path->slots[0], @@ -3248,6 +3259,21 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, src_offset, ins_sizes[i]); } + /* + * We set need_find_last_extent here in case we know we were + * processing other items and then walk into the first 
extent in + * the inode. If we don't hit an extent then nothing changes, + * we'll do the last search the next time around. + */ + if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { + has_extents = true; + if (need_find_last_extent && + first_key.objectid == (u64)-1) + first_key = ins_keys[i]; + } else { + need_find_last_extent = false; + } + /* take a reference on file data extents so that truncates * or deletes of this inode don't have to relog the inode * again @@ -3312,6 +3338,128 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, list_del(&sums->list); kfree(sums); } + + if (!has_extents) + return ret; + + /* + * Because we use btrfs_search_forward we could skip leaves that were + * not modified and then assume *last_extent is valid when it really + * isn't. So back up to the previous leaf and read the end of the last + * extent before we go and fill in holes. + */ + if (need_find_last_extent) { + u64 len; + + ret = btrfs_prev_leaf(BTRFS_I(inode)->root, src_path); + if (ret < 0) + return ret; + if (ret) + goto fill_holes; + if (src_path->slots[0]) + src_path->slots[0]--; + src = src_path->nodes[0]; + btrfs_item_key_to_cpu(src, &key, src_path->slots[0]); + if (key.objectid != btrfs_ino(inode) || + key.type != BTRFS_EXTENT_DATA_KEY) + goto fill_holes; + extent = btrfs_item_ptr(src, src_path->slots[0], + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(src, extent) == + BTRFS_FILE_EXTENT_INLINE) { + len = btrfs_file_extent_inline_len(src, + src_path->slots[0], + extent); + *last_extent = ALIGN(key.offset + len, + log->sectorsize); + } else { + len = btrfs_file_extent_num_bytes(src, extent); + *last_extent = key.offset + len; + } + } +fill_holes: + /* So we did prev_leaf, now we need to move to the next leaf, but a few + * things could have happened + * + * 1) A merge could have happened, so we could currently be on a leaf + * that holds what we were copying in the first place. 
+ * 2) A split could have happened, and now not all of the items we want + * are on the same leaf. + * + * So we need to adjust how we search for holes, we need to drop the + * path and re-search for the first extent key we found, and then walk + * forward until we hit the last one we copied. + */ + if (need_find_last_extent) { + /* btrfs_prev_leaf could return 1 without releasing the path */ + btrfs_release_path(src_path); + ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &first_key, + src_path, 0, 0); + if (ret < 0) + return ret; + ASSERT(ret == 0); + src = src_path->nodes[0]; + i = src_path->slots[0]; + } else { + i = start_slot; + } + + /* + * Ok so here we need to go through and fill in any holes we may have + * to make sure that holes are punched for those areas in case they had + * extents previously. + */ + while (!done) { + u64 offset, len; + u64 extent_end; + + if (i >= btrfs_header_nritems(src_path->nodes[0])) { + ret = btrfs_next_leaf(BTRFS_I(inode)->root, src_path); + if (ret < 0) + return ret; + ASSERT(ret == 0); + src = src_path->nodes[0]; + i = 0; + } + + btrfs_item_key_to_cpu(src, &key, i); + if (!btrfs_comp_cpu_keys(&key, &last_key)) + done = true; + if (key.objectid != btrfs_ino(inode) || + key.type != BTRFS_EXTENT_DATA_KEY) { + i++; + continue; + } + extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(src, extent) == + BTRFS_FILE_EXTENT_INLINE) { + len = btrfs_file_extent_inline_len(src, i, extent); + extent_end = ALIGN(key.offset + len, log->sectorsize); + } else { + len = btrfs_file_extent_num_bytes(src, extent); + extent_end = key.offset + len; + } + i++; + + if (*last_extent == key.offset) { + *last_extent = extent_end; + continue; + } + offset = *last_extent; + len = key.offset - *last_extent; + ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode), + offset, 0, 0, len, 0, len, 0, + 0, 0); + if (ret) + break; + *last_extent = offset + len; + } + /* + * Need to let the callers know we 
dropped the path so they should + * re-search. + */ + if (!ret && need_find_last_extent) + ret = 1; return ret; } @@ -3349,21 +3497,27 @@ static int log_one_extent(struct btrfs_trans_handle *trans, int ret; int index = log->log_transid % 2; bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - - ret = __btrfs_drop_extents(trans, log, inode, path, em->start, - em->start + em->len, NULL, 0); - if (ret) - return ret; + int extent_inserted = 0; INIT_LIST_HEAD(&ordered_sums); btrfs_init_map_token(&token); - key.objectid = btrfs_ino(inode); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = em->start; - ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi)); + ret = __btrfs_drop_extents(trans, log, inode, path, em->start, + em->start + em->len, NULL, 0, 1, + sizeof(*fi), &extent_inserted); if (ret) return ret; + + if (!extent_inserted) { + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = em->start; + + ret = btrfs_insert_empty_item(trans, log, path, &key, + sizeof(*fi)); + if (ret) + return ret; + } leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -3485,7 +3639,11 @@ again: * start over after this. 
*/ - wait_event(ordered->wait, ordered->csum_bytes_left == 0); + if (ordered->csum_bytes_left) { + btrfs_start_ordered_extent(inode, ordered, 0); + wait_event(ordered->wait, + ordered->csum_bytes_left == 0); + } list_for_each_entry(sum, &ordered->list, list) { ret = btrfs_csum_file_blocks(trans, log, sum); @@ -3630,6 +3788,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_key max_key; struct btrfs_root *log = root->log_root; struct extent_buffer *src = NULL; + u64 last_extent = 0; int err = 0; int ret; int nritems; @@ -3745,11 +3904,15 @@ again: goto next_slot; } - ret = copy_items(trans, inode, dst_path, src, ins_start_slot, - ins_nr, inode_only); - if (ret) { + ret = copy_items(trans, inode, dst_path, path, &last_extent, + ins_start_slot, ins_nr, inode_only); + if (ret < 0) { err = ret; goto out_unlock; + } if (ret) { + ins_nr = 0; + btrfs_release_path(path); + continue; } ins_nr = 1; ins_start_slot = path->slots[0]; @@ -3763,13 +3926,14 @@ next_slot: goto again; } if (ins_nr) { - ret = copy_items(trans, inode, dst_path, src, - ins_start_slot, + ret = copy_items(trans, inode, dst_path, path, + &last_extent, ins_start_slot, ins_nr, inode_only); - if (ret) { + if (ret < 0) { err = ret; goto out_unlock; } + ret = 0; ins_nr = 0; } btrfs_release_path(path); @@ -3784,12 +3948,13 @@ next_slot: } } if (ins_nr) { - ret = copy_items(trans, inode, dst_path, src, ins_start_slot, - ins_nr, inode_only); - if (ret) { + ret = copy_items(trans, inode, dst_path, path, &last_extent, + ins_start_slot, ins_nr, inode_only); + if (ret < 0) { err = ret; goto out_unlock; } + ret = 0; ins_nr = 0; } diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index b0a523b2c60e..840a38b2778a 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -5,8 +5,8 @@ */ #include <linux/slab.h> -#include <linux/export.h> #include "ulist.h" +#include "ctree.h" /* * ulist is a generic data structure to hold a collection of unique u64 @@ -14,10 +14,6 @@ * enumerating it. 
* It is possible to store an auxiliary value along with the key. * - * The implementation is preliminary and can probably be sped up - * significantly. A first step would be to store the values in an rbtree - * as soon as ULIST_SIZE is exceeded. - * * A sample usage for ulists is the enumeration of directed graphs without * visiting a node twice. The pseudo-code could look like this: * @@ -50,12 +46,10 @@ */ void ulist_init(struct ulist *ulist) { - ulist->nnodes = 0; - ulist->nodes = ulist->int_nodes; - ulist->nodes_alloced = ULIST_SIZE; + INIT_LIST_HEAD(&ulist->nodes); ulist->root = RB_ROOT; + ulist->nnodes = 0; } -EXPORT_SYMBOL(ulist_init); /** * ulist_fini - free up additionally allocated memory for the ulist @@ -64,18 +58,17 @@ EXPORT_SYMBOL(ulist_init); * This is useful in cases where the base 'struct ulist' has been statically * allocated. */ -void ulist_fini(struct ulist *ulist) +static void ulist_fini(struct ulist *ulist) { - /* - * The first ULIST_SIZE elements are stored inline in struct ulist. - * Only if more elements are alocated they need to be freed. 
- */ - if (ulist->nodes_alloced > ULIST_SIZE) - kfree(ulist->nodes); - ulist->nodes_alloced = 0; /* in case ulist_fini is called twice */ + struct ulist_node *node; + struct ulist_node *next; + + list_for_each_entry_safe(node, next, &ulist->nodes, list) { + kfree(node); + } ulist->root = RB_ROOT; + INIT_LIST_HEAD(&ulist->nodes); } -EXPORT_SYMBOL(ulist_fini); /** * ulist_reinit - prepare a ulist for reuse @@ -89,7 +82,6 @@ void ulist_reinit(struct ulist *ulist) ulist_fini(ulist); ulist_init(ulist); } -EXPORT_SYMBOL(ulist_reinit); /** * ulist_alloc - dynamically allocate a ulist @@ -108,7 +100,6 @@ struct ulist *ulist_alloc(gfp_t gfp_mask) return ulist; } -EXPORT_SYMBOL(ulist_alloc); /** * ulist_free - free dynamically allocated ulist @@ -123,7 +114,6 @@ void ulist_free(struct ulist *ulist) ulist_fini(ulist); kfree(ulist); } -EXPORT_SYMBOL(ulist_free); static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val) { @@ -192,63 +182,32 @@ int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask) int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, u64 *old_aux, gfp_t gfp_mask) { - int ret = 0; - struct ulist_node *node = NULL; + int ret; + struct ulist_node *node; + node = ulist_rbtree_search(ulist, val); if (node) { if (old_aux) *old_aux = node->aux; return 0; } + node = kmalloc(sizeof(*node), gfp_mask); + if (!node) + return -ENOMEM; - if (ulist->nnodes >= ulist->nodes_alloced) { - u64 new_alloced = ulist->nodes_alloced + 128; - struct ulist_node *new_nodes; - void *old = NULL; - int i; - - for (i = 0; i < ulist->nnodes; i++) - rb_erase(&ulist->nodes[i].rb_node, &ulist->root); - - /* - * if nodes_alloced == ULIST_SIZE no memory has been allocated - * yet, so pass NULL to krealloc - */ - if (ulist->nodes_alloced > ULIST_SIZE) - old = ulist->nodes; + node->val = val; + node->aux = aux; +#ifdef CONFIG_BTRFS_DEBUG + node->seqnum = ulist->nnodes; +#endif - new_nodes = krealloc(old, sizeof(*new_nodes) * new_alloced, - gfp_mask); - if 
(!new_nodes) - return -ENOMEM; - - if (!old) - memcpy(new_nodes, ulist->int_nodes, - sizeof(ulist->int_nodes)); - - ulist->nodes = new_nodes; - ulist->nodes_alloced = new_alloced; - - /* - * krealloc actually uses memcpy, which does not copy rb_node - * pointers, so we have to do it ourselves. Otherwise we may - * be bitten by crashes. - */ - for (i = 0; i < ulist->nnodes; i++) { - ret = ulist_rbtree_insert(ulist, &ulist->nodes[i]); - if (ret < 0) - return ret; - } - } - ulist->nodes[ulist->nnodes].val = val; - ulist->nodes[ulist->nnodes].aux = aux; - ret = ulist_rbtree_insert(ulist, &ulist->nodes[ulist->nnodes]); - BUG_ON(ret); - ++ulist->nnodes; + ret = ulist_rbtree_insert(ulist, node); + ASSERT(!ret); + list_add_tail(&node->list, &ulist->nodes); + ulist->nnodes++; return 1; } -EXPORT_SYMBOL(ulist_add); /** * ulist_next - iterate ulist @@ -268,11 +227,25 @@ EXPORT_SYMBOL(ulist_add); */ struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter) { - if (ulist->nnodes == 0) + struct ulist_node *node; + + if (list_empty(&ulist->nodes)) return NULL; - if (uiter->i < 0 || uiter->i >= ulist->nnodes) + if (uiter->cur_list && uiter->cur_list->next == &ulist->nodes) return NULL; - - return &ulist->nodes[uiter->i++]; + if (uiter->cur_list) { + uiter->cur_list = uiter->cur_list->next; + } else { + uiter->cur_list = ulist->nodes.next; +#ifdef CONFIG_BTRFS_DEBUG + uiter->i = 0; +#endif + } + node = list_entry(uiter->cur_list, struct ulist_node, list); +#ifdef CONFIG_BTRFS_DEBUG + ASSERT(node->seqnum == uiter->i); + ASSERT(uiter->i >= 0 && uiter->i < ulist->nnodes); + uiter->i++; +#endif + return node; } -EXPORT_SYMBOL(ulist_next); diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index fb36731074b5..7f78cbf5cf41 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -17,18 +17,12 @@ * enumerating it. * It is possible to store an auxiliary value along with the key. * - * The implementation is preliminary and can probably be sped up - * significantly. 
A first step would be to store the values in an rbtree - * as soon as ULIST_SIZE is exceeded. */ - -/* - * number of elements statically allocated inside struct ulist - */ -#define ULIST_SIZE 16 - struct ulist_iterator { +#ifdef CONFIG_BTRFS_DEBUG int i; +#endif + struct list_head *cur_list; /* hint to start search */ }; /* @@ -37,6 +31,12 @@ struct ulist_iterator { struct ulist_node { u64 val; /* value to store */ u64 aux; /* auxiliary value saved along with the val */ + +#ifdef CONFIG_BTRFS_DEBUG + int seqnum; /* sequence number this node is added */ +#endif + + struct list_head list; /* used to link node */ struct rb_node rb_node; /* used to speed up search */ }; @@ -46,28 +46,11 @@ struct ulist { */ unsigned long nnodes; - /* - * number of nodes we already have room for - */ - unsigned long nodes_alloced; - - /* - * pointer to the array storing the elements. The first ULIST_SIZE - * elements are stored inline. In this case the it points to int_nodes. - * After exceeding ULIST_SIZE, dynamic memory is allocated. 
- */ - struct ulist_node *nodes; - + struct list_head nodes; struct rb_root root; - - /* - * inline storage space for the first ULIST_SIZE entries - */ - struct ulist_node int_nodes[ULIST_SIZE]; }; void ulist_init(struct ulist *ulist); -void ulist_fini(struct ulist *ulist); void ulist_reinit(struct ulist *ulist); struct ulist *ulist_alloc(gfp_t gfp_mask); void ulist_free(struct ulist *ulist); @@ -77,6 +60,6 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter); -#define ULIST_ITER_INIT(uiter) ((uiter)->i = 0) +#define ULIST_ITER_INIT(uiter) ((uiter)->cur_list = NULL) #endif diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index fbda90004fe9..f6a4c03ee7d8 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -69,7 +69,7 @@ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, u8 *uuid, ret = -ENOENT; if (!IS_ALIGNED(item_size, sizeof(u64))) { - pr_warn("btrfs: uuid item with illegal size %lu!\n", + btrfs_warn(uuid_root->fs_info, "uuid item with illegal size %lu!", (unsigned long)item_size); goto out; } @@ -137,7 +137,8 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, offset = btrfs_item_ptr_offset(eb, slot); offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le); } else if (ret < 0) { - pr_warn("btrfs: insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!\n", + btrfs_warn(uuid_root->fs_info, "insert uuid item failed %d " + "(0x%016llx, 0x%016llx) type %u!", ret, (unsigned long long)key.objectid, (unsigned long long)key.offset, type); goto out; @@ -183,7 +184,7 @@ int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, uuid_root, &key, path, -1, 1); if (ret < 0) { - pr_warn("btrfs: error %d while searching for uuid item!\n", + btrfs_warn(uuid_root->fs_info, "error %d while searching for uuid item!", ret); goto out; } @@ -197,7 +198,7 @@ int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans, offset 
= btrfs_item_ptr_offset(eb, slot); item_size = btrfs_item_size_nr(eb, slot); if (!IS_ALIGNED(item_size, sizeof(u64))) { - pr_warn("btrfs: uuid item with illegal size %lu!\n", + btrfs_warn(uuid_root->fs_info, "uuid item with illegal size %lu!", (unsigned long)item_size); ret = -ENOENT; goto out; @@ -299,7 +300,7 @@ again_search_slot: offset = btrfs_item_ptr_offset(leaf, slot); item_size = btrfs_item_size_nr(leaf, slot); if (!IS_ALIGNED(item_size, sizeof(u64))) { - pr_warn("btrfs: uuid item with illegal size %lu!\n", + btrfs_warn(fs_info, "uuid item with illegal size %lu!", (unsigned long)item_size); goto skip; } @@ -349,6 +350,6 @@ skip: out: btrfs_free_path(path); if (ret) - pr_warn("btrfs: btrfs_uuid_tree_iterate failed %d\n", ret); + btrfs_warn(fs_info, "btrfs_uuid_tree_iterate failed %d", ret); return 0; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 92303f42baaa..bab0b84d8f80 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -125,7 +125,7 @@ static void btrfs_kobject_uevent(struct block_device *bdev, ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); if (ret) - pr_warn("Sending event '%d' to kobject: '%s' (%p): failed\n", + pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n", action, kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), &disk_to_dev(bdev->bd_disk)->kobj); @@ -200,7 +200,7 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, if (IS_ERR(*bdev)) { ret = PTR_ERR(*bdev); - printk(KERN_INFO "btrfs: open %s failed\n", device_path); + printk(KERN_INFO "BTRFS: open %s failed\n", device_path); goto error; } @@ -912,9 +912,9 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, if (disk_super->label[0]) { if (disk_super->label[BTRFS_LABEL_SIZE - 1]) disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; - printk(KERN_INFO "btrfs: device label %s ", disk_super->label); + printk(KERN_INFO "BTRFS: device label %s ", disk_super->label); } else { - printk(KERN_INFO 
"btrfs: device fsid %pU ", disk_super->fsid); + printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid); } printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path); @@ -1813,7 +1813,7 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, } if (!*device) { - pr_err("btrfs: no missing device found\n"); + btrfs_err(root->fs_info, "no missing device found"); return -ENOENT; } @@ -3052,7 +3052,7 @@ loop: error: btrfs_free_path(path); if (enospc_errors) { - printk(KERN_INFO "btrfs: %d enospc errors during balance\n", + btrfs_info(fs_info, "%d enospc errors during balance", enospc_errors); if (!ret) ret = -ENOSPC; @@ -3138,8 +3138,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl, if (!(bctl->flags & BTRFS_BALANCE_DATA) || !(bctl->flags & BTRFS_BALANCE_METADATA) || memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { - printk(KERN_ERR "btrfs: with mixed groups data and " - "metadata balance options must be the same\n"); + btrfs_err(fs_info, "with mixed groups data and " + "metadata balance options must be the same"); ret = -EINVAL; goto out; } @@ -3165,8 +3165,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl, if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && (!alloc_profile_is_valid(bctl->data.target, 1) || (bctl->data.target & ~allowed))) { - printk(KERN_ERR "btrfs: unable to start balance with target " - "data profile %llu\n", + btrfs_err(fs_info, "unable to start balance with target " + "data profile %llu", bctl->data.target); ret = -EINVAL; goto out; @@ -3174,8 +3174,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl, if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && (!alloc_profile_is_valid(bctl->meta.target, 1) || (bctl->meta.target & ~allowed))) { - printk(KERN_ERR "btrfs: unable to start balance with target " - "metadata profile %llu\n", + btrfs_err(fs_info, + "unable to start balance with target metadata profile %llu", bctl->meta.target); ret = -EINVAL; goto out; @@ -3183,8 +3183,8 @@ int 
btrfs_balance(struct btrfs_balance_control *bctl, if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && (!alloc_profile_is_valid(bctl->sys.target, 1) || (bctl->sys.target & ~allowed))) { - printk(KERN_ERR "btrfs: unable to start balance with target " - "system profile %llu\n", + btrfs_err(fs_info, + "unable to start balance with target system profile %llu", bctl->sys.target); ret = -EINVAL; goto out; @@ -3193,7 +3193,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, /* allow dup'ed data chunks only in mixed mode */ if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) { - printk(KERN_ERR "btrfs: dup for data is not allowed\n"); + btrfs_err(fs_info, "dup for data is not allowed"); ret = -EINVAL; goto out; } @@ -3213,11 +3213,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl, (fs_info->avail_metadata_alloc_bits & allowed) && !(bctl->meta.target & allowed))) { if (bctl->flags & BTRFS_BALANCE_FORCE) { - printk(KERN_INFO "btrfs: force reducing metadata " - "integrity\n"); + btrfs_info(fs_info, "force reducing metadata integrity"); } else { - printk(KERN_ERR "btrfs: balance will reduce metadata " - "integrity, use force if you want this\n"); + btrfs_err(fs_info, "balance will reduce metadata " + "integrity, use force if you want this"); ret = -EINVAL; goto out; } @@ -3303,7 +3302,7 @@ static int balance_kthread(void *data) mutex_lock(&fs_info->balance_mutex); if (fs_info->balance_ctl) { - printk(KERN_INFO "btrfs: continuing balance\n"); + btrfs_info(fs_info, "continuing balance"); ret = btrfs_balance(fs_info->balance_ctl, NULL); } @@ -3325,7 +3324,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->balance_lock); if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) { - printk(KERN_INFO "btrfs: force skipping balance\n"); + btrfs_info(fs_info, "force skipping balance"); return 0; } @@ -3543,7 +3542,7 @@ update_tree: BTRFS_UUID_KEY_SUBVOL, key.objectid); if (ret 
< 0) { - pr_warn("btrfs: uuid_tree_add failed %d\n", + btrfs_warn(fs_info, "uuid_tree_add failed %d", ret); break; } @@ -3555,7 +3554,7 @@ update_tree: BTRFS_UUID_KEY_RECEIVED_SUBVOL, key.objectid); if (ret < 0) { - pr_warn("btrfs: uuid_tree_add failed %d\n", + btrfs_warn(fs_info, "uuid_tree_add failed %d", ret); break; } @@ -3590,7 +3589,7 @@ out: if (trans && !IS_ERR(trans)) btrfs_end_transaction(trans, fs_info->uuid_root); if (ret) - pr_warn("btrfs: btrfs_uuid_scan_kthread failed %d\n", ret); + btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); else fs_info->update_uuid_tree_gen = 1; up(&fs_info->uuid_tree_rescan_sem); @@ -3654,7 +3653,7 @@ static int btrfs_uuid_rescan_kthread(void *data) */ ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry); if (ret < 0) { - pr_warn("btrfs: iterating uuid_tree failed %d\n", ret); + btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret); up(&fs_info->uuid_tree_rescan_sem); return ret; } @@ -3695,7 +3694,7 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); if (IS_ERR(task)) { /* fs_info->update_uuid_tree_gen remains 0 in all error case */ - pr_warn("btrfs: failed to start uuid_scan task\n"); + btrfs_warn(fs_info, "failed to start uuid_scan task"); up(&fs_info->uuid_tree_rescan_sem); return PTR_ERR(task); } @@ -3711,7 +3710,7 @@ int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); if (IS_ERR(task)) { /* fs_info->update_uuid_tree_gen remains 0 in all error case */ - pr_warn("btrfs: failed to start uuid_rescan task\n"); + btrfs_warn(fs_info, "failed to start uuid_rescan task"); up(&fs_info->uuid_tree_rescan_sem); return PTR_ERR(task); } @@ -4033,7 +4032,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_stripe_size = 32 * 1024 * 1024; max_chunk_size = 2 * max_stripe_size; } else { - printk(KERN_ERR "btrfs: invalid chunk type 
0x%llx requested\n", + btrfs_err(info, "invalid chunk type 0x%llx requested\n", type); BUG_ON(1); } @@ -4065,7 +4064,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (!device->writeable) { WARN(1, KERN_ERR - "btrfs: read-only device in alloc_list\n"); + "BTRFS: read-only device in alloc_list\n"); continue; } @@ -5193,13 +5192,13 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, read_unlock(&em_tree->lock); if (!em) { - printk(KERN_ERR "btrfs: couldn't find em for chunk %Lu\n", + printk(KERN_ERR "BTRFS: couldn't find em for chunk %Lu\n", chunk_start); return -EIO; } if (em->start != chunk_start) { - printk(KERN_ERR "btrfs: bad chunk start, em=%Lu, wanted=%Lu\n", + printk(KERN_ERR "BTRFS: bad chunk start, em=%Lu, wanted=%Lu\n", em->start, chunk_start); free_extent_map(em); return -EIO; @@ -5298,6 +5297,13 @@ static void btrfs_end_bio(struct bio *bio, int err) bio_put(bio); bio = bbio->orig_bio; } + + /* + * We have original bio now. So increment bi_remaining to + * account for it in endio + */ + atomic_inc(&bio->bi_remaining); + bio->bi_private = bbio->private; bio->bi_end_io = bbio->end_io; btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; @@ -5411,7 +5417,7 @@ static int bio_size_ok(struct block_device *bdev, struct bio *bio, if (!q->merge_bvec_fn) return 1; - bvm.bi_size = bio->bi_size - prev->bv_len; + bvm.bi_size = bio->bi_iter.bi_size - prev->bv_len; if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) return 0; return 1; @@ -5426,7 +5432,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, bio->bi_private = bbio; btrfs_io_bio(bio)->stripe_index = dev_nr; bio->bi_end_io = btrfs_end_bio; - bio->bi_sector = physical >> 9; + bio->bi_iter.bi_sector = physical >> 9; #ifdef DEBUG { struct rcu_string *name; @@ -5464,7 +5470,7 @@ again: while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) { if (bio_add_page(bio, bvec->bv_page, bvec->bv_len, bvec->bv_offset) < bvec->bv_len) { - u64 len = 
bio->bi_size; + u64 len = bio->bi_iter.bi_size; atomic_inc(&bbio->stripes_pending); submit_stripe_bio(root, bbio, bio, physical, dev_nr, @@ -5486,7 +5492,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) bio->bi_private = bbio->private; bio->bi_end_io = bbio->end_io; btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; - bio->bi_sector = logical >> 9; + bio->bi_iter.bi_sector = logical >> 9; kfree(bbio); bio_endio(bio, -EIO); } @@ -5497,7 +5503,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, { struct btrfs_device *dev; struct bio *first_bio = bio; - u64 logical = (u64)bio->bi_sector << 9; + u64 logical = (u64)bio->bi_iter.bi_sector << 9; u64 length = 0; u64 map_length; u64 *raid_map = NULL; @@ -5506,7 +5512,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, int total_devs = 1; struct btrfs_bio *bbio = NULL; - length = bio->bi_size; + length = bio->bi_iter.bi_size; map_length = length; ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, @@ -6123,7 +6129,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, BUG_ON(!path); ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); if (ret < 0) { - printk_in_rcu(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n", + printk_in_rcu(KERN_WARNING "BTRFS: " + "error %d while searching for dev_stats item for device %s!\n", ret, rcu_str_deref(device->name)); goto out; } @@ -6133,7 +6140,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, /* need to delete old one and insert a new one */ ret = btrfs_del_item(trans, dev_root, path); if (ret != 0) { - printk_in_rcu(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n", + printk_in_rcu(KERN_WARNING "BTRFS: " + "delete too small dev_stats item for device %s failed %d!\n", rcu_str_deref(device->name), ret); goto out; } @@ -6146,7 +6154,8 @@ static int update_dev_stat_item(struct 
btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, dev_root, path, &key, sizeof(*ptr)); if (ret < 0) { - printk_in_rcu(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n", + printk_in_rcu(KERN_WARNING "BTRFS: " + "insert dev_stats item for device %s failed %d!\n", rcu_str_deref(device->name), ret); goto out; } @@ -6199,16 +6208,14 @@ static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) { if (!dev->dev_stats_valid) return; - printk_ratelimited_in_rcu(KERN_ERR - "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", + printk_ratelimited_in_rcu(KERN_ERR "BTRFS: " + "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", rcu_str_deref(dev->name), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), - btrfs_dev_stat_read(dev, - BTRFS_DEV_STAT_CORRUPTION_ERRS), - btrfs_dev_stat_read(dev, - BTRFS_DEV_STAT_GENERATION_ERRS)); + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); } static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) @@ -6221,7 +6228,8 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) if (i == BTRFS_DEV_STAT_VALUES_MAX) return; /* all values == 0, suppress message */ - printk_in_rcu(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", + printk_in_rcu(KERN_INFO "BTRFS: " + "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", rcu_str_deref(dev->name), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), @@ -6242,12 +6250,10 @@ int btrfs_get_dev_stats(struct btrfs_root *root, mutex_unlock(&fs_devices->device_list_mutex); if (!dev) { - printk(KERN_WARNING - "btrfs: get dev_stats failed, device not found\n"); + btrfs_warn(root->fs_info, "get dev_stats failed, device not found"); return -ENODEV; } else if 
(!dev->dev_stats_valid) { - printk(KERN_WARNING - "btrfs: get dev_stats failed, not yet valid\n"); + btrfs_warn(root->fs_info, "get dev_stats failed, not yet valid"); return -ENODEV; } else if (stats->flags & BTRFS_DEV_STATS_RESET) { for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 3d1c301c9260..ad8328d797ea 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -28,6 +28,7 @@ #include "transaction.h" #include "xattr.h" #include "disk-io.h" +#include "props.h" ssize_t __btrfs_getxattr(struct inode *inode, const char *name, @@ -332,7 +333,8 @@ static bool btrfs_is_valid_xattr(const char *name) XATTR_SECURITY_PREFIX_LEN) || !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || - !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); + !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) || + !strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN); } ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, @@ -374,6 +376,10 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, if (!btrfs_is_valid_xattr(name)) return -EOPNOTSUPP; + if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) + return btrfs_set_prop(dentry->d_inode, name, + value, size, flags); + if (size == 0) value = ""; /* empty EA, do not remove */ @@ -403,6 +409,10 @@ int btrfs_removexattr(struct dentry *dentry, const char *name) if (!btrfs_is_valid_xattr(name)) return -EOPNOTSUPP; + if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) + return btrfs_set_prop(dentry->d_inode, name, + NULL, 0, XATTR_REPLACE); + return __btrfs_setxattr(NULL, dentry->d_inode, name, NULL, 0, XATTR_REPLACE); } diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 9acb846c3e7f..8e57191950cb 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -97,7 +97,7 @@ static int zlib_compress_pages(struct list_head *ws, *total_in = 0; if (Z_OK 
!= zlib_deflateInit(&workspace->def_strm, 3)) { - printk(KERN_WARNING "btrfs: deflateInit failed\n"); + printk(KERN_WARNING "BTRFS: deflateInit failed\n"); ret = -1; goto out; } @@ -125,7 +125,7 @@ static int zlib_compress_pages(struct list_head *ws, while (workspace->def_strm.total_in < len) { ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); if (ret != Z_OK) { - printk(KERN_DEBUG "btrfs: deflate in loop returned %d\n", + printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n", ret); zlib_deflateEnd(&workspace->def_strm); ret = -1; @@ -252,7 +252,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, } if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { - printk(KERN_WARNING "btrfs: inflateInit failed\n"); + printk(KERN_WARNING "BTRFS: inflateInit failed\n"); return -1; } while (workspace->inf_strm.total_in < srclen) { @@ -336,7 +336,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, } if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { - printk(KERN_WARNING "btrfs: inflateInit failed\n"); + printk(KERN_WARNING "BTRFS: inflateInit failed\n"); return -1; } diff --git a/fs/buffer.c b/fs/buffer.c index 6024877335ca..27265a8b43c1 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -654,14 +654,16 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); static void __set_page_dirty(struct page *page, struct address_space *mapping, int warn) { - spin_lock_irq(&mapping->tree_lock); + unsigned long flags; + + spin_lock_irqsave(&mapping->tree_lock, flags); if (page->mapping) { /* Race with truncate? 
*/ WARN_ON_ONCE(warn && !PageUptodate(page)); account_page_dirtied(page, mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } - spin_unlock_irq(&mapping->tree_lock); + spin_unlock_irqrestore(&mapping->tree_lock, flags); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } @@ -1312,7 +1314,7 @@ static void bh_lru_install(struct buffer_head *bh) } while (out < BH_LRU_SIZE) bhs[out++] = NULL; - memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs)); + memcpy(this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs)); } bh_lru_unlock(); @@ -2982,11 +2984,11 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh) * let it through, and the IO layer will turn it into * an EIO. */ - if (unlikely(bio->bi_sector >= maxsector)) + if (unlikely(bio->bi_iter.bi_sector >= maxsector)) return; - maxsector -= bio->bi_sector; - bytes = bio->bi_size; + maxsector -= bio->bi_iter.bi_sector; + bytes = bio->bi_iter.bi_size; if (likely((bytes >> 9) <= maxsector)) return; @@ -2994,7 +2996,7 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh) bytes = maxsector << 9; /* Truncate the bio.. 
*/ - bio->bi_size = bytes; + bio->bi_iter.bi_size = bytes; bio->bi_io_vec[0].bv_len = bytes; /* ..and clear the end of the buffer for reads */ @@ -3029,14 +3031,14 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) */ bio = bio_alloc(GFP_NOIO, 1); - bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; bio->bi_io_vec[0].bv_page = bh->b_page; bio->bi_io_vec[0].bv_len = bh->b_size; bio->bi_io_vec[0].bv_offset = bh_offset(bh); bio->bi_vcnt = 1; - bio->bi_size = bh->b_size; + bio->bi_iter.bi_size = bh->b_size; bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index f6911284c9bd..21887d63dad5 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -54,11 +54,6 @@ static inline struct posix_acl *ceph_get_cached_acl(struct inode *inode, return acl; } -void ceph_forget_all_cached_acls(struct inode *inode) -{ - forget_all_cached_acls(inode); -} - struct posix_acl *ceph_get_acl(struct inode *inode, int type) { int size; @@ -66,13 +61,6 @@ struct posix_acl *ceph_get_acl(struct inode *inode, int type) char *value = NULL; struct posix_acl *acl; - if (!IS_POSIXACL(inode)) - return NULL; - - acl = ceph_get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - switch (type) { case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; @@ -107,14 +95,14 @@ struct posix_acl *ceph_get_acl(struct inode *inode, int type) return acl; } -static int ceph_set_acl(struct dentry *dentry, struct inode *inode, - struct posix_acl *acl, int type) +int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) { int ret = 0, size = 0; const char *name = NULL; char *value = NULL; struct iattr newattrs; umode_t new_mode = inode->i_mode, old_mode = inode->i_mode; + struct dentry *dentry; if (acl) { ret = posix_acl_valid(acl); @@ -158,30 +146,29 @@ static int ceph_set_acl(struct dentry *dentry, struct inode *inode, goto out_free; 
} + dentry = d_find_alias(inode); if (new_mode != old_mode) { newattrs.ia_mode = new_mode; newattrs.ia_valid = ATTR_MODE; ret = ceph_setattr(dentry, &newattrs); if (ret) - goto out_free; + goto out_dput; } - if (value) - ret = __ceph_setxattr(dentry, name, value, size, 0); - else - ret = __ceph_removexattr(dentry, name); - + ret = __ceph_setxattr(dentry, name, value, size, 0); if (ret) { if (new_mode != old_mode) { newattrs.ia_mode = old_mode; newattrs.ia_valid = ATTR_MODE; ceph_setattr(dentry, &newattrs); } - goto out_free; + goto out_dput; } ceph_set_cached_acl(inode, type, acl); +out_dput: + dput(dentry); out_free: kfree(value); out: @@ -190,42 +177,24 @@ out: int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir) { - struct posix_acl *acl = NULL; - int ret = 0; - - if (!S_ISLNK(inode->i_mode)) { - if (IS_POSIXACL(dir)) { - acl = ceph_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) { - ret = PTR_ERR(acl); - goto out; - } - } + struct posix_acl *default_acl, *acl; + int error; - if (!acl) - inode->i_mode &= ~current_umask(); - } + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (error) + return error; - if (IS_POSIXACL(dir) && acl) { - if (S_ISDIR(inode->i_mode)) { - ret = ceph_set_acl(dentry, inode, acl, - ACL_TYPE_DEFAULT); - if (ret) - goto out_release; - } - ret = __posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); - if (ret < 0) - goto out; - else if (ret > 0) - ret = ceph_set_acl(dentry, inode, acl, ACL_TYPE_ACCESS); - else - cache_no_acl(inode); - } else { + if (!default_acl && !acl) cache_no_acl(inode); - } -out_release: - posix_acl_release(acl); -out: - return ret; + if (default_acl) { + error = ceph_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + posix_acl_release(default_acl); + } + if (acl) { + if (!error) + error = ceph_set_acl(inode, acl, ACL_TYPE_ACCESS); + posix_acl_release(acl); + } + return error; } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 619616d585b0..45eda6d7a40c 100644 --- 
a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -100,6 +100,14 @@ static unsigned fpos_off(loff_t p) return p & 0xffffffff; } +static int fpos_cmp(loff_t l, loff_t r) +{ + int v = ceph_frag_compare(fpos_frag(l), fpos_frag(r)); + if (v) + return v; + return (int)(fpos_off(l) - fpos_off(r)); +} + /* * When possible, we try to satisfy a readdir by peeking at the * dcache. We make this work by carefully ordering dentries on @@ -156,7 +164,7 @@ more: if (!d_unhashed(dentry) && dentry->d_inode && ceph_snap(dentry->d_inode) != CEPH_SNAPDIR && ceph_ino(dentry->d_inode) != CEPH_INO_CEPH && - ctx->pos <= di->offset) + fpos_cmp(ctx->pos, di->offset) <= 0) break; dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry, dentry->d_name.len, dentry->d_name.name, di->offset, @@ -695,9 +703,8 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry, ceph_mdsc_put_request(req); if (!err) - err = ceph_init_acl(dentry, dentry->d_inode, dir); - - if (err) + ceph_init_acl(dentry, dentry->d_inode, dir); + else d_drop(dentry); return err; } @@ -735,7 +742,9 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, if (!err && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); ceph_mdsc_put_request(req); - if (err) + if (!err) + ceph_init_acl(dentry, dentry->d_inode, dir); + else d_drop(dentry); return err; } @@ -776,7 +785,9 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) err = ceph_handle_notrace_create(dir, dentry); ceph_mdsc_put_request(req); out: - if (err < 0) + if (!err) + ceph_init_acl(dentry, dentry->d_inode, dir); + else d_drop(dentry); return err; } @@ -1303,6 +1314,7 @@ const struct inode_operations ceph_dir_iops = { .listxattr = ceph_listxattr, .removexattr = ceph_removexattr, .get_acl = ceph_get_acl, + .set_acl = ceph_set_acl, .mknod = ceph_mknod, .symlink = ceph_symlink, .mkdir = ceph_mkdir, diff --git a/fs/ceph/file.c b/fs/ceph/file.c index dfd2ce3419f8..09c7afe32e49 100644 --- a/fs/ceph/file.c +++ 
b/fs/ceph/file.c @@ -286,6 +286,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, } else { dout("atomic_open finish_open on dn %p\n", dn); if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { + ceph_init_acl(dentry, dentry->d_inode, dir); *opened |= FILE_CREATED; } err = finish_open(file, dentry, ceph_open, opened); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 8b8b506636cc..32d519d8a2e2 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -97,6 +97,7 @@ const struct inode_operations ceph_file_iops = { .listxattr = ceph_listxattr, .removexattr = ceph_removexattr, .get_acl = ceph_get_acl, + .set_acl = ceph_set_acl, }; @@ -1616,6 +1617,7 @@ static const struct inode_operations ceph_symlink_iops = { .listxattr = ceph_listxattr, .removexattr = ceph_removexattr, .get_acl = ceph_get_acl, + .set_acl = ceph_set_acl, }; /* diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 2df963f1cf5a..10a4ccbf38da 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -144,7 +144,11 @@ enum { Opt_ino32, Opt_noino32, Opt_fscache, - Opt_nofscache + Opt_nofscache, +#ifdef CONFIG_CEPH_FS_POSIX_ACL + Opt_acl, +#endif + Opt_noacl }; static match_table_t fsopt_tokens = { @@ -172,6 +176,10 @@ static match_table_t fsopt_tokens = { {Opt_noino32, "noino32"}, {Opt_fscache, "fsc"}, {Opt_nofscache, "nofsc"}, +#ifdef CONFIG_CEPH_FS_POSIX_ACL + {Opt_acl, "acl"}, +#endif + {Opt_noacl, "noacl"}, {-1, NULL} }; @@ -271,6 +279,14 @@ static int parse_fsopt_token(char *c, void *private) case Opt_nofscache: fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE; break; +#ifdef CONFIG_CEPH_FS_POSIX_ACL + case Opt_acl: + fsopt->sb_flags |= MS_POSIXACL; + break; +#endif + case Opt_noacl: + fsopt->sb_flags &= ~MS_POSIXACL; + break; default: BUG_ON(token); } @@ -438,6 +454,13 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) else seq_puts(m, ",nofsc"); +#ifdef CONFIG_CEPH_FS_POSIX_ACL + if (fsopt->sb_flags & MS_POSIXACL) + seq_puts(m, ",acl"); + else + 
seq_puts(m, ",noacl"); +#endif + if (fsopt->wsize) seq_printf(m, ",wsize=%d", fsopt->wsize); if (fsopt->rsize != CEPH_RSIZE_DEFAULT) @@ -819,9 +842,6 @@ static int ceph_set_super(struct super_block *s, void *data) s->s_flags = fsc->mount_options->sb_flags; s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */ -#ifdef CONFIG_CEPH_FS_POSIX_ACL - s->s_flags |= MS_POSIXACL; -#endif s->s_xattr = ceph_xattr_handlers; s->s_fs_info = fsc; @@ -911,6 +931,10 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, struct ceph_options *opt = NULL; dout("ceph_mount\n"); + +#ifdef CONFIG_CEPH_FS_POSIX_ACL + flags |= MS_POSIXACL; +#endif err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path); if (err < 0) { res = ERR_PTR(err); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 345933948b6e..d8801a95b685 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -13,6 +13,7 @@ #include <linux/wait.h> #include <linux/writeback.h> #include <linux/slab.h> +#include <linux/posix_acl.h> #include <linux/ceph/libceph.h> @@ -741,12 +742,18 @@ extern const struct xattr_handler *ceph_xattr_handlers[]; #ifdef CONFIG_CEPH_FS_POSIX_ACL struct posix_acl *ceph_get_acl(struct inode *, int); +int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type); int ceph_init_acl(struct dentry *, struct inode *, struct inode *); -void ceph_forget_all_cached_acls(struct inode *inode); + +static inline void ceph_forget_all_cached_acls(struct inode *inode) +{ + forget_all_cached_acls(inode); +} #else #define ceph_get_acl NULL +#define ceph_set_acl NULL static inline int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 898b6565ad3e..a55ec37378c6 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -12,6 +12,9 @@ #define XATTR_CEPH_PREFIX "ceph." 
#define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1) +static int __remove_xattr(struct ceph_inode_info *ci, + struct ceph_inode_xattr *xattr); + /* * List of handlers for synthetic system.* attributes. Other * attributes are handled directly. @@ -319,8 +322,7 @@ static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode, static int __set_xattr(struct ceph_inode_info *ci, const char *name, int name_len, const char *val, int val_len, - int dirty, - int should_free_name, int should_free_val, + int flags, int update_xattr, struct ceph_inode_xattr **newxattr) { struct rb_node **p; @@ -349,12 +351,31 @@ static int __set_xattr(struct ceph_inode_info *ci, xattr = NULL; } + if (update_xattr) { + int err = 0; + if (xattr && (flags & XATTR_CREATE)) + err = -EEXIST; + else if (!xattr && (flags & XATTR_REPLACE)) + err = -ENODATA; + if (err) { + kfree(name); + kfree(val); + return err; + } + if (update_xattr < 0) { + if (xattr) + __remove_xattr(ci, xattr); + kfree(name); + return 0; + } + } + if (!xattr) { new = 1; xattr = *newxattr; xattr->name = name; xattr->name_len = name_len; - xattr->should_free_name = should_free_name; + xattr->should_free_name = update_xattr; ci->i_xattrs.count++; dout("__set_xattr count=%d\n", ci->i_xattrs.count); @@ -364,7 +385,7 @@ static int __set_xattr(struct ceph_inode_info *ci, if (xattr->should_free_val) kfree((void *)xattr->val); - if (should_free_name) { + if (update_xattr) { kfree((void *)name); name = xattr->name; } @@ -379,8 +400,8 @@ static int __set_xattr(struct ceph_inode_info *ci, xattr->val = ""; xattr->val_len = val_len; - xattr->dirty = dirty; - xattr->should_free_val = (val && should_free_val); + xattr->dirty = update_xattr; + xattr->should_free_val = (val && update_xattr); if (new) { rb_link_node(&xattr->node, parent, p); @@ -442,7 +463,7 @@ static int __remove_xattr(struct ceph_inode_info *ci, struct ceph_inode_xattr *xattr) { if (!xattr) - return -EOPNOTSUPP; + return -ENODATA; rb_erase(&xattr->node, 
&ci->i_xattrs.index); @@ -588,7 +609,7 @@ start: p += len; err = __set_xattr(ci, name, namelen, val, len, - 0, 0, 0, &xattrs[numattr]); + 0, 0, &xattrs[numattr]); if (err < 0) goto bad; @@ -850,6 +871,9 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, dout("setxattr value=%.*s\n", (int)size, value); + if (!value) + flags |= CEPH_XATTR_REMOVE; + /* do request */ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR, USE_AUTH_MDS); @@ -892,7 +916,7 @@ int __ceph_setxattr(struct dentry *dentry, const char *name, struct ceph_inode_info *ci = ceph_inode(inode); int issued; int err; - int dirty; + int dirty = 0; int name_len = strlen(name); int val_len = size; char *newname = NULL; @@ -953,12 +977,14 @@ retry: goto retry; } - err = __set_xattr(ci, newname, name_len, newval, - val_len, 1, 1, 1, &xattr); + err = __set_xattr(ci, newname, name_len, newval, val_len, + flags, value ? 1 : -1, &xattr); - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); - ci->i_xattrs.dirty = true; - inode->i_ctime = CURRENT_TIME; + if (!err) { + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); + ci->i_xattrs.dirty = true; + inode->i_ctime = CURRENT_TIME; + } spin_unlock(&ci->i_ceph_lock); if (dirty) diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 51f5e0ee7237..7ff866dbb89e 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -865,8 +865,8 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, return rc; } -static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, - __u16 fid, u32 *pacllen) +struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, + const struct cifs_fid *cifsfid, u32 *pacllen) { struct cifs_ntsd *pntsd = NULL; unsigned int xid; @@ -877,7 +877,8 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, return ERR_CAST(tlink); xid = get_xid(); - rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), fid, &pntsd, pacllen); + rc = CIFSSMBGetCIFSACL(xid, 
tlink_tcon(tlink), cifsfid->netfid, &pntsd, + pacllen); free_xid(xid); cifs_put_tlink(tlink); @@ -895,9 +896,10 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, int oplock = 0; unsigned int xid; int rc, create_options = 0; - __u16 fid; struct cifs_tcon *tcon; struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); + struct cifs_fid fid; + struct cifs_open_parms oparms; if (IS_ERR(tlink)) return ERR_CAST(tlink); @@ -908,12 +910,19 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, if (backup_cred(cifs_sb)) create_options |= CREATE_OPEN_BACKUP_INTENT; - rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, - create_options, &fid, &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = READ_CONTROL; + oparms.create_options = create_options; + oparms.disposition = FILE_OPEN; + oparms.path = path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = CIFS_open(xid, &oparms, &oplock, NULL); if (!rc) { - rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen); - CIFSSMBClose(xid, tcon, fid); + rc = CIFSSMBGetCIFSACL(xid, tcon, fid.netfid, &pntsd, pacllen); + CIFSSMBClose(xid, tcon, fid.netfid); } cifs_put_tlink(tlink); @@ -938,7 +947,7 @@ struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, if (!open_file) return get_cifs_acl_by_path(cifs_sb, path, pacllen); - pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->fid.netfid, pacllen); + pntsd = get_cifs_acl_by_fid(cifs_sb, &open_file->fid, pacllen); cifsFileInfo_put(open_file); return pntsd; } @@ -950,10 +959,11 @@ int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, int oplock = 0; unsigned int xid; int rc, access_flags, create_options = 0; - __u16 fid; struct cifs_tcon *tcon; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); + struct cifs_fid fid; + struct cifs_open_parms oparms; if 
(IS_ERR(tlink)) return PTR_ERR(tlink); @@ -969,18 +979,25 @@ int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, else access_flags = WRITE_DAC; - rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, access_flags, - create_options, &fid, &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = access_flags; + oparms.create_options = create_options; + oparms.disposition = FILE_OPEN; + oparms.path = path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc) { cifs_dbg(VFS, "Unable to open file to set ACL\n"); goto out; } - rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen, aclflag); + rc = CIFSSMBSetCIFSACL(xid, tcon, fid.netfid, pnntsd, acllen, aclflag); cifs_dbg(NOISY, "SetCIFSACL rc = %d\n", rc); - CIFSSMBClose(xid, tcon, fid); + CIFSSMBClose(xid, tcon, fid.netfid); out: free_xid(xid); cifs_put_tlink(tlink); @@ -990,19 +1007,31 @@ out: /* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */ int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, - struct inode *inode, const char *path, const __u16 *pfid) + struct inode *inode, const char *path, + const struct cifs_fid *pfid) { struct cifs_ntsd *pntsd = NULL; u32 acllen = 0; int rc = 0; + struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); + struct cifs_tcon *tcon; cifs_dbg(NOISY, "converting ACL to mode for %s\n", path); - if (pfid) - pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen); - else - pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + if (pfid && (tcon->ses->server->ops->get_acl_by_fid)) + pntsd = tcon->ses->server->ops->get_acl_by_fid(cifs_sb, pfid, + &acllen); + else if (tcon->ses->server->ops->get_acl) + pntsd = tcon->ses->server->ops->get_acl(cifs_sb, inode, path, + &acllen); + else { + cifs_put_tlink(tlink); + return 
-EOPNOTSUPP; + } /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */ if (IS_ERR(pntsd)) { rc = PTR_ERR(pntsd); @@ -1014,6 +1043,8 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, cifs_dbg(VFS, "parse sec desc failed rc = %d\n", rc); } + cifs_put_tlink(tlink); + return rc; } @@ -1027,15 +1058,30 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, __u32 secdesclen = 0; struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */ struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); + struct cifs_tcon *tcon; + + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); cifs_dbg(NOISY, "set ACL from mode for %s\n", path); /* Get the security descriptor */ - pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen); + + if (tcon->ses->server->ops->get_acl == NULL) { + cifs_put_tlink(tlink); + return -EOPNOTSUPP; + } + + pntsd = tcon->ses->server->ops->get_acl(cifs_sb, inode, path, + &secdesclen); if (IS_ERR(pntsd)) { rc = PTR_ERR(pntsd); cifs_dbg(VFS, "%s: error %d getting sec desc\n", __func__, rc); - goto out; + cifs_put_tlink(tlink); + return rc; } /* @@ -1048,6 +1094,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, pnntsd = kmalloc(secdesclen, GFP_KERNEL); if (!pnntsd) { kfree(pntsd); + cifs_put_tlink(tlink); return -ENOMEM; } @@ -1056,14 +1103,18 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, cifs_dbg(NOISY, "build_sec_desc rc: %d\n", rc); + if (tcon->ses->server->ops->set_acl == NULL) + rc = -EOPNOTSUPP; + if (!rc) { /* Set the security descriptor */ - rc = set_cifs_acl(pnntsd, secdesclen, inode, path, aclflag); + rc = tcon->ses->server->ops->set_acl(pnntsd, secdesclen, inode, + path, aclflag); cifs_dbg(NOISY, "set_cifs_acl rc: %d\n", rc); } + cifs_put_tlink(tlink); kfree(pnntsd); 
kfree(pntsd); -out: return rc; } diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index f918a998a087..cf32f0393369 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -323,7 +323,8 @@ struct smb_version_operations { /* async read from the server */ int (*async_readv)(struct cifs_readdata *); /* async write to the server */ - int (*async_writev)(struct cifs_writedata *); + int (*async_writev)(struct cifs_writedata *, + void (*release)(struct kref *)); /* sync read from the server */ int (*sync_read)(const unsigned int, struct cifsFileInfo *, struct cifs_io_parms *, unsigned int *, char **, @@ -370,8 +371,12 @@ struct smb_version_operations { void (*new_lease_key)(struct cifs_fid *); int (*generate_signingkey)(struct cifs_ses *); int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *); - int (*query_mf_symlink)(const unsigned char *, char *, unsigned int *, - struct cifs_sb_info *, unsigned int); + int (*query_mf_symlink)(unsigned int, struct cifs_tcon *, + struct cifs_sb_info *, const unsigned char *, + char *, unsigned int *); + int (*create_mf_symlink)(unsigned int, struct cifs_tcon *, + struct cifs_sb_info *, const unsigned char *, + char *, unsigned int *); /* if we can do cache read operations */ bool (*is_read_op)(__u32); /* set oplock level for the inode */ @@ -385,6 +390,18 @@ struct smb_version_operations { struct cifsFileInfo *target_file, u64 src_off, u64 len, u64 dest_off); int (*validate_negotiate)(const unsigned int, struct cifs_tcon *); + ssize_t (*query_all_EAs)(const unsigned int, struct cifs_tcon *, + const unsigned char *, const unsigned char *, char *, + size_t, const struct nls_table *, int); + int (*set_EA)(const unsigned int, struct cifs_tcon *, const char *, + const char *, const void *, const __u16, + const struct nls_table *, int); + struct cifs_ntsd * (*get_acl)(struct cifs_sb_info *, struct inode *, + const char *, u32 *); + struct cifs_ntsd * (*get_acl_by_fid)(struct cifs_sb_info *, + const struct cifs_fid *, u32 
*); + int (*set_acl)(struct cifs_ntsd *, __u32, struct inode *, const char *, + int); }; struct smb_version_values { @@ -1054,7 +1071,7 @@ struct cifs_writedata { unsigned int pagesz; unsigned int tailsz; unsigned int nr_pages; - struct page *pages[1]; + struct page *pages[]; }; /* diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 2c29db6a247e..acc4ee8ed075 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -151,7 +151,7 @@ extern struct inode *cifs_iget(struct super_block *sb, extern int cifs_get_inode_info(struct inode **inode, const char *full_path, FILE_ALL_INFO *data, struct super_block *sb, - int xid, const __u16 *fid); + int xid, const struct cifs_fid *fid); extern int cifs_get_inode_info_unix(struct inode **pinode, const unsigned char *search_path, struct super_block *sb, unsigned int xid); @@ -162,11 +162,13 @@ extern int cifs_rename_pending_delete(const char *full_path, const unsigned int xid); extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, struct inode *inode, - const char *path, const __u16 *pfid); + const char *path, const struct cifs_fid *pfid); extern int id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64, kuid_t, kgid_t); extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *, const char *, u32 *); +extern struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *, + const struct cifs_fid *, u32 *); extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, const char *, int); @@ -362,11 +364,8 @@ extern int CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, const struct nls_table *nls_codepage); extern int CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid); -extern int CIFSSMBOpen(const unsigned int xid, struct cifs_tcon *tcon, - const char *fileName, const int disposition, - const int access_flags, const int omode, - __u16 *netfid, int *pOplock, FILE_ALL_INFO *, - const struct nls_table 
*nls_codepage, int remap); +extern int CIFS_open(const unsigned int xid, struct cifs_open_parms *oparms, + int *oplock, FILE_ALL_INFO *buf); extern int SMBLegacyOpen(const unsigned int xid, struct cifs_tcon *tcon, const char *fileName, const int disposition, const int access_flags, const int omode, @@ -476,8 +475,8 @@ extern int CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, extern int CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon, const int netfid, __u64 *pExtAttrBits, __u64 *pMask); extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb); -extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr); -extern int CIFSCheckMFSymlink(unsigned int xid, struct cifs_tcon *tcon, +extern bool couldbe_mf_symlink(const struct cifs_fattr *fattr); +extern int check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, const unsigned char *path); @@ -491,12 +490,18 @@ void cifs_readdata_release(struct kref *refcount); int cifs_async_readv(struct cifs_readdata *rdata); int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid); -int cifs_async_writev(struct cifs_writedata *wdata); +int cifs_async_writev(struct cifs_writedata *wdata, + void (*release)(struct kref *kref)); void cifs_writev_complete(struct work_struct *work); struct cifs_writedata *cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete); void cifs_writedata_release(struct kref *refcount); -int open_query_close_cifs_symlink(const unsigned char *path, char *pbuf, - unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb, - unsigned int xid); +int cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const unsigned char *path, char *pbuf, + unsigned int *pbytes_read); +int cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const unsigned char *path, char *pbuf, + unsigned int 
*pbytes_written); #endif /* _CIFSPROTO_H */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index d707edb6b852..f3264bd7a83d 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1273,104 +1273,124 @@ OldOpenRetry: } int -CIFSSMBOpen(const unsigned int xid, struct cifs_tcon *tcon, - const char *fileName, const int openDisposition, - const int access_flags, const int create_options, __u16 *netfid, - int *pOplock, FILE_ALL_INFO *pfile_info, - const struct nls_table *nls_codepage, int remap) +CIFS_open(const unsigned int xid, struct cifs_open_parms *oparms, int *oplock, + FILE_ALL_INFO *buf) { int rc = -EACCES; - OPEN_REQ *pSMB = NULL; - OPEN_RSP *pSMBr = NULL; + OPEN_REQ *req = NULL; + OPEN_RSP *rsp = NULL; int bytes_returned; int name_len; __u16 count; + struct cifs_sb_info *cifs_sb = oparms->cifs_sb; + struct cifs_tcon *tcon = oparms->tcon; + int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; + const struct nls_table *nls = cifs_sb->local_nls; + int create_options = oparms->create_options; + int desired_access = oparms->desired_access; + int disposition = oparms->disposition; + const char *path = oparms->path; openRetry: - rc = smb_init(SMB_COM_NT_CREATE_ANDX, 24, tcon, (void **) &pSMB, - (void **) &pSMBr); + rc = smb_init(SMB_COM_NT_CREATE_ANDX, 24, tcon, (void **)&req, + (void **)&rsp); if (rc) return rc; - pSMB->AndXCommand = 0xFF; /* none */ + /* no commands go after this */ + req->AndXCommand = 0xFF; - if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { - count = 1; /* account for one byte pad to word boundary */ - name_len = - cifsConvertToUTF16((__le16 *) (pSMB->fileName + 1), - fileName, PATH_MAX, nls_codepage, remap); - name_len++; /* trailing null */ + if (req->hdr.Flags2 & SMBFLG2_UNICODE) { + /* account for one byte pad to word boundary */ + count = 1; + name_len = cifsConvertToUTF16((__le16 *)(req->fileName + 1), + path, PATH_MAX, nls, remap); + /* trailing null */ + name_len++; name_len *= 2; - pSMB->NameLength = cpu_to_le16(name_len); - 
} else { /* BB improve check for buffer overruns BB */ - count = 0; /* no pad */ - name_len = strnlen(fileName, PATH_MAX); - name_len++; /* trailing null */ - pSMB->NameLength = cpu_to_le16(name_len); - strncpy(pSMB->fileName, fileName, name_len); + req->NameLength = cpu_to_le16(name_len); + } else { + /* BB improve check for buffer overruns BB */ + /* no pad */ + count = 0; + name_len = strnlen(path, PATH_MAX); + /* trailing null */ + name_len++; + req->NameLength = cpu_to_le16(name_len); + strncpy(req->fileName, path, name_len); } - if (*pOplock & REQ_OPLOCK) - pSMB->OpenFlags = cpu_to_le32(REQ_OPLOCK); - else if (*pOplock & REQ_BATCHOPLOCK) - pSMB->OpenFlags = cpu_to_le32(REQ_BATCHOPLOCK); - pSMB->DesiredAccess = cpu_to_le32(access_flags); - pSMB->AllocationSize = 0; - /* set file as system file if special file such - as fifo and server expecting SFU style and - no Unix extensions */ + + if (*oplock & REQ_OPLOCK) + req->OpenFlags = cpu_to_le32(REQ_OPLOCK); + else if (*oplock & REQ_BATCHOPLOCK) + req->OpenFlags = cpu_to_le32(REQ_BATCHOPLOCK); + + req->DesiredAccess = cpu_to_le32(desired_access); + req->AllocationSize = 0; + + /* + * Set file as system file if special file such as fifo and server + * expecting SFU style and no Unix extensions. + */ if (create_options & CREATE_OPTION_SPECIAL) - pSMB->FileAttributes = cpu_to_le32(ATTR_SYSTEM); + req->FileAttributes = cpu_to_le32(ATTR_SYSTEM); else - pSMB->FileAttributes = cpu_to_le32(ATTR_NORMAL); + req->FileAttributes = cpu_to_le32(ATTR_NORMAL); - /* XP does not handle ATTR_POSIX_SEMANTICS */ - /* but it helps speed up case sensitive checks for other - servers such as Samba */ + /* + * XP does not handle ATTR_POSIX_SEMANTICS but it helps speed up case + * sensitive checks for other servers such as Samba. 
+ */ if (tcon->ses->capabilities & CAP_UNIX) - pSMB->FileAttributes |= cpu_to_le32(ATTR_POSIX_SEMANTICS); + req->FileAttributes |= cpu_to_le32(ATTR_POSIX_SEMANTICS); if (create_options & CREATE_OPTION_READONLY) - pSMB->FileAttributes |= cpu_to_le32(ATTR_READONLY); + req->FileAttributes |= cpu_to_le32(ATTR_READONLY); + + req->ShareAccess = cpu_to_le32(FILE_SHARE_ALL); + req->CreateDisposition = cpu_to_le32(disposition); + req->CreateOptions = cpu_to_le32(create_options & CREATE_OPTIONS_MASK); - pSMB->ShareAccess = cpu_to_le32(FILE_SHARE_ALL); - pSMB->CreateDisposition = cpu_to_le32(openDisposition); - pSMB->CreateOptions = cpu_to_le32(create_options & CREATE_OPTIONS_MASK); /* BB Expirement with various impersonation levels and verify */ - pSMB->ImpersonationLevel = cpu_to_le32(SECURITY_IMPERSONATION); - pSMB->SecurityFlags = - SECURITY_CONTEXT_TRACKING | SECURITY_EFFECTIVE_ONLY; + req->ImpersonationLevel = cpu_to_le32(SECURITY_IMPERSONATION); + req->SecurityFlags = SECURITY_CONTEXT_TRACKING|SECURITY_EFFECTIVE_ONLY; count += name_len; - inc_rfc1001_len(pSMB, count); + inc_rfc1001_len(req, count); - pSMB->ByteCount = cpu_to_le16(count); - /* long_op set to 1 to allow for oplock break timeouts */ - rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, - (struct smb_hdr *)pSMBr, &bytes_returned, 0); + req->ByteCount = cpu_to_le16(count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *)req, + (struct smb_hdr *)rsp, &bytes_returned, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_opens); if (rc) { cifs_dbg(FYI, "Error in Open = %d\n", rc); - } else { - *pOplock = pSMBr->OplockLevel; /* 1 byte no need to le_to_cpu */ - *netfid = pSMBr->Fid; /* cifs fid stays in le */ - /* Let caller know file was created so we can set the mode. */ - /* Do we care about the CreateAction in any other cases? 
*/ - if (cpu_to_le32(FILE_CREATE) == pSMBr->CreateAction) - *pOplock |= CIFS_CREATE_ACTION; - if (pfile_info) { - memcpy((char *)pfile_info, (char *)&pSMBr->CreationTime, - 36 /* CreationTime to Attributes */); - /* the file_info buf is endian converted by caller */ - pfile_info->AllocationSize = pSMBr->AllocationSize; - pfile_info->EndOfFile = pSMBr->EndOfFile; - pfile_info->NumberOfLinks = cpu_to_le32(1); - pfile_info->DeletePending = 0; - } + cifs_buf_release(req); + if (rc == -EAGAIN) + goto openRetry; + return rc; } - cifs_buf_release(pSMB); - if (rc == -EAGAIN) - goto openRetry; + /* 1 byte no need to le_to_cpu */ + *oplock = rsp->OplockLevel; + /* cifs fid stays in le */ + oparms->fid->netfid = rsp->Fid; + + /* Let caller know file was created so we can set the mode. */ + /* Do we care about the CreateAction in any other cases? */ + if (cpu_to_le32(FILE_CREATE) == rsp->CreateAction) + *oplock |= CIFS_CREATE_ACTION; + + if (buf) { + /* copy from CreationTime to Attributes */ + memcpy((char *)buf, (char *)&rsp->CreationTime, 36); + /* the file_info buf is endian converted by caller */ + buf->AllocationSize = rsp->AllocationSize; + buf->EndOfFile = rsp->EndOfFile; + buf->NumberOfLinks = cpu_to_le32(1); + buf->DeletePending = 0; + } + + cifs_buf_release(req); return rc; } @@ -1890,7 +1910,7 @@ cifs_writev_requeue(struct cifs_writedata *wdata) do { server = tlink_tcon(wdata->cfile->tlink)->ses->server; - rc = server->ops->async_writev(wdata); + rc = server->ops->async_writev(wdata, cifs_writedata_release); } while (rc == -EAGAIN); for (i = 0; i < wdata->nr_pages; i++) { @@ -1942,15 +1962,9 @@ cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete) { struct cifs_writedata *wdata; - /* this would overflow */ - if (nr_pages == 0) { - cifs_dbg(VFS, "%s: called with nr_pages == 0!\n", __func__); - return NULL; - } - /* writedata + number of page pointers */ wdata = kzalloc(sizeof(*wdata) + - sizeof(struct page *) * (nr_pages - 1), GFP_NOFS); + sizeof(struct 
page *) * nr_pages, GFP_NOFS); if (wdata != NULL) { kref_init(&wdata->refcount); INIT_LIST_HEAD(&wdata->list); @@ -2011,7 +2025,8 @@ cifs_writev_callback(struct mid_q_entry *mid) /* cifs_async_writev - send an async write, and set up mid to handle result */ int -cifs_async_writev(struct cifs_writedata *wdata) +cifs_async_writev(struct cifs_writedata *wdata, + void (*release)(struct kref *kref)) { int rc = -EACCES; WRITE_REQ *smb = NULL; @@ -2085,7 +2100,7 @@ cifs_async_writev(struct cifs_writedata *wdata) if (rc == 0) cifs_stats_inc(&tcon->stats.cifs_stats.num_writes); else - kref_put(&wdata->refcount, cifs_writedata_release); + kref_put(&wdata->refcount, release); async_writev_out: cifs_small_buf_release(smb); diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index a514e0a65f69..3db0c5fd9a11 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -378,7 +378,7 @@ cifs_create_get_file_info: xid); else { rc = cifs_get_inode_info(&newinode, full_path, buf, inode->i_sb, - xid, &fid->netfid); + xid, fid); if (newinode) { if (server->ops->set_lease_key) server->ops->set_lease_key(newinode, fid); @@ -565,12 +565,13 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL; struct cifs_sb_info *cifs_sb; struct tcon_link *tlink; - struct cifs_tcon *pTcon; + struct cifs_tcon *tcon; struct cifs_io_parms io_parms; char *full_path = NULL; struct inode *newinode = NULL; int oplock = 0; - u16 fileHandle; + struct cifs_fid fid; + struct cifs_open_parms oparms; FILE_ALL_INFO *buf = NULL; unsigned int bytes_written; struct win_dev *pdev; @@ -583,7 +584,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, if (IS_ERR(tlink)) return PTR_ERR(tlink); - pTcon = tlink_tcon(tlink); + tcon = tlink_tcon(tlink); xid = get_xid(); @@ -593,7 +594,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, goto mknod_out; } - if (pTcon->unix_ext) { + if (tcon->unix_ext) { struct 
cifs_unix_set_info_args args = { .mode = mode & ~current_umask(), .ctime = NO_CHANGE_64, @@ -608,7 +609,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, args.uid = INVALID_UID; /* no change */ args.gid = INVALID_GID; /* no change */ } - rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, &args, + rc = CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); @@ -640,42 +641,44 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, if (backup_cred(cifs_sb)) create_options |= CREATE_OPEN_BACKUP_INTENT; - rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE, - GENERIC_WRITE, create_options, - &fileHandle, &oplock, buf, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = GENERIC_WRITE; + oparms.create_options = create_options; + oparms.disposition = FILE_CREATE; + oparms.path = full_path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = CIFS_open(xid, &oparms, &oplock, buf); if (rc) goto mknod_out; - /* BB Do not bother to decode buf since no local inode yet to put - * timestamps in, but we can reuse it safely */ + /* + * BB Do not bother to decode buf since no local inode yet to put + * timestamps in, but we can reuse it safely. 
+ */ pdev = (struct win_dev *)buf; - io_parms.netfid = fileHandle; + io_parms.netfid = fid.netfid; io_parms.pid = current->tgid; - io_parms.tcon = pTcon; + io_parms.tcon = tcon; io_parms.offset = 0; io_parms.length = sizeof(struct win_dev); if (S_ISCHR(mode)) { memcpy(pdev->type, "IntxCHR", 8); - pdev->major = - cpu_to_le64(MAJOR(device_number)); - pdev->minor = - cpu_to_le64(MINOR(device_number)); - rc = CIFSSMBWrite(xid, &io_parms, - &bytes_written, (char *)pdev, - NULL, 0); + pdev->major = cpu_to_le64(MAJOR(device_number)); + pdev->minor = cpu_to_le64(MINOR(device_number)); + rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, (char *)pdev, + NULL, 0); } else if (S_ISBLK(mode)) { memcpy(pdev->type, "IntxBLK", 8); - pdev->major = - cpu_to_le64(MAJOR(device_number)); - pdev->minor = - cpu_to_le64(MINOR(device_number)); - rc = CIFSSMBWrite(xid, &io_parms, - &bytes_written, (char *)pdev, - NULL, 0); + pdev->major = cpu_to_le64(MAJOR(device_number)); + pdev->minor = cpu_to_le64(MINOR(device_number)); + rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, (char *)pdev, + NULL, 0); } /* else if (S_ISFIFO) */ - CIFSSMBClose(xid, pTcon, fileHandle); + CIFSSMBClose(xid, tcon, fid.netfid); d_drop(direntry); /* FIXME: add code here to set EAs */ diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 5a5a87240fe2..53c15074bb36 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -244,7 +244,7 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, xid); else rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb, - xid, &fid->netfid); + xid, fid); out: kfree(buf); @@ -678,7 +678,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) /* * Can not refresh inode by passing in file_info buf to be returned by - * CIFSSMBOpen and then calling get_inode_info with returned buf since + * ops->open and then calling get_inode_info with returned buf since * file might have write behind data that needs to be flushed and server * version of file size can 
be stale. If we knew for sure that inode was * not dirty locally we could do this. @@ -2043,7 +2043,8 @@ retry: } wdata->pid = wdata->cfile->pid; server = tlink_tcon(wdata->cfile->tlink)->ses->server; - rc = server->ops->async_writev(wdata); + rc = server->ops->async_writev(wdata, + cifs_writedata_release); } while (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN); for (i = 0; i < nr_pages; ++i) @@ -2331,9 +2332,20 @@ size_t get_numpages(const size_t wsize, const size_t len, size_t *cur_len) } static void -cifs_uncached_writev_complete(struct work_struct *work) +cifs_uncached_writedata_release(struct kref *refcount) { int i; + struct cifs_writedata *wdata = container_of(refcount, + struct cifs_writedata, refcount); + + for (i = 0; i < wdata->nr_pages; i++) + put_page(wdata->pages[i]); + cifs_writedata_release(refcount); +} + +static void +cifs_uncached_writev_complete(struct work_struct *work) +{ struct cifs_writedata *wdata = container_of(work, struct cifs_writedata, work); struct inode *inode = wdata->cfile->dentry->d_inode; @@ -2347,12 +2359,7 @@ cifs_uncached_writev_complete(struct work_struct *work) complete(&wdata->done); - if (wdata->result != -EAGAIN) { - for (i = 0; i < wdata->nr_pages; i++) - put_page(wdata->pages[i]); - } - - kref_put(&wdata->refcount, cifs_writedata_release); + kref_put(&wdata->refcount, cifs_uncached_writedata_release); } /* attempt to send write to server, retry on any -EAGAIN errors */ @@ -2370,7 +2377,8 @@ cifs_uncached_retry_writev(struct cifs_writedata *wdata) if (rc != 0) continue; } - rc = server->ops->async_writev(wdata); + rc = server->ops->async_writev(wdata, + cifs_uncached_writedata_release); } while (rc == -EAGAIN); return rc; @@ -2381,7 +2389,7 @@ cifs_iovec_write(struct file *file, const struct iovec *iov, unsigned long nr_segs, loff_t *poffset) { unsigned long nr_pages, i; - size_t copied, len, cur_len; + size_t bytes, copied, len, cur_len; ssize_t total_written = 0; loff_t offset; struct iov_iter it; @@ -2436,14 
+2444,45 @@ cifs_iovec_write(struct file *file, const struct iovec *iov, save_len = cur_len; for (i = 0; i < nr_pages; i++) { - copied = min_t(const size_t, cur_len, PAGE_SIZE); + bytes = min_t(const size_t, cur_len, PAGE_SIZE); copied = iov_iter_copy_from_user(wdata->pages[i], &it, - 0, copied); + 0, bytes); cur_len -= copied; iov_iter_advance(&it, copied); + /* + * If we didn't copy as much as we expected, then that + * may mean we trod into an unmapped area. Stop copying + * at that point. On the next pass through the big + * loop, we'll likely end up getting a zero-length + * write and bailing out of it. + */ + if (copied < bytes) + break; } cur_len = save_len - cur_len; + /* + * If we have no data to send, then that probably means that + * the copy above failed altogether. That's most likely because + * the address in the iovec was bogus. Set the rc to -EFAULT, + * free anything we allocated and bail out. + */ + if (!cur_len) { + for (i = 0; i < nr_pages; i++) + put_page(wdata->pages[i]); + kfree(wdata); + rc = -EFAULT; + break; + } + + /* + * i + 1 now represents the number of pages we actually used in + * the copy phase above. Bring nr_pages down to that, and free + * any pages that we didn't use. 
+ */ + for ( ; nr_pages > i + 1; nr_pages--) + put_page(wdata->pages[nr_pages - 1]); + wdata->sync_mode = WB_SYNC_ALL; wdata->nr_pages = nr_pages; wdata->offset = (__u64)offset; @@ -2454,7 +2493,8 @@ cifs_iovec_write(struct file *file, const struct iovec *iov, wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE); rc = cifs_uncached_retry_writev(wdata); if (rc) { - kref_put(&wdata->refcount, cifs_writedata_release); + kref_put(&wdata->refcount, + cifs_uncached_writedata_release); break; } @@ -2496,7 +2536,7 @@ restart_loop: } } list_del_init(&wdata->list); - kref_put(&wdata->refcount, cifs_writedata_release); + kref_put(&wdata->refcount, cifs_uncached_writedata_release); } if (total_written > 0) @@ -2559,8 +2599,8 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov, if (rc > 0) { ssize_t err; - err = generic_write_sync(file, pos, rc); - if (err < 0 && rc > 0) + err = generic_write_sync(file, iocb->ki_pos - rc, rc); + if (err < 0) rc = err; } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 49719b8228e5..aadc2b68678b 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -383,10 +383,10 @@ int cifs_get_inode_info_unix(struct inode **pinode, /* check for Minshall+French symlinks */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { - int tmprc = CIFSCheckMFSymlink(xid, tcon, cifs_sb, &fattr, - full_path); + int tmprc = check_mf_symlink(xid, tcon, cifs_sb, &fattr, + full_path); if (tmprc) - cifs_dbg(FYI, "CIFSCheckMFSymlink: %d\n", tmprc); + cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc); } if (*pinode == NULL) { @@ -404,18 +404,20 @@ int cifs_get_inode_info_unix(struct inode **pinode, } static int -cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path, +cifs_sfu_type(struct cifs_fattr *fattr, const char *path, struct cifs_sb_info *cifs_sb, unsigned int xid) { int rc; int oplock = 0; - __u16 netfid; struct tcon_link *tlink; struct cifs_tcon *tcon; + struct cifs_fid fid; + struct cifs_open_parms oparms; struct cifs_io_parms io_parms; char 
buf[24]; unsigned int bytes_read; char *pbuf; + int buf_type = CIFS_NO_BUFFER; pbuf = buf; @@ -436,62 +438,69 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path, return PTR_ERR(tlink); tcon = tlink_tcon(tlink); - rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, GENERIC_READ, - CREATE_NOT_DIR, &netfid, &oplock, NULL, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc == 0) { - int buf_type = CIFS_NO_BUFFER; - /* Read header */ - io_parms.netfid = netfid; - io_parms.pid = current->tgid; - io_parms.tcon = tcon; - io_parms.offset = 0; - io_parms.length = 24; - rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, - &buf_type); - if ((rc == 0) && (bytes_read >= 8)) { - if (memcmp("IntxBLK", pbuf, 8) == 0) { - cifs_dbg(FYI, "Block device\n"); - fattr->cf_mode |= S_IFBLK; - fattr->cf_dtype = DT_BLK; - if (bytes_read == 24) { - /* we have enough to decode dev num */ - __u64 mjr; /* major */ - __u64 mnr; /* minor */ - mjr = le64_to_cpu(*(__le64 *)(pbuf+8)); - mnr = le64_to_cpu(*(__le64 *)(pbuf+16)); - fattr->cf_rdev = MKDEV(mjr, mnr); - } - } else if (memcmp("IntxCHR", pbuf, 8) == 0) { - cifs_dbg(FYI, "Char device\n"); - fattr->cf_mode |= S_IFCHR; - fattr->cf_dtype = DT_CHR; - if (bytes_read == 24) { - /* we have enough to decode dev num */ - __u64 mjr; /* major */ - __u64 mnr; /* minor */ - mjr = le64_to_cpu(*(__le64 *)(pbuf+8)); - mnr = le64_to_cpu(*(__le64 *)(pbuf+16)); - fattr->cf_rdev = MKDEV(mjr, mnr); - } - } else if (memcmp("IntxLNK", pbuf, 7) == 0) { - cifs_dbg(FYI, "Symlink\n"); - fattr->cf_mode |= S_IFLNK; - fattr->cf_dtype = DT_LNK; - } else { - fattr->cf_mode |= S_IFREG; /* file? 
*/ - fattr->cf_dtype = DT_REG; - rc = -EOPNOTSUPP; + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = GENERIC_READ; + oparms.create_options = CREATE_NOT_DIR; + oparms.disposition = FILE_OPEN; + oparms.path = path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = CIFS_open(xid, &oparms, &oplock, NULL); + if (rc) { + cifs_put_tlink(tlink); + return rc; + } + + /* Read header */ + io_parms.netfid = fid.netfid; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.offset = 0; + io_parms.length = 24; + + rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, &buf_type); + if ((rc == 0) && (bytes_read >= 8)) { + if (memcmp("IntxBLK", pbuf, 8) == 0) { + cifs_dbg(FYI, "Block device\n"); + fattr->cf_mode |= S_IFBLK; + fattr->cf_dtype = DT_BLK; + if (bytes_read == 24) { + /* we have enough to decode dev num */ + __u64 mjr; /* major */ + __u64 mnr; /* minor */ + mjr = le64_to_cpu(*(__le64 *)(pbuf+8)); + mnr = le64_to_cpu(*(__le64 *)(pbuf+16)); + fattr->cf_rdev = MKDEV(mjr, mnr); } + } else if (memcmp("IntxCHR", pbuf, 8) == 0) { + cifs_dbg(FYI, "Char device\n"); + fattr->cf_mode |= S_IFCHR; + fattr->cf_dtype = DT_CHR; + if (bytes_read == 24) { + /* we have enough to decode dev num */ + __u64 mjr; /* major */ + __u64 mnr; /* minor */ + mjr = le64_to_cpu(*(__le64 *)(pbuf+8)); + mnr = le64_to_cpu(*(__le64 *)(pbuf+16)); + fattr->cf_rdev = MKDEV(mjr, mnr); + } + } else if (memcmp("IntxLNK", pbuf, 7) == 0) { + cifs_dbg(FYI, "Symlink\n"); + fattr->cf_mode |= S_IFLNK; + fattr->cf_dtype = DT_LNK; } else { - fattr->cf_mode |= S_IFREG; /* then it is a file */ + fattr->cf_mode |= S_IFREG; /* file? 
*/ fattr->cf_dtype = DT_REG; - rc = -EOPNOTSUPP; /* or some unknown SFU type */ + rc = -EOPNOTSUPP; } - CIFSSMBClose(xid, tcon, netfid); + } else { + fattr->cf_mode |= S_IFREG; /* then it is a file */ + fattr->cf_dtype = DT_REG; + rc = -EOPNOTSUPP; /* or some unknown SFU type */ } + CIFSSMBClose(xid, tcon, fid.netfid); cifs_put_tlink(tlink); return rc; } @@ -518,10 +527,15 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, return PTR_ERR(tlink); tcon = tlink_tcon(tlink); - rc = CIFSSMBQAllEAs(xid, tcon, path, "SETFILEBITS", - ea_value, 4 /* size of buf */, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + if (tcon->ses->server->ops->query_all_EAs == NULL) { + cifs_put_tlink(tlink); + return -EOPNOTSUPP; + } + + rc = tcon->ses->server->ops->query_all_EAs(xid, tcon, path, + "SETFILEBITS", ea_value, 4 /* size of buf */, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); cifs_put_tlink(tlink); if (rc < 0) return (int)rc; @@ -663,7 +677,7 @@ cgfi_exit: int cifs_get_inode_info(struct inode **inode, const char *full_path, FILE_ALL_INFO *data, struct super_block *sb, int xid, - const __u16 *fid) + const struct cifs_fid *fid) { bool validinum = false; __u16 srchflgs; @@ -800,10 +814,10 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, /* check for Minshall+French symlinks */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { - tmprc = CIFSCheckMFSymlink(xid, tcon, cifs_sb, &fattr, - full_path); + tmprc = check_mf_symlink(xid, tcon, cifs_sb, &fattr, + full_path); if (tmprc) - cifs_dbg(FYI, "CIFSCheckMFSymlink: %d\n", tmprc); + cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc); } if (!*inode) { @@ -1032,7 +1046,8 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, { int oplock = 0; int rc; - __u16 netfid; + struct cifs_fid fid; + struct cifs_open_parms oparms; struct inode *inode = dentry->d_inode; struct cifsInodeInfo *cifsInode = 
CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); @@ -1055,10 +1070,16 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, goto out; } - rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, - DELETE|FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR, - &netfid, &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = DELETE | FILE_WRITE_ATTRIBUTES; + oparms.create_options = CREATE_NOT_DIR; + oparms.disposition = FILE_OPEN; + oparms.path = full_path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc != 0) goto out; @@ -1079,7 +1100,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, goto out_close; } info_buf->Attributes = cpu_to_le32(dosattr); - rc = CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid, + rc = CIFSSMBSetFileInfo(xid, tcon, info_buf, fid.netfid, current->tgid); /* although we would like to mark the file hidden if that fails we will still try to rename it */ @@ -1090,7 +1111,8 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, } /* rename the file */ - rc = CIFSSMBRenameOpenFile(xid, tcon, netfid, NULL, cifs_sb->local_nls, + rc = CIFSSMBRenameOpenFile(xid, tcon, fid.netfid, NULL, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc != 0) { @@ -1100,7 +1122,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, /* try to set DELETE_ON_CLOSE */ if (!cifsInode->delete_pending) { - rc = CIFSSMBSetFileDisposition(xid, tcon, true, netfid, + rc = CIFSSMBSetFileDisposition(xid, tcon, true, fid.netfid, current->tgid); /* * some samba versions return -ENOENT when we try to set the @@ -1120,7 +1142,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, } out_close: - CIFSSMBClose(xid, tcon, netfid); + CIFSSMBClose(xid, tcon, fid.netfid); out: 
kfree(info_buf); cifs_put_tlink(tlink); @@ -1132,13 +1154,13 @@ out: * them anyway. */ undo_rename: - CIFSSMBRenameOpenFile(xid, tcon, netfid, dentry->d_name.name, + CIFSSMBRenameOpenFile(xid, tcon, fid.netfid, dentry->d_name.name, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); undo_setattr: if (dosattr != origattr) { info_buf->Attributes = cpu_to_le32(origattr); - if (!CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid, + if (!CIFSSMBSetFileInfo(xid, tcon, info_buf, fid.netfid, current->tgid)) cifsInode->cifsAttrs = origattr; } @@ -1549,7 +1571,8 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, struct tcon_link *tlink; struct cifs_tcon *tcon; struct TCP_Server_Info *server; - __u16 srcfid; + struct cifs_fid fid; + struct cifs_open_parms oparms; int oplock, rc; tlink = cifs_sb_tlink(cifs_sb); @@ -1576,17 +1599,23 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, if (to_dentry->d_parent != from_dentry->d_parent) goto do_rename_exit; + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; /* open the file to be renamed -- we need DELETE perms */ - rc = CIFSSMBOpen(xid, tcon, from_path, FILE_OPEN, DELETE, - CREATE_NOT_DIR, &srcfid, &oplock, NULL, - cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + oparms.desired_access = DELETE; + oparms.create_options = CREATE_NOT_DIR; + oparms.disposition = FILE_OPEN; + oparms.path = from_path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc == 0) { - rc = CIFSSMBRenameOpenFile(xid, tcon, srcfid, + rc = CIFSSMBRenameOpenFile(xid, tcon, fid.netfid, (const char *) to_dentry->d_name.name, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - CIFSSMBClose(xid, tcon, srcfid); + CIFSSMBClose(xid, tcon, fid.netfid); } do_rename_exit: cifs_put_tlink(tlink); diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 92aee08483a5..264ece71bdb2 100644 --- a/fs/cifs/link.c +++ 
b/fs/cifs/link.c @@ -29,6 +29,10 @@ #include "cifs_debug.h" #include "cifs_fs_sb.h" +/* + * M-F Symlink Functions - Begin + */ + #define CIFS_MF_SYMLINK_LEN_OFFSET (4+1) #define CIFS_MF_SYMLINK_MD5_OFFSET (CIFS_MF_SYMLINK_LEN_OFFSET+(4+1)) #define CIFS_MF_SYMLINK_LINK_OFFSET (CIFS_MF_SYMLINK_MD5_OFFSET+(32+1)) @@ -91,10 +95,8 @@ symlink_hash_err: } static int -CIFSParseMFSymlink(const u8 *buf, - unsigned int buf_len, - unsigned int *_link_len, - char **_link_str) +parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len, + char **_link_str) { int rc; unsigned int link_len; @@ -137,7 +139,7 @@ CIFSParseMFSymlink(const u8 *buf, } static int -CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str) +format_mf_symlink(u8 *buf, unsigned int buf_len, const char *link_str) { int rc; unsigned int link_len; @@ -180,190 +182,94 @@ CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str) return 0; } +bool +couldbe_mf_symlink(const struct cifs_fattr *fattr) +{ + if (!S_ISREG(fattr->cf_mode)) + /* it's not a symlink */ + return false; + + if (fattr->cf_eof != CIFS_MF_SYMLINK_FILE_SIZE) + /* it's not a symlink */ + return false; + + return true; +} + static int -CIFSCreateMFSymLink(const unsigned int xid, struct cifs_tcon *tcon, - const char *fromName, const char *toName, - struct cifs_sb_info *cifs_sb) +create_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *fromName, + const char *toName) { int rc; - int oplock = 0; - int remap; - int create_options = CREATE_NOT_DIR; - __u16 netfid = 0; u8 *buf; unsigned int bytes_written = 0; - struct cifs_io_parms io_parms; - struct nls_table *nls_codepage; - - nls_codepage = cifs_sb->local_nls; - remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); if (!buf) return -ENOMEM; - rc = CIFSFormatMFSymlink(buf, CIFS_MF_SYMLINK_FILE_SIZE, toName); - if (rc != 0) { - kfree(buf); - 
return rc; - } - - if (backup_cred(cifs_sb)) - create_options |= CREATE_OPEN_BACKUP_INTENT; - - rc = CIFSSMBOpen(xid, tcon, fromName, FILE_CREATE, GENERIC_WRITE, - create_options, &netfid, &oplock, NULL, - nls_codepage, remap); - if (rc != 0) { - kfree(buf); - return rc; - } - - io_parms.netfid = netfid; - io_parms.pid = current->tgid; - io_parms.tcon = tcon; - io_parms.offset = 0; - io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE; + rc = format_mf_symlink(buf, CIFS_MF_SYMLINK_FILE_SIZE, toName); + if (rc) + goto out; - rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, buf, NULL, 0); - CIFSSMBClose(xid, tcon, netfid); - kfree(buf); - if (rc != 0) - return rc; + rc = tcon->ses->server->ops->create_mf_symlink(xid, tcon, cifs_sb, + fromName, buf, &bytes_written); + if (rc) + goto out; if (bytes_written != CIFS_MF_SYMLINK_FILE_SIZE) - return -EIO; - - return 0; + rc = -EIO; +out: + kfree(buf); + return rc; } static int -CIFSQueryMFSymLink(const unsigned int xid, struct cifs_tcon *tcon, - const unsigned char *searchName, char **symlinkinfo, - const struct nls_table *nls_codepage, int remap) +query_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const unsigned char *path, + char **symlinkinfo) { int rc; - int oplock = 0; - __u16 netfid = 0; - u8 *buf; - char *pbuf; - unsigned int bytes_read = 0; - int buf_type = CIFS_NO_BUFFER; + u8 *buf = NULL; unsigned int link_len = 0; - struct cifs_io_parms io_parms; - FILE_ALL_INFO file_info; - - rc = CIFSSMBOpen(xid, tcon, searchName, FILE_OPEN, GENERIC_READ, - CREATE_NOT_DIR, &netfid, &oplock, &file_info, - nls_codepage, remap); - if (rc != 0) - return rc; - - if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) { - CIFSSMBClose(xid, tcon, netfid); - /* it's not a symlink */ - return -EINVAL; - } + unsigned int bytes_read = 0; buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); if (!buf) return -ENOMEM; - pbuf = buf; - io_parms.netfid = netfid; - io_parms.pid = current->tgid; 
- io_parms.tcon = tcon; - io_parms.offset = 0; - io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE; - - rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, &buf_type); - CIFSSMBClose(xid, tcon, netfid); - if (rc != 0) { - kfree(buf); - return rc; - } - - rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, symlinkinfo); - kfree(buf); - if (rc != 0) - return rc; - - return 0; -} - -bool -CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr) -{ - if (!(fattr->cf_mode & S_IFREG)) - /* it's not a symlink */ - return false; - if (fattr->cf_eof != CIFS_MF_SYMLINK_FILE_SIZE) - /* it's not a symlink */ - return false; - - return true; -} - -int -open_query_close_cifs_symlink(const unsigned char *path, char *pbuf, - unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb, - unsigned int xid) -{ - int rc; - int oplock = 0; - __u16 netfid = 0; - struct tcon_link *tlink; - struct cifs_tcon *ptcon; - struct cifs_io_parms io_parms; - int buf_type = CIFS_NO_BUFFER; - FILE_ALL_INFO file_info; - - tlink = cifs_sb_tlink(cifs_sb); - if (IS_ERR(tlink)) - return PTR_ERR(tlink); - ptcon = tlink_tcon(tlink); + if (tcon->ses->server->ops->query_mf_symlink) + rc = tcon->ses->server->ops->query_mf_symlink(xid, tcon, + cifs_sb, path, buf, &bytes_read); + else + rc = -ENOSYS; - rc = CIFSSMBOpen(xid, ptcon, path, FILE_OPEN, GENERIC_READ, - CREATE_NOT_DIR, &netfid, &oplock, &file_info, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc != 0) { - cifs_put_tlink(tlink); - return rc; - } + if (rc) + goto out; - if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) { - CIFSSMBClose(xid, ptcon, netfid); - cifs_put_tlink(tlink); - /* it's not a symlink */ - return rc; + if (bytes_read == 0) { /* not a symlink */ + rc = -EINVAL; + goto out; } - io_parms.netfid = netfid; - io_parms.pid = current->tgid; - io_parms.tcon = ptcon; - io_parms.offset = 0; - io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE; - - rc = CIFSSMBRead(xid, &io_parms, pbytes_read, &pbuf, 
&buf_type); - CIFSSMBClose(xid, ptcon, netfid); - cifs_put_tlink(tlink); + rc = parse_mf_symlink(buf, bytes_read, &link_len, symlinkinfo); +out: + kfree(buf); return rc; } - int -CIFSCheckMFSymlink(unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, - const unsigned char *path) +check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, + const unsigned char *path) { int rc; u8 *buf = NULL; unsigned int link_len = 0; unsigned int bytes_read = 0; - if (!CIFSCouldBeMFSymlink(fattr)) + if (!couldbe_mf_symlink(fattr)) /* it's not a symlink */ return 0; @@ -372,8 +278,8 @@ CIFSCheckMFSymlink(unsigned int xid, struct cifs_tcon *tcon, return -ENOMEM; if (tcon->ses->server->ops->query_mf_symlink) - rc = tcon->ses->server->ops->query_mf_symlink(path, buf, - &bytes_read, cifs_sb, xid); + rc = tcon->ses->server->ops->query_mf_symlink(xid, tcon, + cifs_sb, path, buf, &bytes_read); else rc = -ENOSYS; @@ -383,7 +289,7 @@ CIFSCheckMFSymlink(unsigned int xid, struct cifs_tcon *tcon, if (bytes_read == 0) /* not a symlink */ goto out; - rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, NULL); + rc = parse_mf_symlink(buf, bytes_read, &link_len, NULL); if (rc == -EINVAL) { /* it's not a symlink */ rc = 0; @@ -403,6 +309,95 @@ out: return rc; } +/* + * SMB 1.0 Protocol specific functions + */ + +int +cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const unsigned char *path, + char *pbuf, unsigned int *pbytes_read) +{ + int rc; + int oplock = 0; + struct cifs_fid fid; + struct cifs_open_parms oparms; + struct cifs_io_parms io_parms; + int buf_type = CIFS_NO_BUFFER; + FILE_ALL_INFO file_info; + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = GENERIC_READ; + oparms.create_options = CREATE_NOT_DIR; + oparms.disposition = FILE_OPEN; + oparms.path = path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc 
= CIFS_open(xid, &oparms, &oplock, &file_info); + if (rc) + return rc; + + if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) + /* it's not a symlink */ + goto out; + + io_parms.netfid = fid.netfid; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.offset = 0; + io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE; + + rc = CIFSSMBRead(xid, &io_parms, pbytes_read, &pbuf, &buf_type); +out: + CIFSSMBClose(xid, tcon, fid.netfid); + return rc; +} + +int +cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const unsigned char *path, + char *pbuf, unsigned int *pbytes_written) +{ + int rc; + int oplock = 0; + struct cifs_fid fid; + struct cifs_open_parms oparms; + struct cifs_io_parms io_parms; + int create_options = CREATE_NOT_DIR; + + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = GENERIC_WRITE; + oparms.create_options = create_options; + oparms.disposition = FILE_OPEN; + oparms.path = path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = CIFS_open(xid, &oparms, &oplock, NULL); + if (rc) + return rc; + + io_parms.netfid = fid.netfid; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.offset = 0; + io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE; + + rc = CIFSSMBWrite(xid, &io_parms, pbytes_written, pbuf, NULL, 0); + CIFSSMBClose(xid, tcon, fid.netfid); + return rc; +} + +/* + * M-F Symlink Functions - End + */ + int cifs_hardlink(struct dentry *old_file, struct inode *inode, struct dentry *direntry) @@ -438,8 +433,10 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode, CIFS_MOUNT_MAP_SPECIAL_CHR); else { server = tcon->ses->server; - if (!server->ops->create_hardlink) - return -ENOSYS; + if (!server->ops->create_hardlink) { + rc = -ENOSYS; + goto cifs_hl_exit; + } rc = server->ops->create_hardlink(xid, tcon, from_name, to_name, cifs_sb); if ((rc == -EIO) || (rc == 
-EINVAL)) @@ -530,15 +527,10 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd) * and fallback to UNIX Extensions Symlinks. */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) - rc = CIFSQueryMFSymLink(xid, tcon, full_path, &target_path, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + rc = query_mf_symlink(xid, tcon, cifs_sb, full_path, + &target_path); - if ((rc != 0) && cap_unix(tcon->ses)) - rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path, - cifs_sb->local_nls); - else if (rc != 0 && server->ops->query_symlink) + if (rc != 0 && server->ops->query_symlink) rc = server->ops->query_symlink(xid, tcon, full_path, &target_path, cifs_sb); @@ -587,8 +579,7 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname) /* BB what if DFS and this volume is on different share? BB */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) - rc = CIFSCreateMFSymLink(xid, pTcon, full_path, symname, - cifs_sb); + rc = create_mf_symlink(xid, pTcon, cifs_sb, full_path, symname); else if (pTcon->unix_ext) rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname, cifs_sb->local_nls); diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 5940ecabbe6a..b15862e0f68c 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -749,7 +749,7 @@ static int cifs_filldir(char *find_entry, struct file *file, } if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) && - CIFSCouldBeMFSymlink(&fattr)) + couldbe_mf_symlink(&fattr)) /* * trying to get the type and mode can be slow, * so just call those regular files for now, and mark diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 5f5ba0dc2ee1..526fb89f9230 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -560,17 +560,24 @@ cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, if (!rc && (le32_to_cpu(data->Attributes) & ATTR_REPARSE)) { int tmprc; int oplock = 0; - __u16 netfid; + struct cifs_fid fid; + struct 
cifs_open_parms oparms; + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = FILE_READ_ATTRIBUTES; + oparms.create_options = 0; + oparms.disposition = FILE_OPEN; + oparms.path = full_path; + oparms.fid = &fid; + oparms.reconnect = false; /* Need to check if this is a symbolic link or not */ - tmprc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, - FILE_READ_ATTRIBUTES, 0, &netfid, &oplock, - NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + tmprc = CIFS_open(xid, &oparms, &oplock, NULL); if (tmprc == -EOPNOTSUPP) *symlink = true; else - CIFSSMBClose(xid, tcon, netfid); + CIFSSMBClose(xid, tcon, fid.netfid); } return rc; @@ -705,12 +712,7 @@ cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms, oparms->cifs_sb->local_nls, oparms->cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - return CIFSSMBOpen(xid, oparms->tcon, oparms->path, - oparms->disposition, oparms->desired_access, - oparms->create_options, &oparms->fid->netfid, oplock, - buf, oparms->cifs_sb->local_nls, - oparms->cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + return CIFS_open(xid, oparms, oplock, buf); } static void @@ -761,8 +763,9 @@ smb_set_file_info(struct inode *inode, const char *full_path, { int oplock = 0; int rc; - __u16 netfid; __u32 netpid; + struct cifs_fid fid; + struct cifs_open_parms oparms; struct cifsFileInfo *open_file; struct cifsInodeInfo *cinode = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); @@ -772,7 +775,7 @@ smb_set_file_info(struct inode *inode, const char *full_path, /* if the file is already open for write, just use that fileid */ open_file = find_writable_file(cinode, true); if (open_file) { - netfid = open_file->fid.netfid; + fid.netfid = open_file->fid.netfid; netpid = open_file->pid; tcon = tlink_tcon(open_file->tlink); goto set_via_filehandle; @@ -796,12 +799,17 @@ smb_set_file_info(struct inode *inode, const char *full_path, goto out; } - cifs_dbg(FYI, 
"calling SetFileInfo since SetPathInfo for times not supported by this server\n"); - rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, - SYNCHRONIZE | FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR, - &netfid, &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = SYNCHRONIZE | FILE_WRITE_ATTRIBUTES; + oparms.create_options = CREATE_NOT_DIR; + oparms.disposition = FILE_OPEN; + oparms.path = full_path; + oparms.fid = &fid; + oparms.reconnect = false; + cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for times not supported by this server\n"); + rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc != 0) { if (rc == -EIO) rc = -EINVAL; @@ -811,12 +819,12 @@ smb_set_file_info(struct inode *inode, const char *full_path, netpid = current->tgid; set_via_filehandle: - rc = CIFSSMBSetFileInfo(xid, tcon, buf, netfid, netpid); + rc = CIFSSMBSetFileInfo(xid, tcon, buf, fid.netfid, netpid); if (!rc) cinode->cifsAttrs = le32_to_cpu(buf->Attributes); if (open_file == NULL) - CIFSSMBClose(xid, tcon, netfid); + CIFSSMBClose(xid, tcon, fid.netfid); else cifsFileInfo_put(open_file); out: @@ -908,33 +916,80 @@ cifs_mand_lock(const unsigned int xid, struct cifsFileInfo *cfile, __u64 offset, } static int +cifs_unix_dfs_readlink(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, char **symlinkinfo, + const struct nls_table *nls_codepage) +{ +#ifdef CONFIG_CIFS_DFS_UPCALL + int rc; + unsigned int num_referrals = 0; + struct dfs_info3_param *referrals = NULL; + + rc = get_dfs_path(xid, tcon->ses, searchName, nls_codepage, + &num_referrals, &referrals, 0); + + if (!rc && num_referrals > 0) { + *symlinkinfo = kstrndup(referrals->node_name, + strlen(referrals->node_name), + GFP_KERNEL); + if (!*symlinkinfo) + rc = -ENOMEM; + free_dfs_info_array(referrals, num_referrals); + } + return rc; +#else /* No DFS support */ + return -EREMOTE; +#endif +} + +static 
int cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, const char *full_path, char **target_path, struct cifs_sb_info *cifs_sb) { int rc; int oplock = 0; - __u16 netfid; + struct cifs_fid fid; + struct cifs_open_parms oparms; cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path); - rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, - FILE_READ_ATTRIBUTES, OPEN_REPARSE_POINT, &netfid, - &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + /* Check for unix extensions */ + if (cap_unix(tcon->ses)) { + rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, target_path, + cifs_sb->local_nls); + if (rc == -EREMOTE) + rc = cifs_unix_dfs_readlink(xid, tcon, full_path, + target_path, + cifs_sb->local_nls); + + goto out; + } + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = FILE_READ_ATTRIBUTES; + oparms.create_options = OPEN_REPARSE_POINT; + oparms.disposition = FILE_OPEN; + oparms.path = full_path; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc) - return rc; + goto out; - rc = CIFSSMBQuerySymLink(xid, tcon, netfid, target_path, + rc = CIFSSMBQuerySymLink(xid, tcon, fid.netfid, target_path, cifs_sb->local_nls); - if (rc) { - CIFSSMBClose(xid, tcon, netfid); - return rc; - } + if (rc) + goto out_close; convert_delimiter(*target_path, '/'); - CIFSSMBClose(xid, tcon, netfid); - cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path); +out_close: + CIFSSMBClose(xid, tcon, fid.netfid); +out: + if (!rc) + cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path); return rc; } @@ -1009,8 +1064,18 @@ struct smb_version_operations smb1_operations = { .mand_lock = cifs_mand_lock, .mand_unlock_range = cifs_unlock_range, .push_mand_locks = cifs_push_mandatory_locks, - .query_mf_symlink = open_query_close_cifs_symlink, + .query_mf_symlink = cifs_query_mf_symlink, + .create_mf_symlink = cifs_create_mf_symlink, .is_read_op = cifs_is_read_op, 
+#ifdef CONFIG_CIFS_XATTR + .query_all_EAs = CIFSSMBQAllEAs, + .set_EA = CIFSSMBSetEA, +#endif /* CIFS_XATTR */ +#ifdef CONFIG_CIFS_ACL + .get_acl = get_cifs_acl, + .get_acl_by_fid = get_cifs_acl_by_fid, + .set_acl = set_cifs_acl, +#endif /* CIFS_ACL */ }; struct smb_version_values smb1_values = { diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h index c38350851b08..bc0bb9c34f72 100644 --- a/fs/cifs/smb2glob.h +++ b/fs/cifs/smb2glob.h @@ -57,4 +57,7 @@ #define SMB2_CMACAES_SIZE (16) #define SMB3_SIGNKEY_SIZE (16) +/* Maximum buffer size value we can send with 1 credit */ +#define SMB2_MAX_BUFFER_SIZE 65536 + #endif /* _SMB2_GLOB_H */ diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 757da3e54d3d..192f51a12cf1 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -182,11 +182,8 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) /* start with specified wsize, or default */ wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE; wsize = min_t(unsigned int, wsize, server->max_write); - /* - * limit write size to 2 ** 16, because we don't support multicredit - * requests now. - */ - wsize = min_t(unsigned int, wsize, 2 << 15); + /* set it to the maximum buffer size value we can send with 1 credit */ + wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE); return wsize; } @@ -200,11 +197,8 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) /* start with specified rsize, or default */ rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE; rsize = min_t(unsigned int, rsize, server->max_read); - /* - * limit write size to 2 ** 16, because we don't support multicredit - * requests now. 
- */ - rsize = min_t(unsigned int, rsize, 2 << 15); + /* set it to the maximum buffer size value we can send with 1 credit */ + rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE); return rsize; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 2013234b73ad..860344701067 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -413,7 +413,9 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) /* SMB2 only has an extended negflavor */ server->negflavor = CIFS_NEGFLAVOR_EXTENDED; - server->maxBuf = le32_to_cpu(rsp->MaxTransactSize); + /* set it to the maximum buffer size value we can send with 1 credit */ + server->maxBuf = min_t(unsigned int, le32_to_cpu(rsp->MaxTransactSize), + SMB2_MAX_BUFFER_SIZE); server->max_read = le32_to_cpu(rsp->MaxReadSize); server->max_write = le32_to_cpu(rsp->MaxWriteSize); /* BB Do we need to validate the SecurityMode? */ @@ -1890,7 +1892,8 @@ smb2_writev_callback(struct mid_q_entry *mid) /* smb2_async_writev - send an async write, and set up mid to handle result */ int -smb2_async_writev(struct cifs_writedata *wdata) +smb2_async_writev(struct cifs_writedata *wdata, + void (*release)(struct kref *kref)) { int rc = -EACCES; struct smb2_write_req *req = NULL; @@ -1938,7 +1941,7 @@ smb2_async_writev(struct cifs_writedata *wdata) smb2_writev_callback, wdata, 0); if (rc) { - kref_put(&wdata->refcount, cifs_writedata_release); + kref_put(&wdata->refcount, release); cifs_stats_fail_inc(tcon, SMB2_WRITE_HE); } diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 93adc64666f3..0ce48db20a65 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -123,7 +123,8 @@ extern int SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon, extern int smb2_async_readv(struct cifs_readdata *rdata); extern int SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes, char **buf, int *buf_type); -extern int smb2_async_writev(struct cifs_writedata *wdata); +extern int 
smb2_async_writev(struct cifs_writedata *wdata, + void (*release)(struct kref *kref)); extern int SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes, struct kvec *iov, int n_vec); extern int SMB2_echo(struct TCP_Server_Info *server); diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 09afda4cc58e..5ac836a86b18 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -82,9 +82,11 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name) goto remove_ea_exit; ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ - rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, NULL, - (__u16)0, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (pTcon->ses->server->ops->set_EA) + rc = pTcon->ses->server->ops->set_EA(xid, pTcon, + full_path, ea_name, NULL, (__u16)0, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); } remove_ea_exit: kfree(full_path); @@ -149,18 +151,22 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, cifs_dbg(FYI, "attempt to set cifs inode metadata\n"); ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ - rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, - (__u16)value_size, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (pTcon->ses->server->ops->set_EA) + rc = pTcon->ses->server->ops->set_EA(xid, pTcon, + full_path, ea_name, ea_value, (__u16)value_size, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto set_ea_exit; ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. 
prefix */ - rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, - (__u16)value_size, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (pTcon->ses->server->ops->set_EA) + rc = pTcon->ses->server->ops->set_EA(xid, pTcon, + full_path, ea_name, ea_value, (__u16)value_size, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL, strlen(CIFS_XATTR_CIFS_ACL)) == 0) { #ifdef CONFIG_CIFS_ACL @@ -170,8 +176,12 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, rc = -ENOMEM; } else { memcpy(pacl, ea_value, value_size); - rc = set_cifs_acl(pacl, value_size, - direntry->d_inode, full_path, CIFS_ACL_DACL); + if (pTcon->ses->server->ops->set_acl) + rc = pTcon->ses->server->ops->set_acl(pacl, + value_size, direntry->d_inode, + full_path, CIFS_ACL_DACL); + else + rc = -EOPNOTSUPP; if (rc == 0) /* force revalidate of the inode */ CIFS_I(direntry->d_inode)->time = 0; kfree(pacl); @@ -272,17 +282,21 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, /* revalidate/getattr then populate from inode */ } /* BB add else when above is implemented */ ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ - rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value, - buf_size, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (pTcon->ses->server->ops->query_all_EAs) + rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, + full_path, ea_name, ea_value, buf_size, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto get_ea_exit; ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. 
prefix */ - rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value, - buf_size, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (pTcon->ses->server->ops->query_all_EAs) + rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, + full_path, ea_name, ea_value, buf_size, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); } else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS, strlen(POSIX_ACL_XATTR_ACCESS)) == 0) { #ifdef CONFIG_CIFS_POSIX @@ -313,8 +327,11 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, u32 acllen; struct cifs_ntsd *pacl; - pacl = get_cifs_acl(cifs_sb, direntry->d_inode, - full_path, &acllen); + if (pTcon->ses->server->ops->get_acl == NULL) + goto get_ea_exit; /* rc already EOPNOTSUPP */ + + pacl = pTcon->ses->server->ops->get_acl(cifs_sb, + direntry->d_inode, full_path, &acllen); if (IS_ERR(pacl)) { rc = PTR_ERR(pacl); cifs_dbg(VFS, "%s: error %zd getting sec desc\n", @@ -400,11 +417,12 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size) /* if proc/fs/cifs/streamstoxattr is set then search server for EAs or streams to returns as xattrs */ - rc = CIFSSMBQAllEAs(xid, pTcon, full_path, NULL, data, - buf_size, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + if (pTcon->ses->server->ops->query_all_EAs) + rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, + full_path, NULL, data, buf_size, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); list_ea_exit: kfree(full_path); free_xid(xid); diff --git a/fs/dcookies.c b/fs/dcookies.c index ab5954b50267..ac44a69fbea9 100644 --- a/fs/dcookies.c +++ b/fs/dcookies.c @@ -204,7 +204,7 @@ out: } #ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, size_t, len) +COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, compat_size_t, len) { #ifdef __BIG_ENDIAN return 
sys_lookup_dcookie(((u64)w0 << 32) | w1, buf, len); diff --git a/fs/direct-io.c b/fs/direct-io.c index 0e04142d5962..160a5489a939 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -375,7 +375,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, bio = bio_alloc(GFP_KERNEL, nr_vecs); bio->bi_bdev = bdev; - bio->bi_sector = first_sector; + bio->bi_iter.bi_sector = first_sector; if (dio->is_async) bio->bi_end_io = dio_bio_end_aio; else @@ -719,7 +719,7 @@ static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio, if (sdio->bio) { loff_t cur_offset = sdio->cur_page_fs_offset; loff_t bio_next_offset = sdio->logical_offset_in_bio + - sdio->bio->bi_size; + sdio->bio->bi_iter.bi_size; /* * See whether this new request is contiguous with the old. diff --git a/fs/exec.c b/fs/exec.c index e1529b4c79b1..3d78fccdd723 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -748,11 +748,10 @@ EXPORT_SYMBOL(setup_arg_pages); #endif /* CONFIG_MMU */ -struct file *open_exec(const char *name) +static struct file *do_open_exec(struct filename *name) { struct file *file; int err; - struct filename tmp = { .name = name }; static const struct open_flags open_exec_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, .acc_mode = MAY_EXEC | MAY_OPEN, @@ -760,7 +759,7 @@ struct file *open_exec(const char *name) .lookup_flags = LOOKUP_FOLLOW, }; - file = do_filp_open(AT_FDCWD, &tmp, &open_exec_flags); + file = do_filp_open(AT_FDCWD, name, &open_exec_flags); if (IS_ERR(file)) goto out; @@ -784,6 +783,12 @@ exit: fput(file); return ERR_PTR(err); } + +struct file *open_exec(const char *name) +{ + struct filename tmp = { .name = name }; + return do_open_exec(&tmp); +} EXPORT_SYMBOL(open_exec); int kernel_read(struct file *file, loff_t offset, @@ -1162,7 +1167,7 @@ int prepare_bprm_creds(struct linux_binprm *bprm) return -ENOMEM; } -void free_bprm(struct linux_binprm *bprm) +static void free_bprm(struct linux_binprm *bprm) { free_arg_pages(bprm); if (bprm->cred) { @@ -1432,7 
+1437,7 @@ static int exec_binprm(struct linux_binprm *bprm) /* * sys_execve() executes a new program. */ -static int do_execve_common(const char *filename, +static int do_execve_common(struct filename *filename, struct user_arg_ptr argv, struct user_arg_ptr envp) { @@ -1441,6 +1446,9 @@ static int do_execve_common(const char *filename, struct files_struct *displaced; int retval; + if (IS_ERR(filename)) + return PTR_ERR(filename); + /* * We move the actual failure in case of RLIMIT_NPROC excess from * set*uid() to execve() because too many poorly written programs @@ -1473,7 +1481,7 @@ static int do_execve_common(const char *filename, check_unsafe_exec(bprm); current->in_execve = 1; - file = open_exec(filename); + file = do_open_exec(filename); retval = PTR_ERR(file); if (IS_ERR(file)) goto out_unmark; @@ -1481,8 +1489,7 @@ static int do_execve_common(const char *filename, sched_exec(); bprm->file = file; - bprm->filename = filename; - bprm->interp = filename; + bprm->filename = bprm->interp = filename->name; retval = bprm_mm_init(bprm); if (retval) @@ -1523,6 +1530,7 @@ static int do_execve_common(const char *filename, acct_update_integrals(current); task_numa_free(current); free_bprm(bprm); + putname(filename); if (displaced) put_files_struct(displaced); return retval; @@ -1544,10 +1552,11 @@ out_files: if (displaced) reset_files_struct(displaced); out_ret: + putname(filename); return retval; } -int do_execve(const char *filename, +int do_execve(struct filename *filename, const char __user *const __user *__argv, const char __user *const __user *__envp) { @@ -1557,7 +1566,7 @@ int do_execve(const char *filename, } #ifdef CONFIG_COMPAT -static int compat_do_execve(const char *filename, +static int compat_do_execve(struct filename *filename, const compat_uptr_t __user *__argv, const compat_uptr_t __user *__envp) { @@ -1607,25 +1616,13 @@ SYSCALL_DEFINE3(execve, const char __user *const __user *, argv, const char __user *const __user *, envp) { - struct filename *path 
= getname(filename); - int error = PTR_ERR(path); - if (!IS_ERR(path)) { - error = do_execve(path->name, argv, envp); - putname(path); - } - return error; + return do_execve(getname(filename), argv, envp); } #ifdef CONFIG_COMPAT asmlinkage long compat_sys_execve(const char __user * filename, const compat_uptr_t __user * argv, const compat_uptr_t __user * envp) { - struct filename *path = getname(filename); - int error = PTR_ERR(path); - if (!IS_ERR(path)) { - error = compat_do_execve(path->name, argv, envp); - putname(path); - } - return error; + return compat_do_execve(getname(filename), argv, envp); } #endif diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ece55565b9cd..d3a534fdc5ff 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -771,6 +771,8 @@ do { \ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ (einode)->xtime.tv_sec = \ (signed)le32_to_cpu((raw_inode)->xtime); \ + else \ + (einode)->xtime.tv_sec = 0; \ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ ext4_decode_extra_time(&(einode)->xtime, \ raw_inode->xtime ## _extra); \ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 10cff4736b11..74bc2d549c58 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3906,6 +3906,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, } else err = ret; map->m_flags |= EXT4_MAP_MAPPED; + map->m_pblk = newblock; if (allocated > map->m_len) allocated = map->m_len; map->m_len = allocated; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 43e64f6022eb..1a5073959f32 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -152,7 +152,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov, if (ret > 0) { ssize_t err; - err = generic_write_sync(file, pos, ret); + err = generic_write_sync(file, iocb->ki_pos - ret, ret); if (err < 0 && ret > 0) ret = err; } diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 6bea80614d77..a2a837f00407 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -140,7 +140,7 @@ static 
long swap_inode_boot_loader(struct super_block *sb, handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); if (IS_ERR(handle)) { err = -EINVAL; - goto swap_boot_out; + goto journal_err_out; } /* Protect extent tree against block allocations via delalloc */ @@ -198,6 +198,7 @@ static long swap_inode_boot_loader(struct super_block *sb, ext4_double_up_write_data_sem(inode, inode_bl); +journal_err_out: ext4_inode_resume_unlocked_dio(inode); ext4_inode_resume_unlocked_dio(inode_bl); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index d488f80ee32d..ab95508e3d40 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -65,9 +65,9 @@ static void ext4_finish_bio(struct bio *bio) { int i; int error = !test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec; - for (i = 0; i < bio->bi_vcnt; i++) { - struct bio_vec *bvec = &bio->bi_io_vec[i]; + bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; struct buffer_head *bh, *head; unsigned bio_start = bvec->bv_offset; @@ -298,7 +298,7 @@ ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) static void ext4_end_bio(struct bio *bio, int error) { ext4_io_end_t *io_end = bio->bi_private; - sector_t bi_sector = bio->bi_sector; + sector_t bi_sector = bio->bi_iter.bi_sector; BUG_ON(!io_end); bio->bi_end_io = NULL; @@ -366,7 +366,7 @@ static int io_submit_init_bio(struct ext4_io_submit *io, bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); if (!bio) return -ENOMEM; - bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; bio->bi_end_io = ext4_end_bio; bio->bi_private = ext4_get_io_end(io->io_end); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index c5adbb318a90..f3b84cd9de56 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -243,6 +243,7 @@ static int ext4_alloc_group_tables(struct super_block *sb, ext4_group_t group; ext4_group_t last_group; unsigned overhead; + __u16 uninit_mask = (flexbg_size > 
1) ? ~EXT4_BG_BLOCK_UNINIT : ~0; BUG_ON(flex_gd->count == 0 || group_data == NULL); @@ -266,7 +267,7 @@ next_group: src_group++; for (; src_group <= last_group; src_group++) { overhead = ext4_group_overhead_blocks(sb, src_group); - if (overhead != 0) + if (overhead == 0) last_blk += group_data[src_group - group].blocks_count; else break; @@ -280,8 +281,7 @@ next_group: group = ext4_get_group_number(sb, start_blk - 1); group -= group_data[0].group; group_data[group].free_blocks_count--; - if (flexbg_size > 1) - flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT; + flex_gd->bg_flags[group] &= uninit_mask; } /* Allocate inode bitmaps */ @@ -292,22 +292,30 @@ next_group: group = ext4_get_group_number(sb, start_blk - 1); group -= group_data[0].group; group_data[group].free_blocks_count--; - if (flexbg_size > 1) - flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT; + flex_gd->bg_flags[group] &= uninit_mask; } /* Allocate inode tables */ for (; it_index < flex_gd->count; it_index++) { - if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk) + unsigned int itb = EXT4_SB(sb)->s_itb_per_group; + ext4_fsblk_t next_group_start; + + if (start_blk + itb > last_blk) goto next_group; group_data[it_index].inode_table = start_blk; - group = ext4_get_group_number(sb, start_blk - 1); + group = ext4_get_group_number(sb, start_blk); + next_group_start = ext4_group_first_block_no(sb, group + 1); group -= group_data[0].group; - group_data[group].free_blocks_count -= - EXT4_SB(sb)->s_itb_per_group; - if (flexbg_size > 1) - flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT; + if (start_blk + itb > next_group_start) { + flex_gd->bg_flags[group + 1] &= uninit_mask; + overhead = start_blk + itb - next_group_start; + group_data[group + 1].free_blocks_count -= overhead; + itb -= overhead; + } + + group_data[group].free_blocks_count -= itb; + flex_gd->bg_flags[group] &= uninit_mask; start_blk += EXT4_SB(sb)->s_itb_per_group; } @@ -401,7 +409,7 @@ static int set_flexbg_block_bitmap(struct 
super_block *sb, handle_t *handle, start = ext4_group_first_block_no(sb, group); group -= flex_gd->groups[0].group; - count2 = sb->s_blocksize * 8 - (block - start); + count2 = EXT4_BLOCKS_PER_GROUP(sb) - (block - start); if (count2 > count) count2 = count; @@ -620,7 +628,7 @@ handle_ib: if (err) goto out; count = group_table_count[j]; - start = group_data[i].block_bitmap; + start = (&group_data[i].block_bitmap)[j]; block = start; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1f7784de05b6..710fed2377d4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3695,16 +3695,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) for (i = 0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_def_hash_version = es->s_def_hash_version; - i = le32_to_cpu(es->s_flags); - if (i & EXT2_FLAGS_UNSIGNED_HASH) - sbi->s_hash_unsigned = 3; - else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { + if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) { + i = le32_to_cpu(es->s_flags); + if (i & EXT2_FLAGS_UNSIGNED_HASH) + sbi->s_hash_unsigned = 3; + else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { #ifdef __CHAR_UNSIGNED__ - es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); - sbi->s_hash_unsigned = 3; + if (!(sb->s_flags & MS_RDONLY)) + es->s_flags |= + cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); + sbi->s_hash_unsigned = 3; #else - es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); + if (!(sb->s_flags & MS_RDONLY)) + es->s_flags |= + cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); #endif + } } /* Handle clustersize */ diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0ae558723506..2261ccdd0b5f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -26,40 +26,33 @@ static void f2fs_read_end_io(struct bio *bio, int err) { - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bvec; + int i; - do { + bio_for_each_segment_all(bvec, bio, i) { struct page *page = 
bvec->bv_page; - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - - if (unlikely(!uptodate)) { + if (!err) { + SetPageUptodate(page); + } else { ClearPageUptodate(page); SetPageError(page); - } else { - SetPageUptodate(page); } unlock_page(page); - } while (bvec >= bio->bi_io_vec); - + } bio_put(bio); } static void f2fs_write_end_io(struct bio *bio, int err) { - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - struct f2fs_sb_info *sbi = F2FS_SB(bvec->bv_page->mapping->host->i_sb); + struct f2fs_sb_info *sbi = F2FS_SB(bio->bi_io_vec->bv_page->mapping->host->i_sb); + struct bio_vec *bvec; + int i; - do { + bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - - if (unlikely(!uptodate)) { + if (unlikely(err)) { SetPageError(page); set_bit(AS_EIO, &page->mapping->flags); set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); @@ -67,7 +60,7 @@ static void f2fs_write_end_io(struct bio *bio, int err) } end_page_writeback(page); dec_page_count(sbi, F2FS_WRITEBACK); - } while (bvec >= bio->bi_io_vec); + } if (bio->bi_private) complete(bio->bi_private); @@ -91,7 +84,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, bio = bio_alloc(GFP_NOIO, npages); bio->bi_bdev = sbi->sb->s_bdev; - bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); + bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; return bio; diff --git a/fs/file.c b/fs/file.c index 771578b33fb6..db25c2bdfe46 100644 --- a/fs/file.c +++ b/fs/file.c @@ -34,7 +34,7 @@ static void *alloc_fdmem(size_t size) * vmalloc() if the allocation size will be considered "large" by the VM. 
*/ if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN); + void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY); if (data != NULL) return data; } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e0259a163f98..d754e3cf99a8 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -40,18 +40,13 @@ struct wb_writeback_work { long nr_pages; struct super_block *sb; - /* - * Write only inodes dirtied before this time. Don't forget to set - * older_than_this_is_set when you set this. - */ - unsigned long older_than_this; + unsigned long *older_than_this; enum writeback_sync_modes sync_mode; unsigned int tagged_writepages:1; unsigned int for_kupdate:1; unsigned int range_cyclic:1; unsigned int for_background:1; unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ - unsigned int older_than_this_is_set:1; enum wb_reason reason; /* why was writeback initiated? */ struct list_head list; /* pending work list */ @@ -252,10 +247,10 @@ static int move_expired_inodes(struct list_head *delaying_queue, int do_sb_sort = 0; int moved = 0; - WARN_ON_ONCE(!work->older_than_this_is_set); while (!list_empty(delaying_queue)) { inode = wb_inode(delaying_queue->prev); - if (inode_dirtied_after(inode, work->older_than_this)) + if (work->older_than_this && + inode_dirtied_after(inode, *work->older_than_this)) break; list_move(&inode->i_wb_list, &tmp); moved++; @@ -742,8 +737,6 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, .sync_mode = WB_SYNC_NONE, .range_cyclic = 1, .reason = reason, - .older_than_this = jiffies, - .older_than_this_is_set = 1, }; spin_lock(&wb->list_lock); @@ -802,13 +795,12 @@ static long wb_writeback(struct bdi_writeback *wb, { unsigned long wb_start = jiffies; long nr_pages = work->nr_pages; + unsigned long oldest_jif; struct inode *inode; long progress; - if (!work->older_than_this_is_set) { - work->older_than_this = jiffies; - work->older_than_this_is_set = 1; 
- } + oldest_jif = jiffies; + work->older_than_this = &oldest_jif; spin_lock(&wb->list_lock); for (;;) { @@ -842,10 +834,10 @@ static long wb_writeback(struct bdi_writeback *wb, * safe. */ if (work->for_kupdate) { - work->older_than_this = jiffies - + oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); } else if (work->for_background) - work->older_than_this = jiffies; + oldest_jif = jiffies; trace_writeback_start(wb->bdi, work); if (list_empty(&wb->b_io)) @@ -1357,21 +1349,18 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb); /** * sync_inodes_sb - sync sb inode pages - * @sb: the superblock - * @older_than_this: timestamp + * @sb: the superblock * * This function writes and waits on any dirty inode belonging to this - * superblock that has been dirtied before given timestamp. + * super_block. */ -void sync_inodes_sb(struct super_block *sb, unsigned long older_than_this) +void sync_inodes_sb(struct super_block *sb) { DECLARE_COMPLETION_ONSTACK(done); struct wb_writeback_work work = { .sb = sb, .sync_mode = WB_SYNC_ALL, .nr_pages = LONG_MAX, - .older_than_this = older_than_this, - .older_than_this_is_set = 1, .range_cyclic = 0, .done = &done, .reason = WB_REASON_SYNC, diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index e1959efad64f..b5ebc2d7d80d 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -50,6 +50,8 @@ void fscache_objlist_add(struct fscache_object *obj) struct fscache_object *xobj; struct rb_node **p = &fscache_object_list.rb_node, *parent = NULL; + ASSERT(RB_EMPTY_NODE(&obj->objlist_link)); + write_lock(&fscache_object_list_lock); while (*p) { @@ -75,6 +77,9 @@ void fscache_objlist_add(struct fscache_object *obj) */ void fscache_objlist_remove(struct fscache_object *obj) { + if (RB_EMPTY_NODE(&obj->objlist_link)) + return; + write_lock(&fscache_object_list_lock); BUG_ON(RB_EMPTY_ROOT(&fscache_object_list)); diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 53d35c504240..d3b4539f1651 100644 --- 
a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -314,6 +314,9 @@ void fscache_object_init(struct fscache_object *object, object->cache = cache; object->cookie = cookie; object->parent = NULL; +#ifdef CONFIG_FSCACHE_OBJECT_LIST + RB_CLEAR_NODE(&object->objlist_link); +#endif object->oob_event_mask = 0; for (t = object->oob_table; t->events; t++) diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 58f06400b7b8..76693793cedd 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -273,7 +273,7 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno) nrvecs = max(nrvecs/2, 1U); } - bio->bi_sector = blkno * (sb->s_blocksize >> 9); + bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9); bio->bi_bdev = sb->s_bdev; bio->bi_end_io = gfs2_end_log_write; bio->bi_private = sdp; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 1e712b566d76..c6872d09561a 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -238,7 +238,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) lock_page(page); bio = bio_alloc(GFP_NOFS, 1); - bio->bi_sector = sector * (sb->s_blocksize >> 9); + bio->bi_iter.bi_sector = sector * (sb->s_blocksize >> 9); bio->bi_bdev = sb->s_bdev; bio_add_page(bio, page, PAGE_SIZE, 0); diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 9ee62985e739..bdec66522de3 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -529,7 +529,7 @@ const struct inode_operations hfsplus_dir_inode_operations = { .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = hfsplus_listxattr, - .removexattr = hfsplus_removexattr, + .removexattr = generic_removexattr, #ifdef CONFIG_HFSPLUS_FS_POSIX_ACL .get_acl = hfsplus_get_posix_acl, .set_acl = hfsplus_set_posix_acl, diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 4551cbd6bd43..fa929f325f87 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -331,7 +331,7 @@ static const struct inode_operations hfsplus_file_inode_operations = { 
.setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = hfsplus_listxattr, - .removexattr = hfsplus_removexattr, + .removexattr = generic_removexattr, #ifdef CONFIG_HFSPLUS_FS_POSIX_ACL .get_acl = hfsplus_get_posix_acl, .set_acl = hfsplus_set_posix_acl, diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index 968eab5bc1f5..68537e8b7a09 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -75,7 +75,7 @@ int hfsplus_parse_options_remount(char *input, int *force) int token; if (!input) - return 0; + return 1; while ((p = strsep(&input, ",")) != NULL) { if (!*p) diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index e9a97a0d4314..3f999649587f 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -63,7 +63,7 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1); bio = bio_alloc(GFP_NOIO, 1); - bio->bi_sector = sector; + bio->bi_iter.bi_sector = sector; bio->bi_bdev = sb->s_bdev; if (!(rw & WRITE) && data) diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index 0b4a5c9b93c4..4e27edc082a4 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -11,6 +11,8 @@ #include "xattr.h" #include "acl.h" +static int hfsplus_removexattr(struct inode *inode, const char *name); + const struct xattr_handler *hfsplus_xattr_handlers[] = { &hfsplus_xattr_osx_handler, &hfsplus_xattr_user_handler, @@ -274,14 +276,8 @@ int __hfsplus_setxattr(struct inode *inode, const char *name, HFSPLUS_IS_RSRC(inode)) return -EOPNOTSUPP; - if (strncmp(name, XATTR_MAC_OSX_PREFIX, - XATTR_MAC_OSX_PREFIX_LEN) == 0) - name += XATTR_MAC_OSX_PREFIX_LEN; - - if (value == NULL) { - value = ""; - size = 0; - } + if (value == NULL) + return hfsplus_removexattr(inode, name); err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &cat_fd); if (err) { @@ -399,16 +395,11 @@ end_setxattr: return err; } -static inline int is_osx_xattr(const char *xattr_name) -{ - return !is_known_namespace(xattr_name); 
-} - static int name_len(const char *xattr_name, int xattr_name_len) { int len = xattr_name_len + 1; - if (is_osx_xattr(xattr_name)) + if (!is_known_namespace(xattr_name)) len += XATTR_MAC_OSX_PREFIX_LEN; return len; @@ -419,7 +410,7 @@ static int copy_name(char *buffer, const char *xattr_name, int name_len) int len = name_len; int offset = 0; - if (is_osx_xattr(xattr_name)) { + if (!is_known_namespace(xattr_name)) { strncpy(buffer, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN); offset += XATTR_MAC_OSX_PREFIX_LEN; len += XATTR_MAC_OSX_PREFIX_LEN; @@ -497,18 +488,6 @@ ssize_t __hfsplus_getxattr(struct inode *inode, const char *name, HFSPLUS_IS_RSRC(inode)) return -EOPNOTSUPP; - if (strncmp(name, XATTR_MAC_OSX_PREFIX, - XATTR_MAC_OSX_PREFIX_LEN) == 0) { - /* skip "osx." prefix */ - name += XATTR_MAC_OSX_PREFIX_LEN; - /* - * Don't allow retrieving properly prefixed attributes - * by prepending them with "osx." - */ - if (is_known_namespace(name)) - return -EOPNOTSUPP; - } - if (!strcmp_xattr_finder_info(name)) return hfsplus_getxattr_finder_info(inode, value, size); @@ -743,28 +722,18 @@ end_listxattr: return res; } -int hfsplus_removexattr(struct dentry *dentry, const char *name) +static int hfsplus_removexattr(struct inode *inode, const char *name) { int err = 0; - struct inode *inode = dentry->d_inode; struct hfs_find_data cat_fd; u16 flags; u16 cat_entry_type; int is_xattr_acl_deleted = 0; int is_all_xattrs_deleted = 0; - if ((!S_ISREG(inode->i_mode) && - !S_ISDIR(inode->i_mode)) || - HFSPLUS_IS_RSRC(inode)) - return -EOPNOTSUPP; - if (!HFSPLUS_SB(inode->i_sb)->attr_tree) return -EOPNOTSUPP; - if (strncmp(name, XATTR_MAC_OSX_PREFIX, - XATTR_MAC_OSX_PREFIX_LEN) == 0) - name += XATTR_MAC_OSX_PREFIX_LEN; - if (!strcmp_xattr_finder_info(name)) return -EOPNOTSUPP; @@ -838,8 +807,12 @@ static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name, if (len > HFSPLUS_ATTR_MAX_STRLEN) return -EOPNOTSUPP; - strcpy(xattr_name, XATTR_MAC_OSX_PREFIX); - 
strcpy(xattr_name + XATTR_MAC_OSX_PREFIX_LEN, name); + /* + * Don't allow retrieving properly prefixed attributes + * by prepending them with "osx." + */ + if (is_known_namespace(name)) + return -EOPNOTSUPP; return hfsplus_getxattr(dentry, xattr_name, buffer, size); } @@ -857,12 +830,13 @@ static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name, if (len > HFSPLUS_ATTR_MAX_STRLEN) return -EOPNOTSUPP; + /* + * Don't allow setting properly prefixed attributes + * by prepending them with "osx." + */ if (is_known_namespace(name)) return -EOPNOTSUPP; - strcpy(xattr_name, XATTR_MAC_OSX_PREFIX); - strcpy(xattr_name + XATTR_MAC_OSX_PREFIX_LEN, name); - return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); } diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h index 9e214490c313..288530cf80b5 100644 --- a/fs/hfsplus/xattr.h +++ b/fs/hfsplus/xattr.h @@ -40,8 +40,6 @@ static inline ssize_t hfsplus_getxattr(struct dentry *dentry, ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size); -int hfsplus_removexattr(struct dentry *dentry, const char *name); - int hfsplus_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr); diff --git a/fs/hpfs/alloc.c b/fs/hpfs/alloc.c index cdb84a838068..58b5106186d0 100644 --- a/fs/hpfs/alloc.c +++ b/fs/hpfs/alloc.c @@ -8,6 +8,58 @@ #include "hpfs_fn.h" +static void hpfs_claim_alloc(struct super_block *s, secno sec) +{ + struct hpfs_sb_info *sbi = hpfs_sb(s); + if (sbi->sb_n_free != (unsigned)-1) { + if (unlikely(!sbi->sb_n_free)) { + hpfs_error(s, "free count underflow, allocating sector %08x", sec); + sbi->sb_n_free = -1; + return; + } + sbi->sb_n_free--; + } +} + +static void hpfs_claim_free(struct super_block *s, secno sec) +{ + struct hpfs_sb_info *sbi = hpfs_sb(s); + if (sbi->sb_n_free != (unsigned)-1) { + if (unlikely(sbi->sb_n_free >= sbi->sb_fs_size)) { + hpfs_error(s, "free count overflow, freeing sector %08x", sec); + sbi->sb_n_free = -1; + return; + } + 
sbi->sb_n_free++; + } +} + +static void hpfs_claim_dirband_alloc(struct super_block *s, secno sec) +{ + struct hpfs_sb_info *sbi = hpfs_sb(s); + if (sbi->sb_n_free_dnodes != (unsigned)-1) { + if (unlikely(!sbi->sb_n_free_dnodes)) { + hpfs_error(s, "dirband free count underflow, allocating sector %08x", sec); + sbi->sb_n_free_dnodes = -1; + return; + } + sbi->sb_n_free_dnodes--; + } +} + +static void hpfs_claim_dirband_free(struct super_block *s, secno sec) +{ + struct hpfs_sb_info *sbi = hpfs_sb(s); + if (sbi->sb_n_free_dnodes != (unsigned)-1) { + if (unlikely(sbi->sb_n_free_dnodes >= sbi->sb_dirband_size / 4)) { + hpfs_error(s, "dirband free count overflow, freeing sector %08x", sec); + sbi->sb_n_free_dnodes = -1; + return; + } + sbi->sb_n_free_dnodes++; + } +} + /* * Check if a sector is allocated in bitmap * This is really slow. Turned on only if chk==2 @@ -203,9 +255,15 @@ secno hpfs_alloc_sector(struct super_block *s, secno near, unsigned n, int forwa } sec = 0; ret: + if (sec) { + i = 0; + do + hpfs_claim_alloc(s, sec + i); + while (unlikely(++i < n)); + } if (sec && f_p) { for (i = 0; i < forward; i++) { - if (!hpfs_alloc_if_possible(s, sec + i + 1)) { + if (!hpfs_alloc_if_possible(s, sec + n + i)) { hpfs_error(s, "Prealloc doesn't work! 
Wanted %d, allocated at %08x, can't allocate %d", forward, sec, i); sec = 0; break; @@ -228,6 +286,7 @@ static secno alloc_in_dirband(struct super_block *s, secno near) nr >>= 2; sec = alloc_in_bmp(s, (~0x3fff) | nr, 1, 0); if (!sec) return 0; + hpfs_claim_dirband_alloc(s, sec); return ((sec & 0x3fff) << 2) + sbi->sb_dirband_start; } @@ -242,6 +301,7 @@ int hpfs_alloc_if_possible(struct super_block *s, secno sec) bmp[(sec & 0x3fff) >> 5] &= cpu_to_le32(~(1 << (sec & 0x1f))); hpfs_mark_4buffers_dirty(&qbh); hpfs_brelse4(&qbh); + hpfs_claim_alloc(s, sec); return 1; } hpfs_brelse4(&qbh); @@ -275,6 +335,7 @@ void hpfs_free_sectors(struct super_block *s, secno sec, unsigned n) return; } bmp[(sec & 0x3fff) >> 5] |= cpu_to_le32(1 << (sec & 0x1f)); + hpfs_claim_free(s, sec); if (!--n) { hpfs_mark_4buffers_dirty(&qbh); hpfs_brelse4(&qbh); @@ -359,6 +420,7 @@ void hpfs_free_dnode(struct super_block *s, dnode_secno dno) bmp[ssec >> 5] |= cpu_to_le32(1 << (ssec & 0x1f)); hpfs_mark_4buffers_dirty(&qbh); hpfs_brelse4(&qbh); + hpfs_claim_dirband_free(s, dno); } } @@ -366,7 +428,7 @@ struct dnode *hpfs_alloc_dnode(struct super_block *s, secno near, dnode_secno *dno, struct quad_buffer_head *qbh) { struct dnode *d; - if (hpfs_count_one_bitmap(s, hpfs_sb(s)->sb_dmap) > FREE_DNODES_ADD) { + if (hpfs_get_free_dnodes(s) > FREE_DNODES_ADD) { if (!(*dno = alloc_in_dirband(s, near))) if (!(*dno = hpfs_alloc_sector(s, near, 4, 0))) return NULL; } else { diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c index 4d0a1afa058c..139ef1684d07 100644 --- a/fs/hpfs/buffer.c +++ b/fs/hpfs/buffer.c @@ -86,7 +86,6 @@ void *hpfs_get_sector(struct super_block *s, unsigned secno, struct buffer_head void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffer_head *qbh, int ahead) { - struct buffer_head *bh; char *data; hpfs_lock_assert(s); @@ -100,34 +99,32 @@ void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffe hpfs_prefetch_sectors(s, secno, 4 + ahead); + 
if (!(qbh->bh[0] = sb_bread(s, secno + 0))) goto bail0; + if (!(qbh->bh[1] = sb_bread(s, secno + 1))) goto bail1; + if (!(qbh->bh[2] = sb_bread(s, secno + 2))) goto bail2; + if (!(qbh->bh[3] = sb_bread(s, secno + 3))) goto bail3; + + if (likely(qbh->bh[1]->b_data == qbh->bh[0]->b_data + 1 * 512) && + likely(qbh->bh[2]->b_data == qbh->bh[0]->b_data + 2 * 512) && + likely(qbh->bh[3]->b_data == qbh->bh[0]->b_data + 3 * 512)) { + return qbh->data = qbh->bh[0]->b_data; + } + qbh->data = data = kmalloc(2048, GFP_NOFS); if (!data) { printk("HPFS: hpfs_map_4sectors: out of memory\n"); - goto bail; + goto bail4; } - qbh->bh[0] = bh = sb_bread(s, secno); - if (!bh) - goto bail0; - memcpy(data, bh->b_data, 512); - - qbh->bh[1] = bh = sb_bread(s, secno + 1); - if (!bh) - goto bail1; - memcpy(data + 512, bh->b_data, 512); - - qbh->bh[2] = bh = sb_bread(s, secno + 2); - if (!bh) - goto bail2; - memcpy(data + 2 * 512, bh->b_data, 512); - - qbh->bh[3] = bh = sb_bread(s, secno + 3); - if (!bh) - goto bail3; - memcpy(data + 3 * 512, bh->b_data, 512); + memcpy(data + 0 * 512, qbh->bh[0]->b_data, 512); + memcpy(data + 1 * 512, qbh->bh[1]->b_data, 512); + memcpy(data + 2 * 512, qbh->bh[2]->b_data, 512); + memcpy(data + 3 * 512, qbh->bh[3]->b_data, 512); return data; + bail4: + brelse(qbh->bh[3]); bail3: brelse(qbh->bh[2]); bail2: @@ -135,9 +132,6 @@ void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffe bail1: brelse(qbh->bh[0]); bail0: - kfree(data); - printk("HPFS: hpfs_map_4sectors: read error\n"); - bail: return NULL; } @@ -155,44 +149,54 @@ void *hpfs_get_4sectors(struct super_block *s, unsigned secno, return NULL; } - /*return hpfs_map_4sectors(s, secno, qbh, 0);*/ + if (!hpfs_get_sector(s, secno + 0, &qbh->bh[0])) goto bail0; + if (!hpfs_get_sector(s, secno + 1, &qbh->bh[1])) goto bail1; + if (!hpfs_get_sector(s, secno + 2, &qbh->bh[2])) goto bail2; + if (!hpfs_get_sector(s, secno + 3, &qbh->bh[3])) goto bail3; + + if (likely(qbh->bh[1]->b_data == 
qbh->bh[0]->b_data + 1 * 512) && + likely(qbh->bh[2]->b_data == qbh->bh[0]->b_data + 2 * 512) && + likely(qbh->bh[3]->b_data == qbh->bh[0]->b_data + 3 * 512)) { + return qbh->data = qbh->bh[0]->b_data; + } + if (!(qbh->data = kmalloc(2048, GFP_NOFS))) { printk("HPFS: hpfs_get_4sectors: out of memory\n"); - return NULL; + goto bail4; } - if (!(hpfs_get_sector(s, secno, &qbh->bh[0]))) goto bail0; - if (!(hpfs_get_sector(s, secno + 1, &qbh->bh[1]))) goto bail1; - if (!(hpfs_get_sector(s, secno + 2, &qbh->bh[2]))) goto bail2; - if (!(hpfs_get_sector(s, secno + 3, &qbh->bh[3]))) goto bail3; - memcpy(qbh->data, qbh->bh[0]->b_data, 512); - memcpy(qbh->data + 512, qbh->bh[1]->b_data, 512); - memcpy(qbh->data + 2*512, qbh->bh[2]->b_data, 512); - memcpy(qbh->data + 3*512, qbh->bh[3]->b_data, 512); return qbh->data; - bail3: brelse(qbh->bh[2]); - bail2: brelse(qbh->bh[1]); - bail1: brelse(qbh->bh[0]); - bail0: +bail4: + brelse(qbh->bh[3]); +bail3: + brelse(qbh->bh[2]); +bail2: + brelse(qbh->bh[1]); +bail1: + brelse(qbh->bh[0]); +bail0: return NULL; } void hpfs_brelse4(struct quad_buffer_head *qbh) { - brelse(qbh->bh[3]); - brelse(qbh->bh[2]); - brelse(qbh->bh[1]); + if (unlikely(qbh->data != qbh->bh[0]->b_data)) + kfree(qbh->data); brelse(qbh->bh[0]); - kfree(qbh->data); + brelse(qbh->bh[1]); + brelse(qbh->bh[2]); + brelse(qbh->bh[3]); } void hpfs_mark_4buffers_dirty(struct quad_buffer_head *qbh) { - memcpy(qbh->bh[0]->b_data, qbh->data, 512); - memcpy(qbh->bh[1]->b_data, qbh->data + 512, 512); - memcpy(qbh->bh[2]->b_data, qbh->data + 2 * 512, 512); - memcpy(qbh->bh[3]->b_data, qbh->data + 3 * 512, 512); + if (unlikely(qbh->data != qbh->bh[0]->b_data)) { + memcpy(qbh->bh[0]->b_data, qbh->data + 0 * 512, 512); + memcpy(qbh->bh[1]->b_data, qbh->data + 1 * 512, 512); + memcpy(qbh->bh[2]->b_data, qbh->data + 2 * 512, 512); + memcpy(qbh->bh[3]->b_data, qbh->data + 3 * 512, 512); + } mark_buffer_dirty(qbh->bh[0]); mark_buffer_dirty(qbh->bh[1]); mark_buffer_dirty(qbh->bh[2]); diff 
--git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 6797bf80f6e2..3ba49c080e42 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -312,7 +312,7 @@ static inline struct hpfs_sb_info *hpfs_sb(struct super_block *sb) __printf(2, 3) void hpfs_error(struct super_block *, const char *, ...); int hpfs_stop_cycles(struct super_block *, int, int *, int *, char *); -unsigned hpfs_count_one_bitmap(struct super_block *, secno); +unsigned hpfs_get_free_dnodes(struct super_block *); /* * local time (HPFS) to GMT (Unix) diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index b8d01ef6f531..4534ff688b76 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -121,7 +121,7 @@ static void hpfs_put_super(struct super_block *s) call_rcu(&hpfs_sb(s)->rcu, lazy_free_sbi); } -unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno) +static unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno) { struct quad_buffer_head qbh; unsigned long *bits; @@ -129,7 +129,7 @@ unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno) bits = hpfs_map_4sectors(s, secno, &qbh, 0); if (!bits) - return 0; + return (unsigned)-1; count = bitmap_weight(bits, 2048 * BITS_PER_BYTE); hpfs_brelse4(&qbh); return count; @@ -144,30 +144,45 @@ static unsigned count_bitmaps(struct super_block *s) hpfs_prefetch_bitmap(s, n); } for (n = 0; n < n_bands; n++) { + unsigned c; hpfs_prefetch_bitmap(s, n + COUNT_RD_AHEAD); - count += hpfs_count_one_bitmap(s, le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[n])); + c = hpfs_count_one_bitmap(s, le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[n])); + if (c != (unsigned)-1) + count += c; } return count; } +unsigned hpfs_get_free_dnodes(struct super_block *s) +{ + struct hpfs_sb_info *sbi = hpfs_sb(s); + if (sbi->sb_n_free_dnodes == (unsigned)-1) { + unsigned c = hpfs_count_one_bitmap(s, sbi->sb_dmap); + if (c == (unsigned)-1) + return 0; + sbi->sb_n_free_dnodes = c; + } + return sbi->sb_n_free_dnodes; +} + static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf) { 
struct super_block *s = dentry->d_sb; struct hpfs_sb_info *sbi = hpfs_sb(s); u64 id = huge_encode_dev(s->s_bdev->bd_dev); + hpfs_lock(s); - /*if (sbi->sb_n_free == -1) {*/ + if (sbi->sb_n_free == (unsigned)-1) sbi->sb_n_free = count_bitmaps(s); - sbi->sb_n_free_dnodes = hpfs_count_one_bitmap(s, sbi->sb_dmap); - /*}*/ + buf->f_type = s->s_magic; buf->f_bsize = 512; buf->f_blocks = sbi->sb_fs_size; buf->f_bfree = sbi->sb_n_free; buf->f_bavail = sbi->sb_n_free; buf->f_files = sbi->sb_dirband_size / 4; - buf->f_ffree = sbi->sb_n_free_dnodes; + buf->f_ffree = hpfs_get_free_dnodes(s); buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); buf->f_namelen = 254; diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 8360674c85bc..60bb365f54a5 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -514,11 +514,13 @@ int jbd2_journal_start_reserved(handle_t *handle, unsigned int type, * similarly constrained call sites */ ret = start_this_handle(journal, handle, GFP_NOFS); - if (ret < 0) + if (ret < 0) { jbd2_journal_free_reserved(handle); + return ret; + } handle->h_type = type; handle->h_line_no = line_no; - return ret; + return 0; } EXPORT_SYMBOL(jbd2_journal_start_reserved); diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index e973b85d6afd..5a8ea16eedbc 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -86,6 +86,8 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type, rc = posix_acl_equiv_mode(acl, &inode->i_mode); if (rc < 0) return rc; + inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); if (rc == 0) acl = NULL; break; diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 360d27c48887..8d811e02b4b9 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1998,20 +1998,20 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bio = bio_alloc(GFP_NOFS, 1); - bio->bi_sector = bp->l_blkno << (log->l2bsize - 9); + bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); bio->bi_bdev = 
log->bdev; bio->bi_io_vec[0].bv_page = bp->l_page; bio->bi_io_vec[0].bv_len = LOGPSIZE; bio->bi_io_vec[0].bv_offset = bp->l_offset; bio->bi_vcnt = 1; - bio->bi_size = LOGPSIZE; + bio->bi_iter.bi_size = LOGPSIZE; bio->bi_end_io = lbmIODone; bio->bi_private = bp; /*check if journaling to disk has been disabled*/ if (log->no_integrity) { - bio->bi_size = 0; + bio->bi_iter.bi_size = 0; lbmIODone(bio, 0); } else { submit_bio(READ_SYNC, bio); @@ -2144,21 +2144,21 @@ static void lbmStartIO(struct lbuf * bp) jfs_info("lbmStartIO\n"); bio = bio_alloc(GFP_NOFS, 1); - bio->bi_sector = bp->l_blkno << (log->l2bsize - 9); + bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); bio->bi_bdev = log->bdev; bio->bi_io_vec[0].bv_page = bp->l_page; bio->bi_io_vec[0].bv_len = LOGPSIZE; bio->bi_io_vec[0].bv_offset = bp->l_offset; bio->bi_vcnt = 1; - bio->bi_size = LOGPSIZE; + bio->bi_iter.bi_size = LOGPSIZE; bio->bi_end_io = lbmIODone; bio->bi_private = bp; /* check if journaling to disk has been disabled */ if (log->no_integrity) { - bio->bi_size = 0; + bio->bi_iter.bi_size = 0; lbmIODone(bio, 0); } else { submit_bio(WRITE_SYNC, bio); diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index d165cde0c68d..49ba7ff1bbb9 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -416,7 +416,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc) * count from hitting zero before we're through */ inc_io(page); - if (!bio->bi_size) + if (!bio->bi_iter.bi_size) goto dump_bio; submit_bio(WRITE, bio); nr_underway++; @@ -438,7 +438,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc) bio = bio_alloc(GFP_NOFS, 1); bio->bi_bdev = inode->i_sb->s_bdev; - bio->bi_sector = pblock << (inode->i_blkbits - 9); + bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 9); bio->bi_end_io = metapage_write_end_io; bio->bi_private = page; @@ -452,7 +452,7 @@ static int metapage_writepage(struct page *page, struct writeback_control 
*wbc) if (bio) { if (bio_add_page(bio, page, bio_bytes, bio_offset) < bio_bytes) goto add_failed; - if (!bio->bi_size) + if (!bio->bi_iter.bi_size) goto dump_bio; submit_bio(WRITE, bio); @@ -517,7 +517,8 @@ static int metapage_readpage(struct file *fp, struct page *page) bio = bio_alloc(GFP_NOFS, 1); bio->bi_bdev = inode->i_sb->s_bdev; - bio->bi_sector = pblock << (inode->i_blkbits - 9); + bio->bi_iter.bi_sector = + pblock << (inode->i_blkbits - 9); bio->bi_end_io = metapage_read_end_io; bio->bi_private = page; len = xlen << inode->i_blkbits; diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 5324e4e2b992..46325d5c34fc 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -791,6 +791,19 @@ int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name, /* Completely new ea list */ xattr_size = sizeof (struct jfs_ea_list); + /* + * The size of EA value is limitted by on-disk format up to + * __le16, there would be an overflow if the size is equal + * to XATTR_SIZE_MAX (65536). In order to avoid this issue, + * we can pre-checkup the value size against USHRT_MAX, and + * return -E2BIG in this case, which is consistent with the + * VFS setxattr interface. 
+ */ + if (value_len >= USHRT_MAX) { + rc = -E2BIG; + goto release; + } + ea = (struct jfs_ea *) ((char *) ealist + xattr_size); ea->flag = 0; ea->namelen = namelen; @@ -805,7 +818,7 @@ int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name, /* DEBUG - If we did this right, these number match */ if (xattr_size != new_size) { printk(KERN_ERR - "jfs_xsetattr: xattr_size = %d, new_size = %d\n", + "__jfs_setxattr: xattr_size = %d, new_size = %d\n", xattr_size, new_size); rc = -EINVAL; @@ -841,9 +854,6 @@ int jfs_setxattr(struct dentry *dentry, const char *name, const void *value, int rc; tid_t tid; - if ((rc = can_set_xattr(inode, name, value, value_len))) - return rc; - /* * If this is a request for a synthetic attribute in the system.* * namespace use the generic infrastructure to resolve a handler @@ -852,6 +862,9 @@ int jfs_setxattr(struct dentry *dentry, const char *name, const void *value, if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return generic_setxattr(dentry, name, value, value_len, flags); + if ((rc = can_set_xattr(inode, name, value, value_len))) + return rc; + if (value == NULL) { /* empty EA, do not remove */ value = ""; value_len = 0; @@ -1021,9 +1034,6 @@ int jfs_removexattr(struct dentry *dentry, const char *name) int rc; tid_t tid; - if ((rc = can_set_xattr(inode, name, NULL, 0))) - return rc; - /* * If this is a request for a synthetic attribute in the system.* * namespace use the generic infrastructure to resolve a handler @@ -1032,6 +1042,9 @@ int jfs_removexattr(struct dentry *dentry, const char *name) if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return generic_removexattr(dentry, name); + if ((rc = can_set_xattr(inode, name, NULL, 0))) + return rc; + tid = txBegin(inode->i_sb, 0); mutex_lock(&ji->commit_mutex); rc = __jfs_setxattr(tid, dentry->d_inode, name, NULL, 0, XATTR_REPLACE); @@ -1048,7 +1061,7 @@ int jfs_removexattr(struct dentry *dentry, const char *name) * attributes are handled 
directly. */ const struct xattr_handler *jfs_xattr_handlers[] = { -#ifdef JFS_POSIX_ACL +#ifdef CONFIG_JFS_POSIX_ACL &posix_acl_access_xattr_handler, &posix_acl_default_xattr_handler, #endif diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 5104cf5d25c5..bd6e18be6e1a 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -187,19 +187,23 @@ static void kernfs_deactivate(struct kernfs_node *kn) kn->u.completion = (void *)&wait; - rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); + if (kn->flags & KERNFS_LOCKDEP) + rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); /* atomic_add_return() is a mb(), put_active() will always see * the updated kn->u.completion. */ v = atomic_add_return(KN_DEACTIVATED_BIAS, &kn->active); if (v != KN_DEACTIVATED_BIAS) { - lock_contended(&kn->dep_map, _RET_IP_); + if (kn->flags & KERNFS_LOCKDEP) + lock_contended(&kn->dep_map, _RET_IP_); wait_for_completion(&wait); } - lock_acquired(&kn->dep_map, _RET_IP_); - rwsem_release(&kn->dep_map, 1, _RET_IP_); + if (kn->flags & KERNFS_LOCKDEP) { + lock_acquired(&kn->dep_map, _RET_IP_); + rwsem_release(&kn->dep_map, 1, _RET_IP_); + } } /** diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 0d6ce895a9ee..0f4152defe7b 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -94,6 +94,7 @@ const void *kernfs_super_ns(struct super_block *sb) * @fs_type: file_system_type of the fs being mounted * @flags: mount flags specified for the mount * @root: kernfs_root of the hierarchy being mounted + * @new_sb_created: tell the caller if we allocated a new superblock * @ns: optional namespace tag of the mount * * This is to be called from each kernfs user's file_system_type->mount() @@ -104,7 +105,8 @@ const void *kernfs_super_ns(struct super_block *sb) * The return value can be passed to the vfs layer verbatim. 
*/ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, - struct kernfs_root *root, const void *ns) + struct kernfs_root *root, bool *new_sb_created, + const void *ns) { struct super_block *sb; struct kernfs_super_info *info; @@ -122,6 +124,10 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, kfree(info); if (IS_ERR(sb)) return ERR_CAST(sb); + + if (new_sb_created) + *new_sb_created = !sb->s_root; + if (!sb->s_root) { error = kernfs_fill_super(sb); if (error) { diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index e066a3902973..ab798a88ec1d 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -779,6 +779,7 @@ nlmsvc_grant_blocked(struct nlm_block *block) struct nlm_file *file = block->b_file; struct nlm_lock *lock = &block->b_call->a_args.lock; int error; + loff_t fl_start, fl_end; dprintk("lockd: grant blocked lock %p\n", block); @@ -796,9 +797,16 @@ nlmsvc_grant_blocked(struct nlm_block *block) } /* Try the lock operation again */ + /* vfs_lock_file() can mangle fl_start and fl_end, but we need + * them unchanged for the GRANT_MSG + */ lock->fl.fl_flags |= FL_SLEEP; + fl_start = lock->fl.fl_start; + fl_end = lock->fl.fl_end; error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); lock->fl.fl_flags &= ~FL_SLEEP; + lock->fl.fl_start = fl_start; + lock->fl.fl_end = fl_end; switch (error) { case 0: diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index 0f95f0d0b313..76279e11982d 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -26,9 +26,9 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw) bio_vec.bv_len = PAGE_SIZE; bio_vec.bv_offset = 0; bio.bi_vcnt = 1; - bio.bi_size = PAGE_SIZE; bio.bi_bdev = bdev; - bio.bi_sector = page->index * (PAGE_SIZE >> 9); + bio.bi_iter.bi_sector = page->index * (PAGE_SIZE >> 9); + bio.bi_iter.bi_size = PAGE_SIZE; return submit_bio_wait(rw, &bio); } @@ -56,22 +56,18 @@ static DECLARE_WAIT_QUEUE_HEAD(wq); static void 
writeseg_end_io(struct bio *bio, int err) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bvec; + int i; struct super_block *sb = bio->bi_private; struct logfs_super *super = logfs_super(sb); - struct page *page; BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */ BUG_ON(err); - BUG_ON(bio->bi_vcnt == 0); - do { - page = bvec->bv_page; - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - - end_page_writeback(page); - page_cache_release(page); - } while (bvec >= bio->bi_io_vec); + + bio_for_each_segment_all(bvec, bio, i) { + end_page_writeback(bvec->bv_page); + page_cache_release(bvec->bv_page); + } bio_put(bio); if (atomic_dec_and_test(&super->s_pending_writes)) wake_up(&wq); @@ -96,9 +92,9 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, if (i >= max_pages) { /* Block layer cannot split bios :( */ bio->bi_vcnt = i; - bio->bi_size = i * PAGE_SIZE; + bio->bi_iter.bi_size = i * PAGE_SIZE; bio->bi_bdev = super->s_bdev; - bio->bi_sector = ofs >> 9; + bio->bi_iter.bi_sector = ofs >> 9; bio->bi_private = sb; bio->bi_end_io = writeseg_end_io; atomic_inc(&super->s_pending_writes); @@ -123,9 +119,9 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, unlock_page(page); } bio->bi_vcnt = nr_pages; - bio->bi_size = nr_pages * PAGE_SIZE; + bio->bi_iter.bi_size = nr_pages * PAGE_SIZE; bio->bi_bdev = super->s_bdev; - bio->bi_sector = ofs >> 9; + bio->bi_iter.bi_sector = ofs >> 9; bio->bi_private = sb; bio->bi_end_io = writeseg_end_io; atomic_inc(&super->s_pending_writes); @@ -188,9 +184,9 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, if (i >= max_pages) { /* Block layer cannot split bios :( */ bio->bi_vcnt = i; - bio->bi_size = i * PAGE_SIZE; + bio->bi_iter.bi_size = i * PAGE_SIZE; bio->bi_bdev = super->s_bdev; - bio->bi_sector = ofs >> 9; + bio->bi_iter.bi_sector = ofs >> 9; 
bio->bi_private = sb; bio->bi_end_io = erase_end_io; atomic_inc(&super->s_pending_writes); @@ -209,9 +205,9 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, bio->bi_io_vec[i].bv_offset = 0; } bio->bi_vcnt = nr_pages; - bio->bi_size = nr_pages * PAGE_SIZE; + bio->bi_iter.bi_size = nr_pages * PAGE_SIZE; bio->bi_bdev = super->s_bdev; - bio->bi_sector = ofs >> 9; + bio->bi_iter.bi_sector = ofs >> 9; bio->bi_private = sb; bio->bi_end_io = erase_end_io; atomic_inc(&super->s_pending_writes); diff --git a/fs/mpage.c b/fs/mpage.c index 0face1c4d4c6..4979ffa60aaa 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -43,16 +43,14 @@ */ static void mpage_end_io(struct bio *bio, int err) { - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bv; + int i; - do { - struct page *page = bvec->bv_page; + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); if (bio_data_dir(bio) == READ) { - if (uptodate) { + if (!err) { SetPageUptodate(page); } else { ClearPageUptodate(page); @@ -60,14 +58,15 @@ static void mpage_end_io(struct bio *bio, int err) } unlock_page(page); } else { /* bio_data_dir(bio) == WRITE */ - if (!uptodate) { + if (err) { SetPageError(page); if (page->mapping) set_bit(AS_EIO, &page->mapping->flags); } end_page_writeback(page); } - } while (bvec >= bio->bi_io_vec); + } + bio_put(bio); } @@ -94,7 +93,7 @@ mpage_alloc(struct block_device *bdev, if (bio) { bio->bi_bdev = bdev; - bio->bi_sector = first_sector; + bio->bi_iter.bi_sector = first_sector; } return bio; } diff --git a/fs/namei.c b/fs/namei.c index bcb838e2e52f..385f7817bfcc 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -196,6 +196,7 @@ recopy: goto error; result->uptr = filename; + result->aname = NULL; audit_getname(result); return result; @@ -209,7 +210,35 @@ getname(const char __user * filename) { return 
getname_flags(filename, 0, NULL); } -EXPORT_SYMBOL(getname); + +/* + * The "getname_kernel()" interface doesn't do pathnames longer + * than EMBEDDED_NAME_MAX. Deal with it - you're a kernel user. + */ +struct filename * +getname_kernel(const char * filename) +{ + struct filename *result; + char *kname; + int len; + + len = strlen(filename); + if (len >= EMBEDDED_NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + + result = __getname(); + if (unlikely(!result)) + return ERR_PTR(-ENOMEM); + + kname = (char *)result + sizeof(*result); + result->name = kname; + result->uptr = NULL; + result->aname = NULL; + result->separate = false; + + strlcpy(kname, filename, EMBEDDED_NAME_MAX); + return result; +} #ifdef CONFIG_AUDITSYSCALL void putname(struct filename *name) @@ -3927,10 +3956,13 @@ out_dput: done_path_create(&new_path, new_dentry); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); - if (!error) + if (!error) { + path_put(&old_path); goto retry; + } } if (retry_estale(error, how)) { + path_put(&old_path); how |= LOOKUP_REVAL; goto retry; } diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index e242bbf72972..56ff823ca82e 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -134,8 +134,8 @@ bl_submit_bio(int rw, struct bio *bio) if (bio) { get_parallel(bio->bi_private); dprintk("%s submitting %s bio %u@%llu\n", __func__, - rw == READ ? "read" : "write", - bio->bi_size, (unsigned long long)bio->bi_sector); + rw == READ ? 
"read" : "write", bio->bi_iter.bi_size, + (unsigned long long)bio->bi_iter.bi_sector); submit_bio(rw, bio); } return NULL; @@ -156,7 +156,8 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect, } if (bio) { - bio->bi_sector = isect - be->be_f_offset + be->be_v_offset; + bio->bi_iter.bi_sector = isect - be->be_f_offset + + be->be_v_offset; bio->bi_bdev = be->be_mdev; bio->bi_end_io = end_io; bio->bi_private = par; @@ -201,18 +202,14 @@ static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw, static void bl_end_io_read(struct bio *bio, int err) { struct parallel_io *par = bio->bi_private; - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bvec; + int i; - do { - struct page *page = bvec->bv_page; + if (!err) + bio_for_each_segment_all(bvec, bio, i) + SetPageUptodate(bvec->bv_page); - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - if (uptodate) - SetPageUptodate(page); - } while (bvec >= bio->bi_io_vec); - if (!uptodate) { + if (err) { struct nfs_read_data *rdata = par->data; struct nfs_pgio_header *header = rdata->header; @@ -383,20 +380,16 @@ static void mark_extents_written(struct pnfs_block_layout *bl, static void bl_end_io_write_zero(struct bio *bio, int err) { struct parallel_io *par = bio->bi_private; - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - - do { - struct page *page = bvec->bv_page; + struct bio_vec *bvec; + int i; - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); + bio_for_each_segment_all(bvec, bio, i) { /* This is the zeroing page we added */ - end_page_writeback(page); - page_cache_release(page); - } while (bvec >= bio->bi_io_vec); + end_page_writeback(bvec->bv_page); + page_cache_release(bvec->bv_page); + } - if (unlikely(!uptodate)) { + if (unlikely(err)) { struct nfs_write_data *data = par->data; struct 
nfs_pgio_header *header = data->header; @@ -519,7 +512,7 @@ bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be, isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) + (offset / SECTOR_SIZE); - bio->bi_sector = isect - be->be_f_offset + be->be_v_offset; + bio->bi_iter.bi_sector = isect - be->be_f_offset + be->be_v_offset; bio->bi_bdev = be->be_mdev; bio->bi_end_io = bl_read_single_end_io; diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index ef792f29f831..5d8ccecf5f5c 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -659,16 +659,19 @@ int nfs_async_inode_return_delegation(struct inode *inode, rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation == NULL) + goto out_enoent; - if (!clp->cl_mvops->match_stateid(&delegation->stateid, stateid)) { - rcu_read_unlock(); - return -ENOENT; - } + if (!clp->cl_mvops->match_stateid(&delegation->stateid, stateid)) + goto out_enoent; nfs_mark_return_delegation(server, delegation); rcu_read_unlock(); nfs_delegation_run_state_manager(clp); return 0; +out_enoent: + rcu_read_unlock(); + return -ENOENT; } static struct inode * diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index b266f734bd53..4a48fe4b84b6 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -274,6 +274,15 @@ out_eof: return -EBADCOOKIE; } +static bool +nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi) +{ + if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA)) + return false; + smp_rmb(); + return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags); +} + static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) { @@ -287,8 +296,8 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des struct nfs_open_dir_context *ctx = desc->file->private_data; new_pos = desc->current_index + i; - if (ctx->attr_gencount != nfsi->attr_gencount - || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) { + if 
(ctx->attr_gencount != nfsi->attr_gencount || + !nfs_readdir_inode_mapping_valid(nfsi)) { ctx->duped = 0; ctx->attr_gencount = nfsi->attr_gencount; } else if (new_pos < desc->ctx->pos) { @@ -1837,6 +1846,11 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) GFP_KERNEL)) { SetPageUptodate(page); unlock_page(page); + /* + * add_to_page_cache_lru() grabs an extra page refcount. + * Drop it here to avoid leaking this page later. + */ + page_cache_release(page); } else __free_page(page); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index ea00b34ff071..360114ae8b82 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -164,17 +164,16 @@ static void nfs_zap_caches_locked(struct inode *inode) if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { nfs_fscache_invalidate(inode); nfsi->cache_validity |= NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_LABEL | NFS_INO_INVALID_DATA | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_REVAL_PAGECACHE; } else nfsi->cache_validity |= NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_LABEL | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_REVAL_PAGECACHE; + nfs_zap_label_cache_locked(nfsi); } void nfs_zap_caches(struct inode *inode) @@ -266,6 +265,13 @@ nfs_init_locked(struct inode *inode, void *opaque) } #ifdef CONFIG_NFS_V4_SECURITY_LABEL +static void nfs_clear_label_invalid(struct inode *inode) +{ + spin_lock(&inode->i_lock); + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_LABEL; + spin_unlock(&inode->i_lock); +} + void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, struct nfs4_label *label) { @@ -283,6 +289,7 @@ void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, __func__, (char *)label->label, label->len, error); + nfs_clear_label_invalid(inode); } } @@ -977,11 +984,11 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map if (ret < 0) return ret; } - spin_lock(&inode->i_lock); - nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; - if 
(S_ISDIR(inode->i_mode)) + if (S_ISDIR(inode->i_mode)) { + spin_lock(&inode->i_lock); memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); - spin_unlock(&inode->i_lock); + spin_unlock(&inode->i_lock); + } nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); nfs_fscache_wait_on_invalidate(inode); @@ -1008,6 +1015,7 @@ static bool nfs_mapping_need_revalidate_inode(struct inode *inode) int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) { struct nfs_inode *nfsi = NFS_I(inode); + unsigned long *bitlock = &nfsi->flags; int ret = 0; /* swapfiles are not supposed to be shared. */ @@ -1019,12 +1027,46 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) if (ret < 0) goto out; } - if (nfsi->cache_validity & NFS_INO_INVALID_DATA) { - trace_nfs_invalidate_mapping_enter(inode); - ret = nfs_invalidate_mapping(inode, mapping); - trace_nfs_invalidate_mapping_exit(inode, ret); + + /* + * We must clear NFS_INO_INVALID_DATA first to ensure that + * invalidations that come in while we're shooting down the mappings + * are respected. But, that leaves a race window where one revalidator + * can clear the flag, and then another checks it before the mapping + * gets invalidated. Fix that by serializing access to this part of + * the function. + * + * At the same time, we need to allow other tasks to see whether we + * might be in the middle of invalidating the pages, so we only set + * the bit lock here if it looks like we're going to be doing that. 
+ */ + for (;;) { + ret = wait_on_bit(bitlock, NFS_INO_INVALIDATING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (ret) + goto out; + spin_lock(&inode->i_lock); + if (test_bit(NFS_INO_INVALIDATING, bitlock)) { + spin_unlock(&inode->i_lock); + continue; + } + if (nfsi->cache_validity & NFS_INO_INVALID_DATA) + break; + spin_unlock(&inode->i_lock); + goto out; } + set_bit(NFS_INO_INVALIDATING, bitlock); + smp_wmb(); + nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; + spin_unlock(&inode->i_lock); + trace_nfs_invalidate_mapping_enter(inode); + ret = nfs_invalidate_mapping(inode, mapping); + trace_nfs_invalidate_mapping_exit(inode, ret); + + clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); + smp_mb__after_clear_bit(); + wake_up_bit(bitlock, NFS_INO_INVALIDATING); out: return ret; } @@ -1613,7 +1655,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_blocks = fattr->du.nfs2.blocks; /* Update attrtimeo value if we're out of the unstable period */ - if (invalid & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) { + if (invalid & NFS_INO_INVALID_ATTR) { nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; @@ -1626,7 +1668,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } } invalid &= ~NFS_INO_INVALID_ATTR; - invalid &= ~NFS_INO_INVALID_LABEL; /* Don't invalidate the data if we were to blame */ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 8b5cc04a8611..b46cf5a67329 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -176,7 +176,8 @@ extern struct nfs_server *nfs4_create_server( extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *, struct nfs_fh *); extern int nfs4_update_server(struct nfs_server *server, const char *hostname, - struct sockaddr *sap, size_t salen); + struct sockaddr *sap, size_t salen, + struct net *net); 
extern void nfs_free_server(struct nfs_server *server); extern struct nfs_server *nfs_clone_server(struct nfs_server *, struct nfs_fh *, @@ -279,9 +280,18 @@ static inline void nfs4_label_free(struct nfs4_label *label) } return; } + +static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi) +{ + if (nfs_server_capable(&nfsi->vfs_inode, NFS_CAP_SECURITY_LABEL)) + nfsi->cache_validity |= NFS_INO_INVALID_LABEL; +} #else static inline struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) { return NULL; } static inline void nfs4_label_free(void *label) {} +static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi) +{ +} #endif /* CONFIG_NFS_V4_SECURITY_LABEL */ /* proc.c */ diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 9a5ca03fa539..871d6eda8dba 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -80,7 +80,7 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type) } if (res.acl_access != NULL) { - if (posix_acl_equiv_mode(res.acl_access, NULL) || + if ((posix_acl_equiv_mode(res.acl_access, NULL) == 0) || res.acl_access->a_count == 0) { posix_acl_release(res.acl_access); res.acl_access = NULL; @@ -113,7 +113,7 @@ getout: return ERR_PTR(status); } -int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, +static int __nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, struct posix_acl *dfacl) { struct nfs_server *server = NFS_SERVER(inode); @@ -198,6 +198,15 @@ out: return status; } +int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, + struct posix_acl *dfacl) +{ + int ret; + ret = __nfs3_proc_setacls(inode, acl, dfacl); + return (ret == -EOPNOTSUPP) ? 
0 : ret; + +} + int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type) { struct posix_acl *alloc = NULL, *dfacl = NULL; @@ -225,7 +234,7 @@ int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (IS_ERR(alloc)) goto fail; } - status = nfs3_proc_setacls(inode, acl, dfacl); + status = __nfs3_proc_setacls(inode, acl, dfacl); posix_acl_release(alloc); return status; @@ -233,25 +242,6 @@ fail: return PTR_ERR(alloc); } -int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode, - umode_t mode) -{ - struct posix_acl *default_acl, *acl; - int error; - - error = posix_acl_create(dir, &mode, &default_acl, &acl); - if (error) - return (error == -EOPNOTSUPP) ? 0 : error; - - error = nfs3_proc_setacls(inode, acl, default_acl); - - if (acl) - posix_acl_release(acl); - if (default_acl) - posix_acl_release(default_acl); - return error; -} - const struct xattr_handler *nfs3_xattr_handlers[] = { &posix_acl_access_xattr_handler, &posix_acl_default_xattr_handler, diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index d2255d705421..a462ef0fb5d6 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -18,6 +18,7 @@ #include <linux/lockd/bind.h> #include <linux/nfs_mount.h> #include <linux/freezer.h> +#include <linux/xattr.h> #include "iostat.h" #include "internal.h" @@ -924,11 +925,11 @@ static const struct inode_operations nfs3_dir_inode_operations = { .permission = nfs_permission, .getattr = nfs_getattr, .setattr = nfs_setattr, +#ifdef CONFIG_NFS_V3_ACL .listxattr = generic_listxattr, .getxattr = generic_getxattr, .setxattr = generic_setxattr, .removexattr = generic_removexattr, -#ifdef CONFIG_NFS_V3_ACL .get_acl = nfs3_get_acl, .set_acl = nfs3_set_acl, #endif @@ -938,11 +939,11 @@ static const struct inode_operations nfs3_file_inode_operations = { .permission = nfs_permission, .getattr = nfs_getattr, .setattr = nfs_setattr, +#ifdef CONFIG_NFS_V3_ACL .listxattr = generic_listxattr, .getxattr = generic_getxattr, .setxattr = 
generic_setxattr, .removexattr = generic_removexattr, -#ifdef CONFIG_NFS_V3_ACL .get_acl = nfs3_get_acl, .set_acl = nfs3_set_acl, #endif diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 5609edc742a0..a5b27c2d9689 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -270,6 +270,7 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser extern int nfs41_setup_sequence(struct nfs4_session *session, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, struct rpc_task *task); +extern int nfs41_sequence_done(struct rpc_task *, struct nfs4_sequence_res *); extern int nfs4_proc_create_session(struct nfs_client *, struct rpc_cred *); extern int nfs4_proc_destroy_session(struct nfs4_session *, struct rpc_cred *); extern int nfs4_proc_get_lease_time(struct nfs_client *clp, diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 73d4ecda1e36..0e46d3d1b6cc 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -170,7 +170,7 @@ void nfs41_shutdown_client(struct nfs_client *clp) void nfs40_shutdown_client(struct nfs_client *clp) { if (clp->cl_slot_tbl) { - nfs4_release_slot_table(clp->cl_slot_tbl); + nfs4_shutdown_slot_table(clp->cl_slot_tbl); kfree(clp->cl_slot_tbl); } } @@ -372,10 +372,7 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); - error = -EINVAL; - if (gssd_running(clp->cl_net)) - error = nfs_create_rpc_client(clp, timeparms, - RPC_AUTH_GSS_KRB5I); + error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I); if (error == -EINVAL) error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); if (error < 0) @@ -1138,6 +1135,7 @@ static int nfs_probe_destination(struct nfs_server *server) * @hostname: new end-point's hostname * @sap: new end-point's socket address * @salen: size of "sap" + * @net: net namespace * * The nfs_server must be quiescent before this function is invoked. 
* Either its session is drained (NFSv4.1+), or its transport is @@ -1146,13 +1144,13 @@ static int nfs_probe_destination(struct nfs_server *server) * Returns zero on success, or a negative errno value. */ int nfs4_update_server(struct nfs_server *server, const char *hostname, - struct sockaddr *sap, size_t salen) + struct sockaddr *sap, size_t salen, struct net *net) { struct nfs_client *clp = server->nfs_client; struct rpc_clnt *clnt = server->client; struct xprt_create xargs = { .ident = clp->cl_proto, - .net = &init_net, + .net = net, .dstaddr = sap, .addrlen = salen, .servername = hostname, @@ -1192,7 +1190,7 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname, error = nfs4_set_client(server, hostname, sap, salen, buf, clp->cl_rpcclient->cl_auth->au_flavor, clp->cl_proto, clnt->cl_timeout, - clp->cl_minorversion, clp->cl_net); + clp->cl_minorversion, net); nfs_put_client(clp); if (error != 0) { nfs_server_insert_lists(server); diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 03fd8be8c0c5..b9a35c05b60f 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -324,8 +324,9 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data) &rdata->res.seq_res, task)) return; - nfs4_set_rw_stateid(&rdata->args.stateid, rdata->args.context, - rdata->args.lock_context, FMODE_READ); + if (nfs4_set_rw_stateid(&rdata->args.stateid, rdata->args.context, + rdata->args.lock_context, FMODE_READ) == -EIO) + rpc_exit(task, -EIO); /* lost lock, terminate I/O */ } static void filelayout_read_call_done(struct rpc_task *task, void *data) @@ -335,8 +336,10 @@ static void filelayout_read_call_done(struct rpc_task *task, void *data) dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); if (test_bit(NFS_IOHDR_REDO, &rdata->header->flags) && - task->tk_status == 0) + task->tk_status == 0) { + nfs41_sequence_done(task, &rdata->res.seq_res); return; + } /* Note this may cause RPC to be resent */ 
rdata->header->mds_ops->rpc_call_done(task, data); @@ -433,8 +436,9 @@ static void filelayout_write_prepare(struct rpc_task *task, void *data) &wdata->res.seq_res, task)) return; - nfs4_set_rw_stateid(&wdata->args.stateid, wdata->args.context, - wdata->args.lock_context, FMODE_WRITE); + if (nfs4_set_rw_stateid(&wdata->args.stateid, wdata->args.context, + wdata->args.lock_context, FMODE_WRITE) == -EIO) + rpc_exit(task, -EIO); /* lost lock, terminate I/O */ } static void filelayout_write_call_done(struct rpc_task *task, void *data) @@ -442,8 +446,10 @@ static void filelayout_write_call_done(struct rpc_task *task, void *data) struct nfs_write_data *wdata = data; if (test_bit(NFS_IOHDR_REDO, &wdata->header->flags) && - task->tk_status == 0) + task->tk_status == 0) { + nfs41_sequence_done(task, &wdata->res.seq_res); return; + } /* Note this may cause RPC to be resent */ wdata->header->mds_ops->rpc_call_done(task, data); diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 4e7f05d3e9db..3d5dbf80d46a 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -121,9 +121,8 @@ static int nfs4_validate_fspath(struct dentry *dentry, } static size_t nfs_parse_server_name(char *string, size_t len, - struct sockaddr *sa, size_t salen, struct nfs_server *server) + struct sockaddr *sa, size_t salen, struct net *net) { - struct net *net = rpc_net_ns(server->client); ssize_t ret; ret = rpc_pton(net, string, len, sa, salen); @@ -223,6 +222,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, const struct nfs4_fs_location *location) { const size_t addr_bufsize = sizeof(struct sockaddr_storage); + struct net *net = rpc_net_ns(NFS_SB(mountdata->sb)->client); struct vfsmount *mnt = ERR_PTR(-ENOENT); char *mnt_path; unsigned int maxbuflen; @@ -248,8 +248,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, continue; mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len, - mountdata->addr, addr_bufsize, - 
NFS_SB(mountdata->sb)); + mountdata->addr, addr_bufsize, net); if (mountdata->addrlen == 0) continue; @@ -419,6 +418,7 @@ static int nfs4_try_replacing_one_location(struct nfs_server *server, const struct nfs4_fs_location *location) { const size_t addr_bufsize = sizeof(struct sockaddr_storage); + struct net *net = rpc_net_ns(server->client); struct sockaddr *sap; unsigned int s; size_t salen; @@ -440,7 +440,7 @@ static int nfs4_try_replacing_one_location(struct nfs_server *server, continue; salen = nfs_parse_server_name(buf->data, buf->len, - sap, addr_bufsize, server); + sap, addr_bufsize, net); if (salen == 0) continue; rpc_set_port(sap, NFS_PORT); @@ -450,7 +450,7 @@ static int nfs4_try_replacing_one_location(struct nfs_server *server, if (hostname == NULL) break; - error = nfs4_update_server(server, hostname, sap, salen); + error = nfs4_update_server(server, hostname, sap, salen, net); kfree(hostname); if (error == 0) break; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a1965329a12c..450bfedbe2f4 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -539,7 +539,7 @@ static int nfs40_sequence_done(struct rpc_task *task, struct nfs4_slot *slot = res->sr_slot; struct nfs4_slot_table *tbl; - if (!RPC_WAS_SENT(task)) + if (slot == NULL) goto out; tbl = slot->table; @@ -559,15 +559,10 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) { struct nfs4_session *session; struct nfs4_slot_table *tbl; + struct nfs4_slot *slot = res->sr_slot; bool send_new_highest_used_slotid = false; - if (!res->sr_slot) { - /* just wake up the next guy waiting since - * we may have not consumed a slot after all */ - dprintk("%s: No slot\n", __func__); - return; - } - tbl = res->sr_slot->table; + tbl = slot->table; session = tbl->session; spin_lock(&tbl->slot_tbl_lock); @@ -577,11 +572,11 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) if (tbl->highest_used_slotid > tbl->target_highest_slotid) send_new_highest_used_slotid = true; - 
if (nfs41_wake_and_assign_slot(tbl, res->sr_slot)) { + if (nfs41_wake_and_assign_slot(tbl, slot)) { send_new_highest_used_slotid = false; goto out_unlock; } - nfs4_free_slot(tbl, res->sr_slot); + nfs4_free_slot(tbl, slot); if (tbl->highest_used_slotid != NFS4_NO_SLOT) send_new_highest_used_slotid = false; @@ -592,19 +587,20 @@ out_unlock: nfs41_server_notify_highest_slotid_update(session->clp); } -static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) +int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) { struct nfs4_session *session; - struct nfs4_slot *slot; + struct nfs4_slot *slot = res->sr_slot; struct nfs_client *clp; bool interrupted = false; int ret = 1; + if (slot == NULL) + goto out_noaction; /* don't increment the sequence number if the task wasn't sent */ if (!RPC_WAS_SENT(task)) goto out; - slot = res->sr_slot; session = slot->table->session; if (slot->interrupted) { @@ -679,6 +675,7 @@ out: /* The session may be reset by one of the error handlers. 
*/ dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); nfs41_sequence_free_slot(res); +out_noaction: return ret; retry_nowait: if (rpc_restart_call_prepare(task)) { @@ -692,6 +689,7 @@ out_retry: rpc_delay(task, NFS4_POLL_RETRY_MAX); return 0; } +EXPORT_SYMBOL_GPL(nfs41_sequence_done); static int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) @@ -1622,15 +1620,15 @@ static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata) { struct nfs4_opendata *data = calldata; - nfs40_setup_sequence(data->o_arg.server, &data->o_arg.seq_args, - &data->o_res.seq_res, task); + nfs40_setup_sequence(data->o_arg.server, &data->c_arg.seq_args, + &data->c_res.seq_res, task); } static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) { struct nfs4_opendata *data = calldata; - nfs40_sequence_done(task, &data->o_res.seq_res); + nfs40_sequence_done(task, &data->c_res.seq_res); data->rpc_status = task->tk_status; if (data->rpc_status == 0) { @@ -1688,7 +1686,7 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data) }; int status; - nfs4_init_sequence(&data->o_arg.seq_args, &data->o_res.seq_res, 1); + nfs4_init_sequence(&data->c_arg.seq_args, &data->c_res.seq_res, 1); kref_get(&data->kref); data->rpc_done = 0; data->rpc_status = 0; @@ -2400,13 +2398,16 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, if (nfs4_copy_delegation_stateid(&arg.stateid, inode, fmode)) { /* Use that stateid */ - } else if (truncate && state != NULL && nfs4_valid_open_stateid(state)) { + } else if (truncate && state != NULL) { struct nfs_lockowner lockowner = { .l_owner = current->files, .l_pid = current->tgid, }; - nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE, - &lockowner); + if (!nfs4_valid_open_stateid(state)) + return -EBADF; + if (nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE, + &lockowner) == -EIO) + return -EBADF; } else nfs4_stateid_copy(&arg.stateid, &zero_stateid); @@ 
-2744,7 +2745,8 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME| NFS_CAP_CTIME|NFS_CAP_MTIME| NFS_CAP_SECURITY_LABEL); - if (res.attr_bitmask[0] & FATTR4_WORD0_ACL) + if (res.attr_bitmask[0] & FATTR4_WORD0_ACL && + res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL) server->caps |= NFS_CAP_ACLS; if (res.has_links != 0) server->caps |= NFS_CAP_HARDLINKS; @@ -4012,8 +4014,9 @@ static bool nfs4_stateid_is_current(nfs4_stateid *stateid, { nfs4_stateid current_stateid; - if (nfs4_set_rw_stateid(&current_stateid, ctx, l_ctx, fmode)) - return false; + /* If the current stateid represents a lost lock, then exit */ + if (nfs4_set_rw_stateid(&current_stateid, ctx, l_ctx, fmode) == -EIO) + return true; return nfs4_stateid_match(stateid, &current_stateid); } @@ -4321,9 +4324,7 @@ static int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) static inline int nfs4_server_supports_acls(struct nfs_server *server) { - return (server->caps & NFS_CAP_ACLS) - && (server->acl_bitmask & ACL4_SUPPORT_ALLOW_ACL) - && (server->acl_bitmask & ACL4_SUPPORT_DENY_ACL); + return server->caps & NFS_CAP_ACLS; } /* Assuming that XATTR_SIZE_MAX is a multiple of PAGE_SIZE, and that @@ -5831,8 +5832,7 @@ struct nfs_release_lockowner_data { struct nfs4_lock_state *lsp; struct nfs_server *server; struct nfs_release_lockowner_args args; - struct nfs4_sequence_args seq_args; - struct nfs4_sequence_res seq_res; + struct nfs_release_lockowner_res res; unsigned long timestamp; }; @@ -5840,7 +5840,7 @@ static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata { struct nfs_release_lockowner_data *data = calldata; nfs40_setup_sequence(data->server, - &data->seq_args, &data->seq_res, task); + &data->args.seq_args, &data->res.seq_res, task); data->timestamp = jiffies; } @@ -5849,7 +5849,7 @@ static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata) struct nfs_release_lockowner_data *data = calldata; struct
nfs_server *server = data->server; - nfs40_sequence_done(task, &data->seq_res); + nfs40_sequence_done(task, &data->res.seq_res); switch (task->tk_status) { case 0: @@ -5890,7 +5890,6 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st data = kmalloc(sizeof(*data), GFP_NOFS); if (!data) return -ENOMEM; - nfs4_init_sequence(&data->seq_args, &data->seq_res, 0); data->lsp = lsp; data->server = server; data->args.lock_owner.clientid = server->nfs_client->cl_clientid; @@ -5898,6 +5897,8 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st data->args.lock_owner.s_dev = server->s_dev; msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data); return 0; } diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c index cf883c7ae053..e799dc3c3b1d 100644 --- a/fs/nfs/nfs4session.c +++ b/fs/nfs/nfs4session.c @@ -231,14 +231,23 @@ out: return ret; } +/* + * nfs4_release_slot_table - release all slot table entries + */ +static void nfs4_release_slot_table(struct nfs4_slot_table *tbl) +{ + nfs4_shrink_slot_table(tbl, 0); +} + /** - * nfs4_release_slot_table - release resources attached to a slot table + * nfs4_shutdown_slot_table - release resources attached to a slot table * @tbl: slot table to shut down * */ -void nfs4_release_slot_table(struct nfs4_slot_table *tbl) +void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl) { - nfs4_shrink_slot_table(tbl, 0); + nfs4_release_slot_table(tbl); + rpc_destroy_wait_queue(&tbl->slot_tbl_waitq); } /** @@ -422,7 +431,7 @@ void nfs41_update_target_slotid(struct nfs4_slot_table *tbl, spin_unlock(&tbl->slot_tbl_lock); } -static void nfs4_destroy_session_slot_tables(struct nfs4_session *session) +static void nfs4_release_session_slot_tables(struct nfs4_session *session) { nfs4_release_slot_table(&session->fc_slot_table); 
nfs4_release_slot_table(&session->bc_slot_table); @@ -450,7 +459,7 @@ int nfs4_setup_session_slot_tables(struct nfs4_session *ses) if (status && tbl->slots == NULL) /* Fore and back channel share a connection so get * both slot tables or neither */ - nfs4_destroy_session_slot_tables(ses); + nfs4_release_session_slot_tables(ses); return status; } @@ -470,6 +479,12 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) return session; } +static void nfs4_destroy_session_slot_tables(struct nfs4_session *session) +{ + nfs4_shutdown_slot_table(&session->fc_slot_table); + nfs4_shutdown_slot_table(&session->bc_slot_table); +} + void nfs4_destroy_session(struct nfs4_session *session) { struct rpc_xprt *xprt; diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index 232306100651..b34ada9bc6a2 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -74,7 +74,7 @@ enum nfs4_session_state { extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl, unsigned int max_reqs, const char *queue); -extern void nfs4_release_slot_table(struct nfs4_slot_table *tbl); +extern void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl); extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl); extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot); extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index e5be72518bd7..0deb32105ccf 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -974,9 +974,6 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst, else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) { nfs4_stateid_copy(dst, &lsp->ls_stateid); ret = 0; - smp_rmb(); - if (!list_empty(&lsp->ls_seqid.list)) - ret = -EWOULDBLOCK; } spin_unlock(&state->state_lock); nfs4_put_lock_state(lsp); @@ -984,10 +981,9 @@ out: return ret; } -static int nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) +static void 
nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) { const nfs4_stateid *src; - int ret; int seq; do { @@ -996,12 +992,7 @@ static int nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) if (test_bit(NFS_OPEN_STATE, &state->flags)) src = &state->open_stateid; nfs4_stateid_copy(dst, src); - ret = 0; - smp_rmb(); - if (!list_empty(&state->owner->so_seqid.list)) - ret = -EWOULDBLOCK; } while (read_seqretry(&state->seqlock, seq)); - return ret; } /* @@ -1015,15 +1006,19 @@ int nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state, if (ret == -EIO) /* A lost lock - don't even consider delegations */ goto out; - if (nfs4_copy_delegation_stateid(dst, state->inode, fmode)) + /* returns true if delegation stateid found and copied */ + if (nfs4_copy_delegation_stateid(dst, state->inode, fmode)) { + ret = 0; goto out; + } if (ret != -ENOENT) /* nfs4_copy_delegation_stateid() didn't over-write * dst, so it still has the lock stateid which we now * choose to use. 
*/ goto out; - ret = nfs4_copy_open_stateid(dst, state); + nfs4_copy_open_stateid(dst, state); + ret = 0; out: if (nfs_server_capable(state->inode, NFS_CAP_STATEID_NFSV41)) dst->seqid = 0; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 8c21d69a9dc1..72f3bf1754ef 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3449,7 +3449,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint { __be32 *p; - *res = ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL; + *res = 0; if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) { diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 89fe741e58b1..59f838cdc009 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -36,6 +36,7 @@ __print_flags(v, "|", \ { 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \ { 1 << NFS_INO_STALE, "STALE" }, \ + { 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \ { 1 << NFS_INO_FLUSHING, "FLUSHING" }, \ { 1 << NFS_INO_FSCACHE, "FSCACHE" }, \ { 1 << NFS_INO_COMMIT, "COMMIT" }, \ diff --git a/fs/nfs/write.c b/fs/nfs/write.c index a44a87268a6e..9a3b6a4cd6b9 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -909,9 +909,14 @@ bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx) */ static bool nfs_write_pageuptodate(struct page *page, struct inode *inode) { + struct nfs_inode *nfsi = NFS_I(inode); + if (nfs_have_delegated_attributes(inode)) goto out; - if (NFS_I(inode)->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE)) + if (nfsi->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE)) + return false; + smp_rmb(); + if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags)) return false; out: return PageUptodate(page) != 0; diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h index 8b68218e2c1c..a812fd1b92a4 100644 --- a/fs/nfsd/acl.h +++ b/fs/nfsd/acl.h @@ -45,7 +45,7 @@ struct svc_rqst; struct nfs4_acl *nfs4_acl_new(int); int nfs4_acl_get_whotype(char *, u32); -int nfs4_acl_write_who(int 
who, char *p); +__be32 nfs4_acl_write_who(int who, __be32 **p, int *len); int nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_acl **acl); diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index d5c5b3e00266..b582f9ab6b2a 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -84,12 +84,4 @@ int nfsd_cache_lookup(struct svc_rqst *); void nfsd_cache_update(struct svc_rqst *, int, __be32 *); int nfsd_reply_cache_stats_open(struct inode *, struct file *); -#ifdef CONFIG_NFSD_V4 -void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp); -#else /* CONFIG_NFSD_V4 */ -static inline void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp) -{ -} -#endif /* CONFIG_NFSD_V4 */ - #endif /* NFSCACHE_H */ diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h index bf95f6b817a4..66e58db01936 100644 --- a/fs/nfsd/idmap.h +++ b/fs/nfsd/idmap.h @@ -56,7 +56,7 @@ static inline void nfsd_idmap_shutdown(struct net *net) __be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, kuid_t *); __be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, kgid_t *); -int nfsd_map_uid_to_name(struct svc_rqst *, kuid_t, char *); -int nfsd_map_gid_to_name(struct svc_rqst *, kgid_t, char *); +__be32 nfsd4_encode_user(struct svc_rqst *, kuid_t, __be32 **, int *); +__be32 nfsd4_encode_group(struct svc_rqst *, kgid_t, __be32 **, int *); #endif /* LINUX_NFSD_IDMAP_H */ diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 849a7c3ced22..d32b3aa6600d 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -95,6 +95,7 @@ struct nfsd_net { time_t nfsd4_grace; bool nfsd_net_up; + bool lockd_up; /* * Time of server startup diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 14d9ecb96cff..de6e39e12cb3 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -168,7 +168,7 @@ encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat) { *p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]); - *p++ = htonl((u32) stat->mode); + *p++ = 
htonl((u32) (stat->mode & S_IALLUGO)); *p++ = htonl((u32) stat->nlink); *p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid)); *p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid)); @@ -842,21 +842,21 @@ out: static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen) { - struct svc_fh fh; + struct svc_fh *fh = &cd->scratch; __be32 err; - fh_init(&fh, NFS3_FHSIZE); - err = compose_entry_fh(cd, &fh, name, namlen); + fh_init(fh, NFS3_FHSIZE); + err = compose_entry_fh(cd, fh, name, namlen); if (err) { *p++ = 0; *p++ = 0; goto out; } - p = encode_post_op_attr(cd->rqstp, p, &fh); + p = encode_post_op_attr(cd->rqstp, p, fh); *p++ = xdr_one; /* yes, a file handle follows */ - p = encode_fh(p, &fh); + p = encode_fh(p, fh); out: - fh_put(&fh); + fh_put(fh); return p; } diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 649ad7cf2204..d190e33d0ec2 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -38,6 +38,7 @@ #include <linux/nfs_fs.h> #include <linux/export.h> #include "nfsfh.h" +#include "nfsd.h" #include "acl.h" #include "vfs.h" @@ -150,17 +151,15 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); if (IS_ERR(pacl)) return PTR_ERR(pacl); - /* allocate for worst case: one (deny, allow) pair each: */ - size += 2 * pacl->a_count; } + /* allocate for worst case: one (deny, allow) pair each: */ + size += 2 * pacl->a_count; if (S_ISDIR(inode->i_mode)) { flags = NFS4_ACL_DIR; dpacl = get_acl(inode, ACL_TYPE_DEFAULT); if (dpacl) size += 2 * dpacl->a_count; - } else { - dpacl = NULL; } *acl = nfs4_acl_new(size); @@ -169,8 +168,7 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, goto out; } - if (pacl) - _posix_to_nfsv4_one(pacl, *acl, flags & ~NFS4_ACL_TYPE_DEFAULT); + _posix_to_nfsv4_one(pacl, *acl, flags & ~NFS4_ACL_TYPE_DEFAULT); if (dpacl) _posix_to_nfsv4_one(dpacl, *acl, flags | NFS4_ACL_TYPE_DEFAULT); @@ -916,17 
+914,22 @@ nfs4_acl_get_whotype(char *p, u32 len) return NFS4_ACL_WHO_NAMED; } -int -nfs4_acl_write_who(int who, char *p) +__be32 nfs4_acl_write_who(int who, __be32 **p, int *len) { int i; + int bytes; for (i = 0; i < ARRAY_SIZE(s2t_map); i++) { - if (s2t_map[i].type == who) { - memcpy(p, s2t_map[i].string, s2t_map[i].stringlen); - return s2t_map[i].stringlen; - } + if (s2t_map[i].type != who) + continue; + bytes = 4 + (XDR_QUADLEN(s2t_map[i].stringlen) << 2); + if (bytes > *len) + return nfserr_resource; + *p = xdr_encode_opaque(*p, s2t_map[i].string, + s2t_map[i].stringlen); + *len -= bytes; + return 0; } - BUG(); + WARN_ON_ONCE(1); return -1; } diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 4832fd819f88..c0dfde68742e 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -551,27 +551,46 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen return 0; } -static int -idmap_id_to_name(struct svc_rqst *rqstp, int type, u32 id, char *name) +static __be32 encode_ascii_id(u32 id, __be32 **p, int *buflen) +{ + char buf[11]; + int len; + int bytes; + + len = sprintf(buf, "%u", id); + bytes = 4 + (XDR_QUADLEN(len) << 2); + if (bytes > *buflen) + return nfserr_resource; + *p = xdr_encode_opaque(*p, buf, len); + *buflen -= bytes; + return 0; +} + +static __be32 idmap_id_to_name(struct svc_rqst *rqstp, int type, u32 id, __be32 **p, int *buflen) { struct ent *item, key = { .id = id, .type = type, }; int ret; + int bytes; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); ret = idmap_lookup(rqstp, idtoname_lookup, &key, nn->idtoname_cache, &item); if (ret == -ENOENT) - return sprintf(name, "%u", id); + return encode_ascii_id(id, p, buflen); if (ret) - return ret; + return nfserrno(ret); ret = strlen(item->name); - BUG_ON(ret > IDMAP_NAMESZ); - memcpy(name, item->name, ret); + WARN_ON_ONCE(ret > IDMAP_NAMESZ); + bytes = 4 + (XDR_QUADLEN(ret) << 2); + 
if (bytes > *buflen) + return nfserr_resource; + *p = xdr_encode_opaque(*p, item->name, ret); + *buflen -= bytes; cache_put(&item->h, nn->idtoname_cache); - return ret; + return 0; } static bool @@ -603,12 +622,11 @@ do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u return idmap_name_to_id(rqstp, type, name, namelen, id); } -static int -do_id_to_name(struct svc_rqst *rqstp, int type, u32 id, char *name) +static __be32 encode_name_from_id(struct svc_rqst *rqstp, int type, u32 id, __be32 **p, int *buflen) { if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS) - return sprintf(name, "%u", id); - return idmap_id_to_name(rqstp, type, id, name); + return encode_ascii_id(id, p, buflen); + return idmap_id_to_name(rqstp, type, id, p, buflen); } __be32 @@ -637,16 +655,14 @@ nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen, return status; } -int -nfsd_map_uid_to_name(struct svc_rqst *rqstp, kuid_t uid, char *name) +__be32 nfsd4_encode_user(struct svc_rqst *rqstp, kuid_t uid, __be32 **p, int *buflen) { u32 id = from_kuid(&init_user_ns, uid); - return do_id_to_name(rqstp, IDMAP_TYPE_USER, id, name); + return encode_name_from_id(rqstp, IDMAP_TYPE_USER, id, p, buflen); } -int -nfsd_map_gid_to_name(struct svc_rqst *rqstp, kgid_t gid, char *name) +__be32 nfsd4_encode_group(struct svc_rqst *rqstp, kgid_t gid, __be32 **p, int *buflen) { u32 id = from_kgid(&init_user_ns, gid); - return do_id_to_name(rqstp, IDMAP_TYPE_GROUP, id, name); + return encode_name_from_id(rqstp, IDMAP_TYPE_GROUP, id, p, buflen); } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 825b8a99b99b..82189b208af3 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -231,17 +231,16 @@ static void nfsd4_set_open_owner_reply_cache(struct nfsd4_compound_state *cstate } static __be32 -do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open) +do_open_lookup(struct svc_rqst *rqstp, struct 
nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh **resfh) { struct svc_fh *current_fh = &cstate->current_fh; - struct svc_fh *resfh; int accmode; __be32 status; - resfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL); - if (!resfh) + *resfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL); + if (!*resfh) return nfserr_jukebox; - fh_init(resfh, NFS4_FHSIZE); + fh_init(*resfh, NFS4_FHSIZE); open->op_truncate = 0; if (open->op_create) { @@ -266,12 +265,12 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru */ status = do_nfsd_create(rqstp, current_fh, open->op_fname.data, open->op_fname.len, &open->op_iattr, - resfh, open->op_createmode, + *resfh, open->op_createmode, (u32 *)open->op_verf.data, &open->op_truncate, &open->op_created); if (!status && open->op_label.len) - nfsd4_security_inode_setsecctx(resfh, &open->op_label, open->op_bmval); + nfsd4_security_inode_setsecctx(*resfh, &open->op_label, open->op_bmval); /* * Following rfc 3530 14.2.16, use the returned bitmask @@ -281,31 +280,32 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0) open->op_bmval[1] = (FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_MODIFY); - } else { + } else + /* + * Note this may exit with the parent still locked. + * We will hold the lock until nfsd4_open's final + * lookup, to prevent renames or unlinks until we've had + * a chance to an acquire a delegation if appropriate. 
+ */ status = nfsd_lookup(rqstp, current_fh, - open->op_fname.data, open->op_fname.len, resfh); - fh_unlock(current_fh); - } + open->op_fname.data, open->op_fname.len, *resfh); if (status) goto out; - status = nfsd_check_obj_isreg(resfh); + status = nfsd_check_obj_isreg(*resfh); if (status) goto out; if (is_create_with_attrs(open) && open->op_acl != NULL) - do_set_nfs4_acl(rqstp, resfh, open->op_acl, open->op_bmval); + do_set_nfs4_acl(rqstp, *resfh, open->op_acl, open->op_bmval); - nfsd4_set_open_owner_reply_cache(cstate, open, resfh); + nfsd4_set_open_owner_reply_cache(cstate, open, *resfh); accmode = NFSD_MAY_NOP; if (open->op_created || open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) accmode |= NFSD_MAY_OWNER_OVERRIDE; - status = do_open_permission(rqstp, resfh, open, accmode); + status = do_open_permission(rqstp, *resfh, open, accmode); set_change_info(&open->op_cinfo, current_fh); - fh_dup2(current_fh, resfh); out: - fh_put(resfh); - kfree(resfh); return status; } @@ -358,6 +358,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open) { __be32 status; + struct svc_fh *resfh = NULL; struct nfsd4_compoundres *resp; struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -424,7 +425,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, switch (open->op_claim_type) { case NFS4_OPEN_CLAIM_DELEGATE_CUR: case NFS4_OPEN_CLAIM_NULL: - status = do_open_lookup(rqstp, cstate, open); + status = do_open_lookup(rqstp, cstate, open, &resfh); if (status) goto out; break; @@ -440,6 +441,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = do_open_fhandle(rqstp, cstate, open); if (status) goto out; + resfh = &cstate->current_fh; break; case NFS4_OPEN_CLAIM_DELEG_PREV_FH: case NFS4_OPEN_CLAIM_DELEGATE_PREV: @@ -459,9 +461,14 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * successful, it (1) truncates the file if 
open->op_truncate was * set, (2) sets open->op_stateid, (3) sets open->op_delegation. */ - status = nfsd4_process_open2(rqstp, &cstate->current_fh, open); + status = nfsd4_process_open2(rqstp, resfh, open); WARN_ON(status && open->op_created); out: + if (resfh && resfh != &cstate->current_fh) { + fh_dup2(&cstate->current_fh, resfh); + fh_put(resfh); + kfree(resfh); + } nfsd4_cleanup_open_state(open, status); if (open->op_openowner && !nfsd4_has_session(cstate)) cstate->replay_owner = &open->op_openowner->oo_owner; @@ -1070,8 +1077,10 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, cstate->current_fh.fh_dentry, &p, count, verify->ve_bmval, rqstp, 0); - - /* this means that nfsd4_encode_fattr() ran out of space */ + /* + * If nfsd4_encode_fattr() ran out of space, assume that's because + * the attributes are longer (hence different) than those given: + */ if (status == nfserr_resource) status = nfserr_not_same; if (status) @@ -1525,7 +1534,8 @@ static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\ - 1 + 1 + 2 + /* eir_flags, spr_how, spo_must_enforce & _allow */\ + 1 + 1 + /* eir_flags, spr_how */\ + 4 + /* spo_must_enforce & _allow with bitmap */\ 2 + /*eir_server_owner.so_minor_id */\ /* eir_server_owner.so_major_id<> */\ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\ @@ -1882,6 +1892,7 @@ struct svc_version nfsd_version4 = { .vs_proc = nfsd_procedures4, .vs_dispatch = nfsd_dispatch, .vs_xdrsize = NFS4_SVC_XDRSIZE, + .vs_rpcb_optnl = 1, }; /* diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 105d6fa7c514..d5d070fbeb35 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -832,10 +832,11 @@ static void nfsd4_put_drc_mem(struct nfsd4_channel_attrs *ca) spin_unlock(&nfsd_drc_lock); } -static struct nfsd4_session *alloc_session(struct 
nfsd4_channel_attrs *attrs) +static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs, + struct nfsd4_channel_attrs *battrs) { - int numslots = attrs->maxreqs; - int slotsize = slot_bytes(attrs); + int numslots = fattrs->maxreqs; + int slotsize = slot_bytes(fattrs); struct nfsd4_session *new; int mem, i; @@ -852,6 +853,10 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *attrs) if (!new->se_slots[i]) goto out_free; } + + memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs)); + memcpy(&new->se_bchannel, battrs, sizeof(struct nfsd4_channel_attrs)); + return new; out_free: while (i--) @@ -997,8 +1002,7 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru list_add(&new->se_perclnt, &clp->cl_sessions); spin_unlock(&clp->cl_lock); spin_unlock(&nn->client_lock); - memcpy(&new->se_fchannel, &cses->fore_channel, - sizeof(struct nfsd4_channel_attrs)); + if (cses->flags & SESSION4_BACK_CHAN) { struct sockaddr *sa = svc_addr(rqstp); /* @@ -1851,6 +1855,11 @@ static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfs return nfs_ok; } +#define NFSD_CB_MAX_REQ_SZ ((NFS4_enc_cb_recall_sz + \ + RPC_MAX_HEADER_WITH_AUTH) * sizeof(__be32)) +#define NFSD_CB_MAX_RESP_SZ ((NFS4_dec_cb_recall_sz + \ + RPC_MAX_REPHEADER_WITH_AUTH) * sizeof(__be32)) + static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca) { ca->headerpadsz = 0; @@ -1861,9 +1870,9 @@ static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca) * less than 1k. 
Tighten up this estimate in the unlikely event * it turns out to be a problem for some client: */ - if (ca->maxreq_sz < NFS4_enc_cb_recall_sz + RPC_MAX_HEADER_WITH_AUTH) + if (ca->maxreq_sz < NFSD_CB_MAX_REQ_SZ) return nfserr_toosmall; - if (ca->maxresp_sz < NFS4_dec_cb_recall_sz + RPC_MAX_REPHEADER_WITH_AUTH) + if (ca->maxresp_sz < NFSD_CB_MAX_RESP_SZ) return nfserr_toosmall; ca->maxresp_cached = 0; if (ca->maxops < 2) @@ -1913,9 +1922,9 @@ nfsd4_create_session(struct svc_rqst *rqstp, return status; status = check_backchannel_attrs(&cr_ses->back_channel); if (status) - return status; + goto out_release_drc_mem; status = nfserr_jukebox; - new = alloc_session(&cr_ses->fore_channel); + new = alloc_session(&cr_ses->fore_channel, &cr_ses->back_channel); if (!new) goto out_release_drc_mem; conn = alloc_conn_from_crses(rqstp, cr_ses); @@ -3034,18 +3043,18 @@ static int nfs4_setlease(struct nfs4_delegation *dp) if (!fl) return -ENOMEM; fl->fl_file = find_readable_file(fp); - list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); status = vfs_setlease(fl->fl_file, fl->fl_type, &fl); - if (status) { - list_del_init(&dp->dl_perclnt); - locks_free_lock(fl); - return status; - } + if (status) + goto out_free; + list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); fp->fi_lease = fl; fp->fi_deleg_file = get_file(fl->fl_file); atomic_set(&fp->fi_delegees, 1); list_add(&dp->dl_perfile, &fp->fi_delegations); return 0; +out_free: + locks_free_lock(fl); + return status; } static int nfs4_set_delegation(struct nfs4_delegation *dp, struct nfs4_file *fp) @@ -3125,6 +3134,7 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh, goto out_no_deleg; break; case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_FH: /* * Let's not give out any delegations till everyone's * had the chance to reclaim theirs.... 
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index ee7237f99f54..63f2395c57ed 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -103,11 +103,6 @@ xdr_error: \ (x) = (u64)ntohl(*p++) << 32; \ (x) |= ntohl(*p++); \ } while (0) -#define READTIME(x) do { \ - p++; \ - (x) = ntohl(*p++); \ - p++; \ -} while (0) #define READMEM(x,nbytes) do { \ x = (char *)p; \ p += XDR_QUADLEN(nbytes); \ @@ -190,6 +185,15 @@ static int zero_clientid(clientid_t *clid) return (clid->cl_boot == 0) && (clid->cl_id == 0); } +/** + * defer_free - mark an allocation as deferred freed + * @argp: NFSv4 compound argument structure to be freed with + * @release: release callback to free @p, typically kfree() + * @p: pointer to be freed + * + * Marks @p to be freed when processing the compound operation + * described in @argp finishes. + */ static int defer_free(struct nfsd4_compoundargs *argp, void (*release)(const void *), void *p) @@ -206,6 +210,16 @@ defer_free(struct nfsd4_compoundargs *argp, return 0; } +/** + * savemem - duplicate a chunk of memory for later processing + * @argp: NFSv4 compound argument structure to be freed with + * @p: pointer to be duplicated + * @nbytes: length to be duplicated + * + * Returns a pointer to a copy of @nbytes bytes of memory at @p + * that are preserved until processing of the NFSv4 compound + * operation described by @argp finishes. 
+ */ static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes) { if (p == argp->tmp) { @@ -257,7 +271,6 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, int expected_len, len = 0; u32 dummy32; char *buf; - int host_err; DECODE_HEAD; iattr->ia_valid = 0; @@ -284,10 +297,9 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, return nfserr_resource; *acl = nfs4_acl_new(nace); - if (*acl == NULL) { - host_err = -ENOMEM; - goto out_nfserr; - } + if (*acl == NULL) + return nfserr_jukebox; + defer_free(argp, kfree, *acl); (*acl)->naces = nace; @@ -425,10 +437,6 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, goto xdr_error; DECODE_TAIL; - -out_nfserr: - status = nfserrno(host_err); - goto out; } static __be32 @@ -1957,56 +1965,16 @@ static u32 nfs4_file_type(umode_t mode) }; } -static __be32 -nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, kuid_t uid, kgid_t gid, - __be32 **p, int *buflen) -{ - int status; - - if (*buflen < (XDR_QUADLEN(IDMAP_NAMESZ) << 2) + 4) - return nfserr_resource; - if (whotype != NFS4_ACL_WHO_NAMED) - status = nfs4_acl_write_who(whotype, (u8 *)(*p + 1)); - else if (gid_valid(gid)) - status = nfsd_map_gid_to_name(rqstp, gid, (u8 *)(*p + 1)); - else - status = nfsd_map_uid_to_name(rqstp, uid, (u8 *)(*p + 1)); - if (status < 0) - return nfserrno(status); - *p = xdr_encode_opaque(*p, NULL, status); - *buflen -= (XDR_QUADLEN(status) << 2) + 4; - BUG_ON(*buflen < 0); - return 0; -} - -static inline __be32 -nfsd4_encode_user(struct svc_rqst *rqstp, kuid_t user, __be32 **p, int *buflen) -{ - return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, user, INVALID_GID, - p, buflen); -} - -static inline __be32 -nfsd4_encode_group(struct svc_rqst *rqstp, kgid_t group, __be32 **p, int *buflen) -{ - return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, INVALID_UID, group, - p, buflen); -} - static inline __be32 nfsd4_encode_aclname(struct svc_rqst *rqstp, struct nfs4_ace *ace, __be32 **p, int 
*buflen) { - kuid_t uid = INVALID_UID; - kgid_t gid = INVALID_GID; - - if (ace->whotype == NFS4_ACL_WHO_NAMED) { - if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) - gid = ace->who_gid; - else - uid = ace->who_uid; - } - return nfsd4_encode_name(rqstp, ace->whotype, uid, gid, p, buflen); + if (ace->whotype != NFS4_ACL_WHO_NAMED) + return nfs4_acl_write_who(ace->whotype, p, buflen); + else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) + return nfsd4_encode_group(rqstp, ace->who_gid, p, buflen); + else + return nfsd4_encode_user(rqstp, ace->who_uid, p, buflen); } #define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \ @@ -2090,7 +2058,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, u32 bmval1 = bmval[1]; u32 bmval2 = bmval[2]; struct kstat stat; - struct svc_fh tempfh; + struct svc_fh *tempfh = NULL; struct kstatfs statfs; int buflen = count << 2; __be32 *attrlenp; @@ -2137,11 +2105,15 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, goto out_nfserr; } if ((bmval0 & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) { - fh_init(&tempfh, NFS4_FHSIZE); - status = fh_compose(&tempfh, exp, dentry, NULL); + tempfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL); + status = nfserr_jukebox; + if (!tempfh) + goto out; + fh_init(tempfh, NFS4_FHSIZE); + status = fh_compose(tempfh, exp, dentry, NULL); if (status) goto out; - fhp = &tempfh; + fhp = tempfh; } if (bmval0 & (FATTR4_WORD0_ACL | FATTR4_WORD0_ACLSUPPORT | FATTR4_WORD0_SUPPORTED_ATTRS)) { @@ -2222,8 +2194,10 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, if ((buflen -= 4) < 0) goto out_resource; dummy = nfs4_file_type(stat.mode); - if (dummy == NF4BAD) - goto out_serverfault; + if (dummy == NF4BAD) { + status = nfserr_serverfault; + goto out; + } WRITE32(dummy); } if (bmval0 & FATTR4_WORD0_FH_EXPIRE_TYPE) { @@ -2317,8 +2291,6 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, WRITE32(ace->flag); WRITE32(ace->access_mask & 
NFS4_ACE_MASK_ALL); status = nfsd4_encode_aclname(rqstp, ace, &p, &buflen); - if (status == nfserr_resource) - goto out_resource; if (status) goto out; } @@ -2379,8 +2351,6 @@ out_acl: } if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) { status = nfsd4_encode_fs_locations(rqstp, exp, &p, &buflen); - if (status == nfserr_resource) - goto out_resource; if (status) goto out; } @@ -2431,15 +2401,11 @@ out_acl: } if (bmval1 & FATTR4_WORD1_OWNER) { status = nfsd4_encode_user(rqstp, stat.uid, &p, &buflen); - if (status == nfserr_resource) - goto out_resource; if (status) goto out; } if (bmval1 & FATTR4_WORD1_OWNER_GROUP) { status = nfsd4_encode_group(rqstp, stat.gid, &p, &buflen); - if (status == nfserr_resource) - goto out_resource; if (status) goto out; } @@ -2533,8 +2499,8 @@ out: security_release_secctx(context, contextlen); #endif /* CONFIG_NFSD_V4_SECURITY_LABEL */ kfree(acl); - if (fhp == &tempfh) - fh_put(&tempfh); + if (tempfh) + fh_put(tempfh); return status; out_nfserr: status = nfserrno(err); @@ -2542,9 +2508,6 @@ out_nfserr: out_resource: status = nfserr_resource; goto out; -out_serverfault: - status = nfserr_serverfault; - goto out; } static inline int attributes_need_mount(u32 *bmval) @@ -2621,17 +2584,14 @@ out_put: static __be32 * nfsd4_encode_rdattr_error(__be32 *p, int buflen, __be32 nfserr) { - __be32 *attrlenp; - if (buflen < 6) return NULL; *p++ = htonl(2); *p++ = htonl(FATTR4_WORD0_RDATTR_ERROR); /* bmval0 */ *p++ = htonl(0); /* bmval1 */ - attrlenp = p++; + *p++ = htonl(4); /* attribute length */ *p++ = nfserr; /* no htonl */ - *attrlenp = htonl((char *)p - (char *)attrlenp - 4); return p; } @@ -3244,7 +3204,7 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp, if (rpcauth_get_gssinfo(pf, &info) == 0) { supported++; - RESERVE_SPACE(4 + 4 + info.oid.len + 4 + 4); + RESERVE_SPACE(4 + 4 + XDR_LEN(info.oid.len) + 4 + 4); WRITE32(RPC_AUTH_GSS); WRITE32(info.oid.len); WRITEMEM(info.oid.data, info.oid.len); @@ -3379,35 +3339,43 @@ 
nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, 8 /* eir_clientid */ + 4 /* eir_sequenceid */ + 4 /* eir_flags */ + - 4 /* spr_how */ + - 8 /* spo_must_enforce, spo_must_allow */ + - 8 /* so_minor_id */ + - 4 /* so_major_id.len */ + - (XDR_QUADLEN(major_id_sz) * 4) + - 4 /* eir_server_scope.len */ + - (XDR_QUADLEN(server_scope_sz) * 4) + - 4 /* eir_server_impl_id.count (0) */); + 4 /* spr_how */); WRITEMEM(&exid->clientid, 8); WRITE32(exid->seqid); WRITE32(exid->flags); WRITE32(exid->spa_how); + ADJUST_ARGS(); + switch (exid->spa_how) { case SP4_NONE: break; case SP4_MACH_CRED: + /* spo_must_enforce, spo_must_allow */ + RESERVE_SPACE(16); + /* spo_must_enforce bitmap: */ WRITE32(2); WRITE32(nfs4_minimal_spo_must_enforce[0]); WRITE32(nfs4_minimal_spo_must_enforce[1]); /* empty spo_must_allow bitmap: */ WRITE32(0); + + ADJUST_ARGS(); break; default: WARN_ON_ONCE(1); } + RESERVE_SPACE( + 8 /* so_minor_id */ + + 4 /* so_major_id.len */ + + (XDR_QUADLEN(major_id_sz) * 4) + + 4 /* eir_server_scope.len */ + + (XDR_QUADLEN(server_scope_sz) * 4) + + 4 /* eir_server_impl_id.count (0) */); + /* The server_owner struct */ WRITE64(minor_id); /* Minor id */ /* major id */ @@ -3474,28 +3442,6 @@ nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr, } static __be32 -nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_destroy_session *destroy_session) -{ - return nfserr; -} - -static __be32 -nfsd4_encode_free_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_free_stateid *free_stateid) -{ - __be32 *p; - - if (nfserr) - return nfserr; - - RESERVE_SPACE(4); - *p++ = nfserr; - ADJUST_ARGS(); - return nfserr; -} - -static __be32 nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_sequence *seq) { @@ -3593,8 +3539,8 @@ static nfsd4_enc nfsd4_enc_ops[] = { [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session, [OP_EXCHANGE_ID] = 
(nfsd4_enc)nfsd4_encode_exchange_id, [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session, - [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, - [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_free_stateid, + [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop, + [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index b6af150c96b8..f8f060ffbf4f 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -132,13 +132,6 @@ nfsd_reply_cache_alloc(void) } static void -nfsd_reply_cache_unhash(struct svc_cacherep *rp) -{ - hlist_del_init(&rp->c_hash); - list_del_init(&rp->c_lru); -} - -static void nfsd_reply_cache_free_locked(struct svc_cacherep *rp) { if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) { @@ -416,22 +409,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) /* * Since the common case is a cache miss followed by an insert, - * preallocate an entry. First, try to reuse the first entry on the LRU - * if it works, then go ahead and prune the LRU list. + * preallocate an entry. */ - spin_lock(&cache_lock); - if (!list_empty(&lru_head)) { - rp = list_first_entry(&lru_head, struct svc_cacherep, c_lru); - if (nfsd_cache_entry_expired(rp) || - num_drc_entries >= max_drc_entries) { - nfsd_reply_cache_unhash(rp); - prune_cache_entries(); - goto search_cache; - } - } - - /* No expired ones available, allocate a new one. 
*/ - spin_unlock(&cache_lock); rp = nfsd_reply_cache_alloc(); spin_lock(&cache_lock); if (likely(rp)) { @@ -439,7 +418,9 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) drc_mem_usage += sizeof(*rp); } -search_cache: + /* go ahead and prune the cache */ + prune_cache_entries(); + found = nfsd_cache_search(rqstp, csum); if (found) { if (likely(rp)) @@ -453,15 +434,6 @@ search_cache: goto out; } - /* - * We're keeping the one we just allocated. Are we now over the - * limit? Prune one off the tip of the LRU in trade for the one we - * just allocated if so. - */ - if (num_drc_entries >= max_drc_entries) - nfsd_reply_cache_free_locked(list_first_entry(&lru_head, - struct svc_cacherep, c_lru)); - nfsdstats.rcmisses++; rqstp->rq_cacherep = rp; rp->c_state = RC_INPROG; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 760c85a6f534..9a4a5f9e7468 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -241,6 +241,15 @@ static void nfsd_shutdown_generic(void) nfsd_racache_shutdown(); } +static bool nfsd_needs_lockd(void) +{ +#if defined(CONFIG_NFSD_V3) + return (nfsd_versions[2] != NULL) || (nfsd_versions[3] != NULL); +#else + return (nfsd_versions[2] != NULL); +#endif +} + static int nfsd_startup_net(int nrservs, struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -255,9 +264,14 @@ static int nfsd_startup_net(int nrservs, struct net *net) ret = nfsd_init_socks(net); if (ret) goto out_socks; - ret = lockd_up(net); - if (ret) - goto out_socks; + + if (nfsd_needs_lockd() && !nn->lockd_up) { + ret = lockd_up(net); + if (ret) + goto out_socks; + nn->lockd_up = 1; + } + ret = nfs4_state_start_net(net); if (ret) goto out_lockd; @@ -266,7 +280,10 @@ static int nfsd_startup_net(int nrservs, struct net *net) return 0; out_lockd: - lockd_down(net); + if (nn->lockd_up) { + lockd_down(net); + nn->lockd_up = 0; + } out_socks: nfsd_shutdown_generic(); return ret; @@ -277,7 +294,10 @@ static void nfsd_shutdown_net(struct net *net) struct nfsd_net *nn = 
net_generic(net, nfsd_net_id); nfs4_state_shutdown_net(net); - lockd_down(net); + if (nn->lockd_up) { + lockd_down(net); + nn->lockd_up = 0; + } nn->nfsd_net_up = false; nfsd_shutdown_generic(); } diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 9c769a47ac5a..b17d93214d01 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -152,7 +152,7 @@ encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, type = (stat->mode & S_IFMT); *p++ = htonl(nfs_ftypes[type >> 12]); - *p++ = htonl((u32) stat->mode); + *p++ = htonl((u32) (stat->mode & S_IALLUGO)); *p++ = htonl((u32) stat->nlink); *p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid)); *p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid)); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 1426eb66c8c6..017d3cb5e99b 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -207,7 +207,12 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out_nfserr; } } else { - fh_lock(fhp); + /* + * In the nfsd4_open() case, this may be held across + * subsequent open and delegation acquisition which may + * need to take the child's i_mutex: + */ + fh_lock_nested(fhp, I_MUTEX_PARENT); dentry = lookup_one_len(name, dparent, len); host_err = PTR_ERR(dentry); if (IS_ERR(dentry)) @@ -273,13 +278,6 @@ out: return err; } -static int nfsd_break_lease(struct inode *inode) -{ - if (!S_ISREG(inode->i_mode)) - return 0; - return break_lease(inode, O_WRONLY | O_NONBLOCK); -} - /* * Commit metadata changes to stable storage. 
*/ @@ -348,8 +346,7 @@ nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap) /* Revoke setuid/setgid on chown */ if (!S_ISDIR(inode->i_mode) && - (((iap->ia_valid & ATTR_UID) && !uid_eq(iap->ia_uid, inode->i_uid)) || - ((iap->ia_valid & ATTR_GID) && !gid_eq(iap->ia_gid, inode->i_gid)))) { + ((iap->ia_valid & ATTR_UID) || (iap->ia_valid & ATTR_GID))) { iap->ia_valid |= ATTR_KILL_PRIV; if (iap->ia_valid & ATTR_MODE) { /* we're setting mode too, just clear the s*id bits */ @@ -449,16 +446,10 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, goto out_put_write_access; } - host_err = nfsd_break_lease(inode); - if (host_err) - goto out_put_write_access_nfserror; - fh_lock(fhp); host_err = notify_change(dentry, iap, NULL); fh_unlock(fhp); -out_put_write_access_nfserror: - err = nfserrno(host_err); out_put_write_access: if (size_change) put_write_access(inode); @@ -1609,11 +1600,6 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, err = nfserr_noent; if (!dold->d_inode) goto out_dput; - host_err = nfsd_break_lease(dold->d_inode); - if (host_err) { - err = nfserrno(host_err); - goto out_dput; - } host_err = vfs_link(dold, dirp, dnew, NULL); if (!host_err) { err = nfserrno(commit_metadata(ffhp)); @@ -1707,14 +1693,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry) goto out_dput_new; - host_err = nfsd_break_lease(odentry->d_inode); - if (host_err) - goto out_dput_new; - if (ndentry->d_inode) { - host_err = nfsd_break_lease(ndentry->d_inode); - if (host_err) - goto out_dput_new; - } host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL); if (!host_err) { host_err = commit_metadata(tfhp); @@ -1784,16 +1762,12 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (!type) type = rdentry->d_inode->i_mode & S_IFMT; - host_err = nfsd_break_lease(rdentry->d_inode); - if (host_err) - goto out_put; if (type != S_IFDIR) 
host_err = vfs_unlink(dirp, rdentry, NULL); else host_err = vfs_rmdir(dirp, rdentry); if (!host_err) host_err = commit_metadata(fhp); -out_put: dput(rdentry); out_nfserr: diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 1bc1d440a1a5..fbe90bdb2214 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -86,8 +86,6 @@ __be32 nfsd_link(struct svc_rqst *, struct svc_fh *, __be32 nfsd_rename(struct svc_rqst *, struct svc_fh *, char *, int, struct svc_fh *, char *, int); -__be32 nfsd_remove(struct svc_rqst *, - struct svc_fh *, char *, int); __be32 nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type, char *name, int len); __be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *, diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index b6d5542a4ac8..335e04aaf7db 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -174,6 +174,9 @@ struct nfsd3_linkres { struct nfsd3_readdirres { __be32 status; struct svc_fh fh; + /* Just to save kmalloc on every readdirplus entry (svc_fh is a + * little large for the stack): */ + struct svc_fh scratch; int count; __be32 verf[2]; diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index b3ed6446ed8e..d278a0d03496 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -228,7 +228,7 @@ struct nfsd4_open { u32 op_create; /* request */ u32 op_createmode; /* request */ u32 op_bmval[3]; /* request */ - struct iattr iattr; /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */ + struct iattr op_iattr; /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */ nfs4_verifier op_verf __attribute__((aligned(32))); /* EXCLUSIVE4 */ clientid_t op_clientid; /* request */ @@ -250,7 +250,6 @@ struct nfsd4_open { struct nfs4_acl *op_acl; struct xdr_netobj op_label; }; -#define op_iattr iattr struct nfsd4_open_confirm { stateid_t oc_req_stateid /* request */; @@ -374,7 +373,6 @@ struct nfsd4_test_stateid { struct nfsd4_free_stateid { stateid_t fr_stateid; /* request */ - __be32 fr_status; /* response */ }; /* also used for NVERIFY */ diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 
2d8be51f90dc..dc3a9efdaab8 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -416,7 +416,8 @@ static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start, } if (likely(bio)) { bio->bi_bdev = nilfs->ns_bdev; - bio->bi_sector = start << (nilfs->ns_blocksize_bits - 9); + bio->bi_iter.bi_sector = + start << (nilfs->ns_blocksize_bits - 9); } return bio; } diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 0b9ff4395e6a..abc8cbcfe90e 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -86,7 +86,7 @@ static int dnotify_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, void *data, int data_type, - const unsigned char *file_name) + const unsigned char *file_name, u32 cookie) { struct dnotify_mark *dn_mark; struct dnotify_struct *dn; diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 58772623f02a..dc638f786d5c 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -16,12 +16,6 @@ static bool should_merge(struct fsnotify_event *old_fsn, { struct fanotify_event_info *old, *new; -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - /* dont merge two permission events */ - if ((old_fsn->mask & FAN_ALL_PERM_EVENTS) && - (new_fsn->mask & FAN_ALL_PERM_EVENTS)) - return false; -#endif pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn); old = FANOTIFY_E(old_fsn); new = FANOTIFY_E(new_fsn); @@ -34,14 +28,23 @@ static bool should_merge(struct fsnotify_event *old_fsn, } /* and the list better be locked by something too! 
*/ -static struct fsnotify_event *fanotify_merge(struct list_head *list, - struct fsnotify_event *event) +static int fanotify_merge(struct list_head *list, struct fsnotify_event *event) { struct fsnotify_event *test_event; bool do_merge = false; pr_debug("%s: list=%p event=%p\n", __func__, list, event); +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + /* + * Don't merge a permission event with any other event so that we know + * the event structure we have created in fanotify_handle_event() is the + * one we should check for permission response. + */ + if (event->mask & FAN_ALL_PERM_EVENTS) + return 0; +#endif + list_for_each_entry_reverse(test_event, list, list) { if (should_merge(test_event, event)) { do_merge = true; @@ -50,10 +53,10 @@ static struct fsnotify_event *fanotify_merge(struct list_head *list, } if (!do_merge) - return NULL; + return 0; test_event->mask |= event->mask; - return test_event; + return 1; } #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS @@ -144,12 +147,11 @@ static int fanotify_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *fanotify_mark, u32 mask, void *data, int data_type, - const unsigned char *file_name) + const unsigned char *file_name, u32 cookie) { int ret = 0; struct fanotify_event_info *event; struct fsnotify_event *fsn_event; - struct fsnotify_event *notify_fsn_event; BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); @@ -188,21 +190,21 @@ static int fanotify_handle_event(struct fsnotify_group *group, event->response = 0; #endif - notify_fsn_event = fsnotify_add_notify_event(group, fsn_event, - fanotify_merge); - if (notify_fsn_event) { + ret = fsnotify_add_notify_event(group, fsn_event, fanotify_merge); + if (ret) { + /* Permission events shouldn't be merged */ + BUG_ON(ret == 1 && mask & FAN_ALL_PERM_EVENTS); /* Our event wasn't used in the end. Free it. 
*/ fsnotify_destroy_event(group, fsn_event); - if (IS_ERR(notify_fsn_event)) - return PTR_ERR(notify_fsn_event); - /* We need to ask about a different events after a merge... */ - event = FANOTIFY_E(notify_fsn_event); - fsn_event = notify_fsn_event; + + return 0; } #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - if (fsn_event->mask & FAN_ALL_PERM_EVENTS) + if (mask & FAN_ALL_PERM_EVENTS) { ret = fanotify_get_response_from_access(group, event); + fsnotify_destroy_event(group, fsn_event); + } #endif return ret; } diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 0e90174a116a..32a2f034fb94 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -4,6 +4,13 @@ extern struct kmem_cache *fanotify_event_cachep; +/* + * Lifetime of the structure differs for normal and permission events. In both + * cases the structure is allocated in fanotify_handle_event(). For normal + * events the structure is freed immediately after reporting it to userspace. + * For permission events we free it only after we receive response from + * userspace. 
+ */ struct fanotify_event_info { struct fsnotify_event fse; /* diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 1fd66abe5740..287a22c04149 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -319,7 +319,12 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, if (IS_ERR(kevent)) break; ret = copy_event_to_user(group, kevent, buf); - fsnotify_destroy_event(group, kevent); + /* + * Permission events get destroyed after we + * receive response + */ + if (!(kevent->mask & FAN_ALL_PERM_EVENTS)) + fsnotify_destroy_event(group, kevent); if (ret < 0) break; buf += ret; @@ -693,6 +698,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) struct fsnotify_group *group; int f_flags, fd; struct user_struct *user; + struct fanotify_event_info *oevent; pr_debug("%s: flags=%d event_f_flags=%d\n", __func__, flags, event_f_flags); @@ -725,8 +731,20 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) group->fanotify_data.user = user; atomic_inc(&user->fanotify_listeners); + oevent = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL); + if (unlikely(!oevent)) { + fd = -ENOMEM; + goto out_destroy_group; + } + group->overflow_event = &oevent->fse; + fsnotify_init_event(group->overflow_event, NULL, FS_Q_OVERFLOW); + oevent->tgid = get_pid(task_tgid(current)); + oevent->path.mnt = NULL; + oevent->path.dentry = NULL; + group->fanotify_data.f_flags = event_f_flags; #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + oevent->response = 0; mutex_init(&group->fanotify_data.access_mutex); init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 1d4e1ea2f37c..9d3e9c50066a 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -179,7 +179,7 @@ static int send_to_group(struct inode *to_tell, return 
group->ops->handle_event(group, to_tell, inode_mark, vfsmount_mark, mask, data, data_is, - file_name); + file_name, cookie); } /* diff --git a/fs/notify/group.c b/fs/notify/group.c index ee674fe2cec7..ad1995980456 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -55,6 +55,13 @@ void fsnotify_destroy_group(struct fsnotify_group *group) /* clear the notification queue of all events */ fsnotify_flush_notify(group); + /* + * Destroy overflow event (we cannot use fsnotify_destroy_event() as + * that deliberately ignores overflow events. + */ + if (group->overflow_event) + group->ops->free_event(group->overflow_event); + fsnotify_put_group(group); } @@ -99,7 +106,6 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) INIT_LIST_HEAD(&group->marks_list); group->ops = ops; - fsnotify_init_event(&group->overflow_event, NULL, FS_Q_OVERFLOW); return group; } diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index 485eef3f4407..ed855ef6f077 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -27,6 +27,6 @@ extern int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, void *data, int data_type, - const unsigned char *file_name); + const unsigned char *file_name, u32 cookie); extern const struct fsnotify_ops inotify_fsnotify_ops; diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index aad1a35e9af1..43ab1e1a07a2 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -53,15 +53,13 @@ static bool event_compare(struct fsnotify_event *old_fsn, return false; } -static struct fsnotify_event *inotify_merge(struct list_head *list, - struct fsnotify_event *event) +static int inotify_merge(struct list_head *list, + struct fsnotify_event *event) { struct fsnotify_event *last_event; last_event = list_entry(list->prev, struct fsnotify_event, list); - if 
(!event_compare(last_event, event)) - return NULL; - return last_event; + return event_compare(last_event, event); } int inotify_handle_event(struct fsnotify_group *group, @@ -69,13 +67,12 @@ int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, void *data, int data_type, - const unsigned char *file_name) + const unsigned char *file_name, u32 cookie) { struct inotify_inode_mark *i_mark; struct inotify_event_info *event; - struct fsnotify_event *added_event; struct fsnotify_event *fsn_event; - int ret = 0; + int ret; int len = 0; int alloc_len = sizeof(struct inotify_event_info); @@ -106,22 +103,21 @@ int inotify_handle_event(struct fsnotify_group *group, fsn_event = &event->fse; fsnotify_init_event(fsn_event, inode, mask); event->wd = i_mark->wd; + event->sync_cookie = cookie; event->name_len = len; if (len) strcpy(event->name, file_name); - added_event = fsnotify_add_notify_event(group, fsn_event, inotify_merge); - if (added_event) { + ret = fsnotify_add_notify_event(group, fsn_event, inotify_merge); + if (ret) { /* Our event wasn't used in the end. Free it. 
*/ fsnotify_destroy_event(group, fsn_event); - if (IS_ERR(added_event)) - ret = PTR_ERR(added_event); } if (inode_mark->mask & IN_ONESHOT) fsnotify_destroy_mark(inode_mark, group); - return ret; + return 0; } static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group) diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 497395c8274b..78a2ca3966c3 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -495,7 +495,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, /* Queue ignore event for the watch */ inotify_handle_event(group, NULL, fsn_mark, NULL, FS_IN_IGNORED, - NULL, FSNOTIFY_EVENT_NONE, NULL); + NULL, FSNOTIFY_EVENT_NONE, NULL, 0); i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); /* remove this mark from the idr */ @@ -633,11 +633,23 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod static struct fsnotify_group *inotify_new_group(unsigned int max_events) { struct fsnotify_group *group; + struct inotify_event_info *oevent; group = fsnotify_alloc_group(&inotify_fsnotify_ops); if (IS_ERR(group)) return group; + oevent = kmalloc(sizeof(struct inotify_event_info), GFP_KERNEL); + if (unlikely(!oevent)) { + fsnotify_destroy_group(group); + return ERR_PTR(-ENOMEM); + } + group->overflow_event = &oevent->fse; + fsnotify_init_event(group->overflow_event, NULL, FS_Q_OVERFLOW); + oevent->wd = -1; + oevent->sync_cookie = 0; + oevent->name_len = 0; + group->max_events = max_events; spin_lock_init(&group->inotify_data.idr_lock); diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 952237b8e2d2..1e58402171a5 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -79,15 +79,16 @@ void fsnotify_destroy_event(struct fsnotify_group *group, /* * Add an event to the group notification queue. The group can later pull this - * event off the queue to deal with. 
If the event is successfully added to the - * group's notification queue, a reference is taken on event. + * event off the queue to deal with. The function returns 0 if the event was + * added to the queue, 1 if the event was merged with some other queued event, + * 2 if the queue of events has overflown. */ -struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, - struct fsnotify_event *event, - struct fsnotify_event *(*merge)(struct list_head *, - struct fsnotify_event *)) +int fsnotify_add_notify_event(struct fsnotify_group *group, + struct fsnotify_event *event, + int (*merge)(struct list_head *, + struct fsnotify_event *)) { - struct fsnotify_event *return_event = NULL; + int ret = 0; struct list_head *list = &group->notification_list; pr_debug("%s: group=%p event=%p\n", __func__, group, event); @@ -95,27 +96,32 @@ struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, mutex_lock(&group->notification_mutex); if (group->q_len >= group->max_events) { + ret = 2; /* Queue overflow event only if it isn't already queued */ - if (list_empty(&group->overflow_event.list)) - event = &group->overflow_event; - return_event = event; + if (!list_empty(&group->overflow_event->list)) { + mutex_unlock(&group->notification_mutex); + return ret; + } + event = group->overflow_event; + goto queue; } if (!list_empty(list) && merge) { - return_event = merge(list, event); - if (return_event) { + ret = merge(list, event); + if (ret) { mutex_unlock(&group->notification_mutex); - return return_event; + return ret; } } +queue: group->q_len++; list_add_tail(&event->list, list); mutex_unlock(&group->notification_mutex); wake_up(&group->notification_waitq); kill_fasync(&group->fsn_fa, SIGIO, POLL_IN); - return return_event; + return ret; } /* @@ -132,7 +138,11 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group event = list_first_entry(&group->notification_list, struct fsnotify_event, list); - 
list_del(&event->list); + /* + * We need to init list head for the case of overflow event so that + * check in fsnotify_add_notify_events() works + */ + list_del_init(&event->list); group->q_len--; return event; diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index ea4ba9daeb47..db9bd8a31725 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -2134,7 +2134,7 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); if (ret > 0) { - int err = generic_write_sync(file, pos, ret); + int err = generic_write_sync(file, iocb->ki_pos - ret, ret); if (err < 0) ret = err; } diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 8750ae1b8636..e2edff38be52 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -4742,6 +4742,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, enum ocfs2_alloc_restarted *reason_ret) { int status = 0, err = 0; + int need_free = 0; int free_extents; enum ocfs2_alloc_restarted reason = RESTART_NONE; u32 bit_off, num_bits; @@ -4796,7 +4797,8 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); - goto leave; + need_free = 1; + goto bail; } block = ocfs2_clusters_to_blocks(osb->sb, bit_off); @@ -4807,7 +4809,8 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, num_bits, flags, meta_ac); if (status < 0) { mlog_errno(status); - goto leave; + need_free = 1; + goto bail; } ocfs2_journal_dirty(handle, et->et_root_bh); @@ -4821,6 +4824,19 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, reason = RESTART_TRANS; } +bail: + if (need_free) { + if (data_ac->ac_which == OCFS2_AC_USE_LOCAL) + ocfs2_free_local_alloc_bits(osb, handle, data_ac, + bit_off, num_bits); + else + ocfs2_free_clusters(handle, + data_ac->ac_inode, + data_ac->ac_bh, + ocfs2_clusters_to_blocks(osb->sb, bit_off), + num_bits); + } + leave: if (reason_ret) *reason_ret = reason; @@ -6805,6 +6821,8 @@ 
int ocfs2_convert_inline_data_to_extents(struct inode *inode, struct buffer_head *di_bh) { int ret, i, has_data, num_pages = 0; + int need_free = 0; + u32 bit_off, num; handle_t *handle; u64 uninitialized_var(block); struct ocfs2_inode_info *oi = OCFS2_I(inode); @@ -6850,7 +6868,6 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, } if (has_data) { - u32 bit_off, num; unsigned int page_end; u64 phys; @@ -6886,6 +6903,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages); if (ret) { mlog_errno(ret); + need_free = 1; goto out_commit; } @@ -6896,6 +6914,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, ret = ocfs2_read_inline_data(inode, pages[0], di_bh); if (ret) { mlog_errno(ret); + need_free = 1; goto out_commit; } @@ -6927,6 +6946,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL); if (ret) { mlog_errno(ret); + need_free = 1; goto out_commit; } @@ -6938,6 +6958,18 @@ out_commit: dquot_free_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, 1)); + if (need_free) { + if (data_ac->ac_which == OCFS2_AC_USE_LOCAL) + ocfs2_free_local_alloc_bits(osb, handle, data_ac, + bit_off, num); + else + ocfs2_free_clusters(handle, + data_ac->ac_inode, + data_ac->ac_bh, + ocfs2_clusters_to_blocks(osb->sb, bit_off), + num); + } + ocfs2_commit_trans(osb, handle); out_unlock: @@ -7126,7 +7158,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, if (end > i_size_read(inode)) end = i_size_read(inode); - BUG_ON(start >= end); + BUG_ON(start > end); if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) || !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) || diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 73920ffda05b..bf482dfed14f 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -413,7 +413,7 @@ static 
struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, } /* Must put everything in 512 byte sectors for the bio... */ - bio->bi_sector = (reg->hr_start_block + cs) << (bits - 9); + bio->bi_iter.bi_sector = (reg->hr_start_block + cs) << (bits - 9); bio->bi_bdev = reg->hr_bdev; bio->bi_private = wc; bio->bi_end_io = o2hb_bio_end_io; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index d77d71ead8d1..8450262bcf2a 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -185,6 +185,9 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end, file->f_path.dentry->d_name.name, (unsigned long long)datasync); + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return -EROFS; + err = filemap_write_and_wait_range(inode->i_mapping, start, end); if (err) return err; @@ -474,11 +477,6 @@ static int ocfs2_truncate_file(struct inode *inode, goto bail; } - /* lets handle the simple truncate cases before doing any more - * cluster locking. */ - if (new_i_size == le64_to_cpu(fe->i_size)) - goto bail; - down_write(&OCFS2_I(inode)->ip_alloc_sem); ocfs2_resv_discard(&osb->osb_la_resmap, @@ -718,7 +716,8 @@ leave: * While a write will already be ordering the data, a truncate will not. * Thus, we need to explicitly order the zeroed pages. 
*/ -static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode) +static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode, + struct buffer_head *di_bh) { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); handle_t *handle = NULL; @@ -735,7 +734,14 @@ static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode) } ret = ocfs2_jbd2_file_inode(handle, inode); - if (ret < 0) + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) mlog_errno(ret); out: @@ -751,7 +757,7 @@ out: * to be too fragile to do exactly what we need without us having to * worry about recursive locking in ->write_begin() and ->write_end(). */ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, - u64 abs_to) + u64 abs_to, struct buffer_head *di_bh) { struct address_space *mapping = inode->i_mapping; struct page *page; @@ -759,6 +765,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, handle_t *handle = NULL; int ret = 0; unsigned zero_from, zero_to, block_start, block_end; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; BUG_ON(abs_from >= abs_to); BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); @@ -801,7 +808,8 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, } if (!handle) { - handle = ocfs2_zero_start_ordered_transaction(inode); + handle = ocfs2_zero_start_ordered_transaction(inode, + di_bh); if (IS_ERR(handle)) { ret = PTR_ERR(handle); handle = NULL; @@ -818,8 +826,22 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, ret = 0; } - if (handle) + if (handle) { + /* + * fs-writeback will release the dirty pages without page lock + * whose offset are over inode size, the release happens at + * block_write_full_page_endio(). 
+ */ + i_size_write(inode, abs_to); + inode->i_blocks = ocfs2_inode_sector_count(inode); + di->i_size = cpu_to_le64((u64)i_size_read(inode)); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + di->i_mtime_nsec = di->i_ctime_nsec; + ocfs2_journal_dirty(handle, di_bh); ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); + } out_unlock: unlock_page(page); @@ -915,7 +937,7 @@ out: * has made sure that the entire range needs zeroing. */ static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start, - u64 range_end) + u64 range_end, struct buffer_head *di_bh) { int rc = 0; u64 next_pos; @@ -931,7 +953,7 @@ static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start, next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE; if (next_pos > range_end) next_pos = range_end; - rc = ocfs2_write_zero_page(inode, zero_pos, next_pos); + rc = ocfs2_write_zero_page(inode, zero_pos, next_pos, di_bh); if (rc < 0) { mlog_errno(rc); break; @@ -977,7 +999,7 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, range_end = zero_to_size; ret = ocfs2_zero_extend_range(inode, range_start, - range_end); + range_end, di_bh); if (ret) { mlog_errno(ret); break; @@ -1145,14 +1167,14 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) goto bail_unlock_rw; } - if (size_change && attr->ia_size != i_size_read(inode)) { + if (size_change) { status = inode_newsize_ok(inode, attr->ia_size); if (status) goto bail_unlock; inode_dio_wait(inode); - if (i_size_read(inode) > attr->ia_size) { + if (i_size_read(inode) >= attr->ia_size) { if (ocfs2_should_order_data(inode)) { status = ocfs2_begin_ordered_truncate(inode, attr->ia_size); diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index cd5496b7a0a3..044013455621 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -781,6 +781,48 @@ bail: return status; } +int 
ocfs2_free_local_alloc_bits(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 bit_off, + u32 num_bits) +{ + int status, start; + u32 clear_bits; + struct inode *local_alloc_inode; + void *bitmap; + struct ocfs2_dinode *alloc; + struct ocfs2_local_alloc *la; + + BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL); + + local_alloc_inode = ac->ac_inode; + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + la = OCFS2_LOCAL_ALLOC(alloc); + + bitmap = la->la_bitmap; + start = bit_off - le32_to_cpu(la->la_bm_off); + clear_bits = num_bits; + + status = ocfs2_journal_access_di(handle, + INODE_CACHE(local_alloc_inode), + osb->local_alloc_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + while (clear_bits--) + ocfs2_clear_bit(start++, bitmap); + + le32_add_cpu(&alloc->id1.bitmap1.i_used, -num_bits); + ocfs2_journal_dirty(handle, osb->local_alloc_bh); + +bail: + return status; +} + static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc) { u32 count; diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h index 1be9b5864460..44a7d1fb2dec 100644 --- a/fs/ocfs2/localalloc.h +++ b/fs/ocfs2/localalloc.h @@ -55,6 +55,12 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, u32 *bit_off, u32 *num_bits); +int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_alloc_context *ac, + u32 bit_off, + u32 num_bits); + void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb, unsigned int num_clusters); void ocfs2_la_enable_worker(struct work_struct *work); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index f4d609be9400..3683643f3f0e 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -664,6 +664,7 @@ static int ocfs2_link(struct dentry *old_dentry, struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); struct ocfs2_dir_lookup_result lookup = { NULL, }; sigset_t oldset; + u64 old_de_ino; trace_ocfs2_link((unsigned long 
long)OCFS2_I(inode)->ip_blkno, old_dentry->d_name.len, old_dentry->d_name.name, @@ -686,6 +687,22 @@ static int ocfs2_link(struct dentry *old_dentry, goto out; } + err = ocfs2_lookup_ino_from_name(dir, old_dentry->d_name.name, + old_dentry->d_name.len, &old_de_ino); + if (err) { + err = -ENOENT; + goto out; + } + + /* + * Check whether another node removed the source inode while we + * were in the vfs. + */ + if (old_de_ino != OCFS2_I(inode)->ip_blkno) { + err = -ENOENT; + goto out; + } + err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, dentry->d_name.len); if (err) diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index aaa50611ec66..d7b5108789e2 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -717,6 +717,12 @@ static int ocfs2_release_dquot(struct dquot *dquot) */ if (status < 0) mlog_errno(status); + /* + * Clear dq_off so that we search for the structure in quota file next + * time we acquire it. The structure might be deleted and reallocated + * elsewhere by another node while our dquot structure is on freelist. + */ + dquot->dq_off = 0; clear_bit(DQ_ACTIVE_B, &dquot->dq_flags); out_trans: ocfs2_commit_trans(osb, handle); @@ -756,16 +762,17 @@ static int ocfs2_acquire_dquot(struct dquot *dquot) status = ocfs2_lock_global_qf(info, 1); if (status < 0) goto out; - if (!test_bit(DQ_READ_B, &dquot->dq_flags)) { - status = ocfs2_qinfo_lock(info, 0); - if (status < 0) - goto out_dq; - status = qtree_read_dquot(&info->dqi_gi, dquot); - ocfs2_qinfo_unlock(info, 0); - if (status < 0) - goto out_dq; - } - set_bit(DQ_READ_B, &dquot->dq_flags); + status = ocfs2_qinfo_lock(info, 0); + if (status < 0) + goto out_dq; + /* + * We always want to read dquot structure from disk because we don't + * know what happened with it while it was on freelist. 
+ */ + status = qtree_read_dquot(&info->dqi_gi, dquot); + ocfs2_qinfo_unlock(info, 0); + if (status < 0) + goto out_dq; OCFS2_DQUOT(dquot)->dq_use_count++; OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace; diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 2e4344be3b96..2001862bf2b1 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -1303,10 +1303,6 @@ int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot) ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh); out: - /* Clear the read bit so that next time someone uses this - * dquot he reads fresh info from disk and allocates local - * dquot structure */ - clear_bit(DQ_READ_B, &dquot->dq_flags); return status; } diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 38bae5a0ea25..11c54fd51e16 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -521,8 +521,11 @@ posix_acl_chmod(struct inode *inode, umode_t mode) return -EOPNOTSUPP; acl = get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR_OR_NULL(acl)) + if (IS_ERR_OR_NULL(acl)) { + if (acl == ERR_PTR(-EOPNOTSUPP)) + return 0; return PTR_ERR(acl); + } ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); if (ret) @@ -544,14 +547,15 @@ posix_acl_create(struct inode *dir, umode_t *mode, goto no_acl; p = get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(p)) + if (IS_ERR(p)) { + if (p == ERR_PTR(-EOPNOTSUPP)) + goto apply_umask; return PTR_ERR(p); - - if (!p) { - *mode &= ~current_umask(); - goto no_acl; } + if (!p) + goto apply_umask; + *acl = posix_acl_clone(p, GFP_NOFS); if (!*acl) return -ENOMEM; @@ -575,6 +579,8 @@ posix_acl_create(struct inode *dir, umode_t *mode, } return 0; +apply_umask: + *mode &= ~current_umask(); no_acl: *default_acl = NULL; *acl = NULL; diff --git a/fs/proc/page.c b/fs/proc/page.c index 02174a610315..e647c55275d9 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -121,9 +121,8 @@ u64 stable_page_flags(struct page *page) * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon * to make 
sure a given page is a thp, not a non-huge compound page. */ - else if (PageTransCompound(page) && - (PageLRU(compound_trans_head(page)) || - PageAnon(compound_trans_head(page)))) + else if (PageTransCompound(page) && (PageLRU(compound_head(page)) || + PageAnon(compound_head(page)))) u |= 1 << KPF_THP; /* diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 2ca7ba047f04..88d4585b30f1 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -468,17 +468,24 @@ static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr) return rc; } nhdr_ptr = notes_section; - while (real_sz < max_sz) { - if (nhdr_ptr->n_namesz == 0) - break; + while (nhdr_ptr->n_namesz != 0) { sz = sizeof(Elf64_Nhdr) + ((nhdr_ptr->n_namesz + 3) & ~3) + ((nhdr_ptr->n_descsz + 3) & ~3); + if ((real_sz + sz) > max_sz) { + pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n", + nhdr_ptr->n_namesz, nhdr_ptr->n_descsz); + break; + } real_sz += sz; nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz); } kfree(notes_section); phdr_ptr->p_memsz = real_sz; + if (real_sz == 0) { + pr_warn("Warning: Zero PT_NOTE entries found\n"); + return -EINVAL; + } } return 0; @@ -648,17 +655,24 @@ static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr) return rc; } nhdr_ptr = notes_section; - while (real_sz < max_sz) { - if (nhdr_ptr->n_namesz == 0) - break; + while (nhdr_ptr->n_namesz != 0) { sz = sizeof(Elf32_Nhdr) + ((nhdr_ptr->n_namesz + 3) & ~3) + ((nhdr_ptr->n_descsz + 3) & ~3); + if ((real_sz + sz) > max_sz) { + pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n", + nhdr_ptr->n_namesz, nhdr_ptr->n_descsz); + break; + } real_sz += sz; nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz); } kfree(notes_section); phdr_ptr->p_memsz = real_sz; + if (real_sz == 0) { + pr_warn("Warning: Zero PT_NOTE entries found\n"); + return -EINVAL; + } } return 0; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 
831d49a4111f..cfc8dcc16043 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -581,9 +581,17 @@ int dquot_scan_active(struct super_block *sb, dqstats_inc(DQST_LOOKUPS); dqput(old_dquot); old_dquot = dquot; - ret = fn(dquot, priv); - if (ret < 0) - goto out; + /* + * ->release_dquot() can be racing with us. Our reference + * protects us from new calls to it so just wait for any + * outstanding call and recheck the DQ_ACTIVE_B after that. + */ + wait_on_dquot(dquot); + if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) { + ret = fn(dquot, priv); + if (ret < 0) + goto out; + } spin_lock(&dq_list_lock); /* We are safe to continue now because our dquot could not * be moved out of the inuse list while we hold the reference */ diff --git a/fs/read_write.c b/fs/read_write.c index 1193ffd03565..edc5746a902a 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -964,9 +964,9 @@ out: return ret; } -COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd, +COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, const struct compat_iovec __user *,vec, - unsigned long, vlen) + compat_ulong_t, vlen) { struct fd f = fdget(fd); ssize_t ret; @@ -1001,9 +1001,9 @@ COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, return ret; } -COMPAT_SYSCALL_DEFINE5(preadv, unsigned long, fd, +COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, const struct compat_iovec __user *,vec, - unsigned long, vlen, u32, pos_low, u32, pos_high) + compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return compat_sys_preadv64(fd, vec, vlen, pos); @@ -1031,9 +1031,9 @@ out: return ret; } -COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd, +COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, const struct compat_iovec __user *, vec, - unsigned long, vlen) + compat_ulong_t, vlen) { struct fd f = fdget(fd); ssize_t ret; @@ -1068,9 +1068,9 @@ COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, return ret; } -COMPAT_SYSCALL_DEFINE5(pwritev, unsigned long, fd, 
+COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, const struct compat_iovec __user *,vec, - unsigned long, vlen, u32, pos_low, u32, pos_high) + compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return compat_sys_pwritev64(fd, vec, vlen, pos); diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c index 2b7882b508db..9a3c68cf6026 100644 --- a/fs/reiserfs/do_balan.c +++ b/fs/reiserfs/do_balan.c @@ -324,23 +324,17 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h switch (flag) { case M_INSERT: /* insert item into L[0] */ - if (item_pos == tb->lnum[0] - 1 - && tb->lbytes != -1) { + if (item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) { /* part of new item falls into L[0] */ int new_item_len; int version; - ret_val = - leaf_shift_left(tb, tb->lnum[0] - 1, - -1); + ret_val = leaf_shift_left(tb, tb->lnum[0] - 1, -1); /* Calculate item length to insert to S[0] */ - new_item_len = - ih_item_len(ih) - tb->lbytes; + new_item_len = ih_item_len(ih) - tb->lbytes; /* Calculate and check item length to insert to L[0] */ - put_ih_item_len(ih, - ih_item_len(ih) - - new_item_len); + put_ih_item_len(ih, ih_item_len(ih) - new_item_len); RFALSE(ih_item_len(ih) <= 0, "PAP-12080: there is nothing to insert into L[0]: ih_item_len=%d", @@ -349,30 +343,18 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h /* Insert new item into L[0] */ buffer_info_init_left(tb, &bi); leaf_insert_into_buf(&bi, - n + item_pos - - ret_val, ih, body, - zeros_num > - ih_item_len(ih) ? - ih_item_len(ih) : - zeros_num); + n + item_pos - ret_val, ih, body, + zeros_num > ih_item_len(ih) ? ih_item_len(ih) : zeros_num); version = ih_version(ih); /* Calculate key component, item length and body to insert into S[0] */ - set_le_ih_k_offset(ih, - le_ih_k_offset(ih) + - (tb-> - lbytes << - (is_indirect_le_ih - (ih) ? 
tb->tb_sb-> - s_blocksize_bits - - UNFM_P_SHIFT : - 0))); + set_le_ih_k_offset(ih, le_ih_k_offset(ih) + + (tb-> lbytes << (is_indirect_le_ih(ih) ? tb->tb_sb-> s_blocksize_bits - UNFM_P_SHIFT : 0))); put_ih_item_len(ih, new_item_len); if (tb->lbytes > zeros_num) { - body += - (tb->lbytes - zeros_num); + body += (tb->lbytes - zeros_num); zeros_num = 0; } else zeros_num -= tb->lbytes; @@ -383,15 +365,10 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h } else { /* new item in whole falls into L[0] */ /* Shift lnum[0]-1 items to L[0] */ - ret_val = - leaf_shift_left(tb, tb->lnum[0] - 1, - tb->lbytes); + ret_val = leaf_shift_left(tb, tb->lnum[0] - 1, tb->lbytes); /* Insert new item into L[0] */ buffer_info_init_left(tb, &bi); - leaf_insert_into_buf(&bi, - n + item_pos - - ret_val, ih, body, - zeros_num); + leaf_insert_into_buf(&bi, n + item_pos - ret_val, ih, body, zeros_num); tb->insert_size[0] = 0; zeros_num = 0; } @@ -399,264 +376,117 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h case M_PASTE: /* append item in L[0] */ - if (item_pos == tb->lnum[0] - 1 - && tb->lbytes != -1) { + if (item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) { /* we must shift the part of the appended item */ - if (is_direntry_le_ih - (B_N_PITEM_HEAD(tbS0, item_pos))) { + if (is_direntry_le_ih(B_N_PITEM_HEAD(tbS0, item_pos))) { RFALSE(zeros_num, "PAP-12090: invalid parameter in case of a directory"); /* directory item */ if (tb->lbytes > pos_in_item) { /* new directory entry falls into L[0] */ - struct item_head - *pasted; - int l_pos_in_item = - pos_in_item; + struct item_head *pasted; + int l_pos_in_item = pos_in_item; /* Shift lnum[0] - 1 items in whole. 
Shift lbytes - 1 entries from given directory item */ - ret_val = - leaf_shift_left(tb, - tb-> - lnum - [0], - tb-> - lbytes - - - 1); - if (ret_val - && !item_pos) { - pasted = - B_N_PITEM_HEAD - (tb->L[0], - B_NR_ITEMS - (tb-> - L[0]) - - 1); - l_pos_in_item += - I_ENTRY_COUNT - (pasted) - - (tb-> - lbytes - - 1); + ret_val = leaf_shift_left(tb, tb->lnum[0], tb->lbytes-1); + if (ret_val && !item_pos) { + pasted = B_N_PITEM_HEAD(tb->L[0], B_NR_ITEMS(tb->L[0]) - 1); + l_pos_in_item += I_ENTRY_COUNT(pasted) - (tb->lbytes -1); } /* Append given directory entry to directory item */ buffer_info_init_left(tb, &bi); - leaf_paste_in_buffer - (&bi, - n + item_pos - - ret_val, - l_pos_in_item, - tb->insert_size[0], - body, zeros_num); + leaf_paste_in_buffer(&bi, n + item_pos - ret_val, l_pos_in_item, tb->insert_size[0], body, zeros_num); /* previous string prepared space for pasting new entry, following string pastes this entry */ /* when we have merge directory item, pos_in_item has been changed too */ /* paste new directory entry. 1 is entry number */ - leaf_paste_entries(&bi, - n + - item_pos - - - ret_val, - l_pos_in_item, - 1, - (struct - reiserfs_de_head - *) - body, - body - + - DEH_SIZE, - tb-> - insert_size - [0] - ); + leaf_paste_entries(&bi, n + item_pos - ret_val, l_pos_in_item, + 1, (struct reiserfs_de_head *) body, + body + DEH_SIZE, tb->insert_size[0]); tb->insert_size[0] = 0; } else { /* new directory item doesn't fall into L[0] */ /* Shift lnum[0]-1 items in whole. Shift lbytes directory entries from directory item number lnum[0] */ - leaf_shift_left(tb, - tb-> - lnum[0], - tb-> - lbytes); + leaf_shift_left(tb, tb->lnum[0], tb->lbytes); } /* Calculate new position to append in item body */ pos_in_item -= tb->lbytes; } else { /* regular object */ - RFALSE(tb->lbytes <= 0, - "PAP-12095: there is nothing to shift to L[0]. 
lbytes=%d", - tb->lbytes); - RFALSE(pos_in_item != - ih_item_len - (B_N_PITEM_HEAD - (tbS0, item_pos)), + RFALSE(tb->lbytes <= 0, "PAP-12095: there is nothing to shift to L[0]. lbytes=%d", tb->lbytes); + RFALSE(pos_in_item != ih_item_len(B_N_PITEM_HEAD(tbS0, item_pos)), "PAP-12100: incorrect position to paste: item_len=%d, pos_in_item=%d", - ih_item_len - (B_N_PITEM_HEAD - (tbS0, item_pos)), - pos_in_item); + ih_item_len(B_N_PITEM_HEAD(tbS0, item_pos)),pos_in_item); if (tb->lbytes >= pos_in_item) { /* appended item will be in L[0] in whole */ int l_n; /* this bytes number must be appended to the last item of L[h] */ - l_n = - tb->lbytes - - pos_in_item; + l_n = tb->lbytes - pos_in_item; /* Calculate new insert_size[0] */ - tb->insert_size[0] -= - l_n; + tb->insert_size[0] -= l_n; - RFALSE(tb-> - insert_size[0] <= - 0, + RFALSE(tb->insert_size[0] <= 0, "PAP-12105: there is nothing to paste into L[0]. insert_size=%d", - tb-> - insert_size[0]); - ret_val = - leaf_shift_left(tb, - tb-> - lnum - [0], - ih_item_len - (B_N_PITEM_HEAD - (tbS0, - item_pos))); + tb->insert_size[0]); + ret_val = leaf_shift_left(tb, tb->lnum[0], ih_item_len + (B_N_PITEM_HEAD(tbS0, item_pos))); /* Append to body of item in L[0] */ buffer_info_init_left(tb, &bi); leaf_paste_in_buffer - (&bi, - n + item_pos - - ret_val, - ih_item_len - (B_N_PITEM_HEAD - (tb->L[0], - n + item_pos - - ret_val)), l_n, - body, - zeros_num > - l_n ? l_n : - zeros_num); + (&bi, n + item_pos - ret_val, ih_item_len + (B_N_PITEM_HEAD(tb->L[0], n + item_pos - ret_val)), + l_n, body, + zeros_num > l_n ? 
l_n : zeros_num); /* 0-th item in S0 can be only of DIRECT type when l_n != 0 */ { int version; - int temp_l = - l_n; - - RFALSE - (ih_item_len - (B_N_PITEM_HEAD - (tbS0, - 0)), + int temp_l = l_n; + + RFALSE(ih_item_len(B_N_PITEM_HEAD(tbS0, 0)), "PAP-12106: item length must be 0"); - RFALSE - (comp_short_le_keys - (B_N_PKEY - (tbS0, 0), - B_N_PKEY - (tb->L[0], - n + - item_pos - - - ret_val)), + RFALSE(comp_short_le_keys(B_N_PKEY(tbS0, 0), B_N_PKEY + (tb->L[0], n + item_pos - ret_val)), "PAP-12107: items must be of the same file"); if (is_indirect_le_ih(B_N_PITEM_HEAD(tb->L[0], n + item_pos - ret_val))) { - temp_l = - l_n - << - (tb-> - tb_sb-> - s_blocksize_bits - - - UNFM_P_SHIFT); + temp_l = l_n << (tb->tb_sb-> s_blocksize_bits - UNFM_P_SHIFT); } /* update key of first item in S0 */ - version = - ih_version - (B_N_PITEM_HEAD - (tbS0, 0)); - set_le_key_k_offset - (version, - B_N_PKEY - (tbS0, 0), - le_key_k_offset - (version, - B_N_PKEY - (tbS0, - 0)) + - temp_l); + version = ih_version(B_N_PITEM_HEAD(tbS0, 0)); + set_le_key_k_offset(version, B_N_PKEY(tbS0, 0), + le_key_k_offset(version,B_N_PKEY(tbS0, 0)) + temp_l); /* update left delimiting key */ - set_le_key_k_offset - (version, - B_N_PDELIM_KEY - (tb-> - CFL[0], - tb-> - lkey[0]), - le_key_k_offset - (version, - B_N_PDELIM_KEY - (tb-> - CFL[0], - tb-> - lkey[0])) - + temp_l); + set_le_key_k_offset(version, B_N_PDELIM_KEY(tb->CFL[0], tb->lkey[0]), + le_key_k_offset(version, B_N_PDELIM_KEY(tb->CFL[0], tb->lkey[0])) + temp_l); } /* Calculate new body, position in item and insert_size[0] */ if (l_n > zeros_num) { - body += - (l_n - - zeros_num); + body += (l_n - zeros_num); zeros_num = 0; } else - zeros_num -= - l_n; + zeros_num -= l_n; pos_in_item = 0; - RFALSE - (comp_short_le_keys - (B_N_PKEY(tbS0, 0), - B_N_PKEY(tb->L[0], - B_NR_ITEMS - (tb-> - L[0]) - - 1)) - || - !op_is_left_mergeable - (B_N_PKEY(tbS0, 0), - tbS0->b_size) - || - !op_is_left_mergeable - (B_N_PDELIM_KEY - (tb->CFL[0], - tb->lkey[0]), - 
tbS0->b_size), + RFALSE(comp_short_le_keys(B_N_PKEY(tbS0, 0), B_N_PKEY(tb->L[0], B_NR_ITEMS(tb->L[0]) - 1)) + || !op_is_left_mergeable(B_N_PKEY(tbS0, 0), tbS0->b_size) + || !op_is_left_mergeable(B_N_PDELIM_KEY(tb->CFL[0], tb->lkey[0]), tbS0->b_size), "PAP-12120: item must be merge-able with left neighboring item"); } else { /* only part of the appended item will be in L[0] */ /* Calculate position in item for append in S[0] */ - pos_in_item -= - tb->lbytes; + pos_in_item -= tb->lbytes; - RFALSE(pos_in_item <= 0, - "PAP-12125: no place for paste. pos_in_item=%d", - pos_in_item); + RFALSE(pos_in_item <= 0, "PAP-12125: no place for paste. pos_in_item=%d", pos_in_item); /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 byte from item number lnum[0] */ - leaf_shift_left(tb, - tb-> - lnum[0], - tb-> - lbytes); + leaf_shift_left(tb, tb->lnum[0], tb->lbytes); } } } else { /* appended item will be in L[0] in whole */ @@ -665,52 +495,30 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h if (!item_pos && op_is_left_mergeable(B_N_PKEY(tbS0, 0), tbS0->b_size)) { /* if we paste into first item of S[0] and it is left mergable */ /* then increment pos_in_item by the size of the last item in L[0] */ - pasted = - B_N_PITEM_HEAD(tb->L[0], - n - 1); + pasted = B_N_PITEM_HEAD(tb->L[0], n - 1); if (is_direntry_le_ih(pasted)) - pos_in_item += - ih_entry_count - (pasted); + pos_in_item += ih_entry_count(pasted); else - pos_in_item += - ih_item_len(pasted); + pos_in_item += ih_item_len(pasted); } /* Shift lnum[0] - 1 items in whole. 
Shift lbytes - 1 byte from item number lnum[0] */ - ret_val = - leaf_shift_left(tb, tb->lnum[0], - tb->lbytes); + ret_val = leaf_shift_left(tb, tb->lnum[0], tb->lbytes); /* Append to body of item in L[0] */ buffer_info_init_left(tb, &bi); - leaf_paste_in_buffer(&bi, - n + item_pos - - ret_val, + leaf_paste_in_buffer(&bi, n + item_pos - ret_val, pos_in_item, tb->insert_size[0], body, zeros_num); /* if appended item is directory, paste entry */ - pasted = - B_N_PITEM_HEAD(tb->L[0], - n + item_pos - - ret_val); + pasted = B_N_PITEM_HEAD(tb->L[0], n + item_pos - ret_val); if (is_direntry_le_ih(pasted)) - leaf_paste_entries(&bi, - n + - item_pos - - ret_val, - pos_in_item, - 1, - (struct - reiserfs_de_head - *)body, - body + - DEH_SIZE, - tb-> - insert_size - [0] - ); + leaf_paste_entries(&bi, n + item_pos - ret_val, + pos_in_item, 1, + (struct reiserfs_de_head *) body, + body + DEH_SIZE, + tb->insert_size[0]); /* if appended item is indirect item, put unformatted node into un list */ if (is_indirect_le_ih(pasted)) set_ih_free_space(pasted, 0); @@ -722,13 +530,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h reiserfs_panic(tb->tb_sb, "PAP-12130", "lnum > 0: unexpected mode: " " %s(%d)", - (flag == - M_DELETE) ? "DELETE" : ((flag == - M_CUT) - ? "CUT" - : - "UNKNOWN"), - flag); + (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? 
"CUT" : "UNKNOWN"), flag); } } else { /* new item doesn't fall into L[0] */ @@ -748,14 +550,12 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h case M_INSERT: /* insert item */ if (n - tb->rnum[0] < item_pos) { /* new item or its part falls to R[0] */ if (item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1) { /* part of new item falls into R[0] */ - loff_t old_key_comp, old_len, - r_zeros_number; + loff_t old_key_comp, old_len, r_zeros_number; const char *r_body; int version; loff_t offset; - leaf_shift_right(tb, tb->rnum[0] - 1, - -1); + leaf_shift_right(tb, tb->rnum[0] - 1, -1); version = ih_version(ih); /* Remember key component and item length */ @@ -763,29 +563,17 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h old_len = ih_item_len(ih); /* Calculate key component and item length to insert into R[0] */ - offset = - le_ih_k_offset(ih) + - ((old_len - - tb-> - rbytes) << (is_indirect_le_ih(ih) - ? tb->tb_sb-> - s_blocksize_bits - - UNFM_P_SHIFT : 0)); + offset = le_ih_k_offset(ih) + ((old_len - tb->rbytes) << (is_indirect_le_ih(ih) ? 
tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT : 0)); set_le_ih_k_offset(ih, offset); put_ih_item_len(ih, tb->rbytes); /* Insert part of the item into R[0] */ buffer_info_init_right(tb, &bi); if ((old_len - tb->rbytes) > zeros_num) { r_zeros_number = 0; - r_body = - body + (old_len - - tb->rbytes) - - zeros_num; + r_body = body + (old_len - tb->rbytes) - zeros_num; } else { r_body = body; - r_zeros_number = - zeros_num - (old_len - - tb->rbytes); + r_zeros_number = zeros_num - (old_len - tb->rbytes); zeros_num -= r_zeros_number; } @@ -798,25 +586,18 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h /* Calculate key component and item length to insert into S[0] */ set_le_ih_k_offset(ih, old_key_comp); - put_ih_item_len(ih, - old_len - tb->rbytes); + put_ih_item_len(ih, old_len - tb->rbytes); tb->insert_size[0] -= tb->rbytes; } else { /* whole new item falls into R[0] */ /* Shift rnum[0]-1 items to R[0] */ - ret_val = - leaf_shift_right(tb, - tb->rnum[0] - 1, - tb->rbytes); + ret_val = leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes); /* Insert new item into R[0] */ buffer_info_init_right(tb, &bi); - leaf_insert_into_buf(&bi, - item_pos - n + - tb->rnum[0] - 1, - ih, body, - zeros_num); + leaf_insert_into_buf(&bi, item_pos - n + tb->rnum[0] - 1, + ih, body, zeros_num); if (item_pos - n + tb->rnum[0] - 1 == 0) { replace_key(tb, tb->CFR[0], @@ -841,200 +622,97 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h RFALSE(zeros_num, "PAP-12145: invalid parameter in case of a directory"); - entry_count = - I_ENTRY_COUNT(B_N_PITEM_HEAD - (tbS0, - item_pos)); + entry_count = I_ENTRY_COUNT(B_N_PITEM_HEAD + (tbS0, item_pos)); if (entry_count - tb->rbytes < pos_in_item) /* new directory entry falls into R[0] */ { int paste_entry_position; - RFALSE(tb->rbytes - 1 >= - entry_count - || !tb-> - insert_size[0], + RFALSE(tb->rbytes - 1 >= entry_count || !tb-> insert_size[0], "PAP-12150: no enough of entries to shift to 
R[0]: rbytes=%d, entry_count=%d", - tb->rbytes, - entry_count); + tb->rbytes, entry_count); /* Shift rnum[0]-1 items in whole. Shift rbytes-1 directory entries from directory item number rnum[0] */ - leaf_shift_right(tb, - tb-> - rnum - [0], - tb-> - rbytes - - 1); + leaf_shift_right(tb, tb->rnum[0], tb->rbytes - 1); /* Paste given directory entry to directory item */ - paste_entry_position = - pos_in_item - - entry_count + - tb->rbytes - 1; + paste_entry_position = pos_in_item - entry_count + tb->rbytes - 1; buffer_info_init_right(tb, &bi); - leaf_paste_in_buffer - (&bi, 0, - paste_entry_position, - tb->insert_size[0], - body, zeros_num); + leaf_paste_in_buffer(&bi, 0, paste_entry_position, tb->insert_size[0], body, zeros_num); /* paste entry */ - leaf_paste_entries(&bi, - 0, - paste_entry_position, - 1, - (struct - reiserfs_de_head - *) - body, - body - + - DEH_SIZE, - tb-> - insert_size - [0] - ); - - if (paste_entry_position - == 0) { + leaf_paste_entries(&bi, 0, paste_entry_position, 1, + (struct reiserfs_de_head *) body, + body + DEH_SIZE, tb->insert_size[0]); + + if (paste_entry_position == 0) { /* change delimiting keys */ - replace_key(tb, - tb-> - CFR - [0], - tb-> - rkey - [0], - tb-> - R - [0], - 0); + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0],0); } tb->insert_size[0] = 0; pos_in_item++; } else { /* new directory entry doesn't fall into R[0] */ - leaf_shift_right(tb, - tb-> - rnum - [0], - tb-> - rbytes); + leaf_shift_right(tb, tb->rnum[0], tb->rbytes); } } else { /* regular object */ - int n_shift, n_rem, - r_zeros_number; + int n_shift, n_rem, r_zeros_number; const char *r_body; /* Calculate number of bytes which must be shifted from appended item */ - if ((n_shift = - tb->rbytes - - tb->insert_size[0]) < 0) + if ((n_shift = tb->rbytes - tb->insert_size[0]) < 0) n_shift = 0; - RFALSE(pos_in_item != - ih_item_len - (B_N_PITEM_HEAD - (tbS0, item_pos)), + RFALSE(pos_in_item != ih_item_len + (B_N_PITEM_HEAD(tbS0, item_pos)), "PAP-12155: invalid 
position to paste. ih_item_len=%d, pos_in_item=%d", - pos_in_item, - ih_item_len - (B_N_PITEM_HEAD - (tbS0, item_pos))); - - leaf_shift_right(tb, - tb->rnum[0], - n_shift); + pos_in_item, ih_item_len + (B_N_PITEM_HEAD(tbS0, item_pos))); + + leaf_shift_right(tb, tb->rnum[0], n_shift); /* Calculate number of bytes which must remain in body after appending to R[0] */ - if ((n_rem = - tb->insert_size[0] - - tb->rbytes) < 0) + if ((n_rem = tb->insert_size[0] - tb->rbytes) < 0) n_rem = 0; { int version; - unsigned long temp_rem = - n_rem; - - version = - ih_version - (B_N_PITEM_HEAD - (tb->R[0], 0)); - if (is_indirect_le_key - (version, - B_N_PKEY(tb->R[0], - 0))) { - temp_rem = - n_rem << - (tb->tb_sb-> - s_blocksize_bits - - - UNFM_P_SHIFT); + unsigned long temp_rem = n_rem; + + version = ih_version(B_N_PITEM_HEAD(tb->R[0], 0)); + if (is_indirect_le_key(version, B_N_PKEY(tb->R[0], 0))) { + temp_rem = n_rem << (tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT); } - set_le_key_k_offset - (version, - B_N_PKEY(tb->R[0], - 0), - le_key_k_offset - (version, - B_N_PKEY(tb->R[0], - 0)) + - temp_rem); - set_le_key_k_offset - (version, - B_N_PDELIM_KEY(tb-> - CFR - [0], - tb-> - rkey - [0]), - le_key_k_offset - (version, - B_N_PDELIM_KEY - (tb->CFR[0], - tb->rkey[0])) + - temp_rem); + set_le_key_k_offset(version, B_N_PKEY(tb->R[0], 0), + le_key_k_offset(version, B_N_PKEY(tb->R[0], 0)) + temp_rem); + set_le_key_k_offset(version, B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0]), + le_key_k_offset(version, B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0])) + temp_rem); } /* k_offset (B_N_PKEY(tb->R[0],0)) += n_rem; k_offset (B_N_PDELIM_KEY(tb->CFR[0],tb->rkey[0])) += n_rem;*/ - do_balance_mark_internal_dirty - (tb, tb->CFR[0], 0); + do_balance_mark_internal_dirty(tb, tb->CFR[0], 0); /* Append part of body into R[0] */ buffer_info_init_right(tb, &bi); if (n_rem > zeros_num) { r_zeros_number = 0; - r_body = - body + n_rem - - zeros_num; + r_body = body + n_rem - zeros_num; } else { r_body = body; - 
r_zeros_number = - zeros_num - n_rem; - zeros_num -= - r_zeros_number; + r_zeros_number = zeros_num - n_rem; + zeros_num -= r_zeros_number; } - leaf_paste_in_buffer(&bi, 0, - n_shift, - tb-> - insert_size - [0] - - n_rem, - r_body, - r_zeros_number); - - if (is_indirect_le_ih - (B_N_PITEM_HEAD - (tb->R[0], 0))) { + leaf_paste_in_buffer(&bi, 0, n_shift, + tb->insert_size[0] - n_rem, + r_body, r_zeros_number); + + if (is_indirect_le_ih(B_N_PITEM_HEAD(tb->R[0], 0))) { #if 0 RFALSE(n_rem, "PAP-12160: paste more than one unformatted node pointer"); #endif - set_ih_free_space - (B_N_PITEM_HEAD - (tb->R[0], 0), 0); + set_ih_free_space(B_N_PITEM_HEAD(tb->R[0], 0), 0); } tb->insert_size[0] = n_rem; if (!n_rem) @@ -1044,58 +722,28 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h struct item_head *pasted; - ret_val = - leaf_shift_right(tb, tb->rnum[0], - tb->rbytes); + ret_val = leaf_shift_right(tb, tb->rnum[0], tb->rbytes); /* append item in R[0] */ if (pos_in_item >= 0) { buffer_info_init_right(tb, &bi); - leaf_paste_in_buffer(&bi, - item_pos - - n + - tb-> - rnum[0], - pos_in_item, - tb-> - insert_size - [0], body, - zeros_num); + leaf_paste_in_buffer(&bi, item_pos - n + tb->rnum[0], pos_in_item, + tb->insert_size[0], body, zeros_num); } /* paste new entry, if item is directory item */ - pasted = - B_N_PITEM_HEAD(tb->R[0], - item_pos - n + - tb->rnum[0]); - if (is_direntry_le_ih(pasted) - && pos_in_item >= 0) { - leaf_paste_entries(&bi, - item_pos - - n + - tb->rnum[0], - pos_in_item, - 1, - (struct - reiserfs_de_head - *)body, - body + - DEH_SIZE, - tb-> - insert_size - [0] - ); + pasted = B_N_PITEM_HEAD(tb->R[0], item_pos - n + tb->rnum[0]); + if (is_direntry_le_ih(pasted) && pos_in_item >= 0) { + leaf_paste_entries(&bi, item_pos - n + tb->rnum[0], + pos_in_item, 1, + (struct reiserfs_de_head *) body, + body + DEH_SIZE, tb->insert_size[0]); if (!pos_in_item) { - RFALSE(item_pos - n + - tb->rnum[0], + RFALSE(item_pos - n + tb->rnum[0], 
"PAP-12165: directory item must be first item of node when pasting is in 0th position"); /* update delimiting keys */ - replace_key(tb, - tb->CFR[0], - tb->rkey[0], - tb->R[0], - 0); + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); } } @@ -1111,22 +759,16 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h default: /* cases d and t */ reiserfs_panic(tb->tb_sb, "PAP-12175", "rnum > 0: unexpected mode: %s(%d)", - (flag == - M_DELETE) ? "DELETE" : ((flag == - M_CUT) ? "CUT" - : "UNKNOWN"), - flag); + (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? "CUT" : "UNKNOWN"), flag); } } /* tb->rnum[0] > 0 */ RFALSE(tb->blknum[0] > 3, - "PAP-12180: blknum can not be %d. It must be <= 3", - tb->blknum[0]); + "PAP-12180: blknum can not be %d. It must be <= 3", tb->blknum[0]); RFALSE(tb->blknum[0] < 0, - "PAP-12185: blknum can not be %d. It must be >= 0", - tb->blknum[0]); + "PAP-12185: blknum can not be %d. It must be >= 0", tb->blknum[0]); /* if while adding to a node we discover that it is possible to split it in two, and merge the left part into the left neighbor and the @@ -1177,8 +819,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h if (n - snum[i] < item_pos) { /* new item or it's part falls to first new node S_new[i] */ if (item_pos == n - snum[i] + 1 && sbytes[i] != -1) { /* part of new item falls into S_new[i] */ - int old_key_comp, old_len, - r_zeros_number; + int old_key_comp, old_len, r_zeros_number; const char *r_body; int version; @@ -1192,15 +833,8 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h old_len = ih_item_len(ih); /* Calculate key component and item length to insert into S_new[i] */ - set_le_ih_k_offset(ih, - le_ih_k_offset(ih) + - ((old_len - - sbytes[i]) << - (is_indirect_le_ih - (ih) ? tb->tb_sb-> - s_blocksize_bits - - UNFM_P_SHIFT : - 0))); + set_le_ih_k_offset(ih, le_ih_k_offset(ih) + + ((old_len - sbytes[i]) << (is_indirect_le_ih(ih) ? 
tb->tb_sb-> s_blocksize_bits - UNFM_P_SHIFT : 0))); put_ih_item_len(ih, sbytes[i]); @@ -1209,39 +843,29 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h if ((old_len - sbytes[i]) > zeros_num) { r_zeros_number = 0; - r_body = - body + (old_len - - sbytes[i]) - - zeros_num; + r_body = body + (old_len - sbytes[i]) - zeros_num; } else { r_body = body; - r_zeros_number = - zeros_num - (old_len - - sbytes[i]); + r_zeros_number = zeros_num - (old_len - sbytes[i]); zeros_num -= r_zeros_number; } - leaf_insert_into_buf(&bi, 0, ih, r_body, - r_zeros_number); + leaf_insert_into_buf(&bi, 0, ih, r_body, r_zeros_number); /* Calculate key component and item length to insert into S[i] */ set_le_ih_k_offset(ih, old_key_comp); - put_ih_item_len(ih, - old_len - sbytes[i]); + put_ih_item_len(ih, old_len - sbytes[i]); tb->insert_size[0] -= sbytes[i]; } else { /* whole new item falls into S_new[i] */ /* Shift snum[0] - 1 items to S_new[i] (sbytes[i] of split item) */ leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, - snum[i] - 1, sbytes[i], - S_new[i]); + snum[i] - 1, sbytes[i], S_new[i]); /* Insert new item into S_new[i] */ buffer_info_init_bh(tb, &bi, S_new[i]); - leaf_insert_into_buf(&bi, - item_pos - n + - snum[i] - 1, ih, - body, zeros_num); + leaf_insert_into_buf(&bi, item_pos - n + snum[i] - 1, + ih, body, zeros_num); zeros_num = tb->insert_size[0] = 0; } @@ -1268,150 +892,73 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h int entry_count; - entry_count = - ih_entry_count(aux_ih); + entry_count = ih_entry_count(aux_ih); - if (entry_count - sbytes[i] < - pos_in_item - && pos_in_item <= - entry_count) { + if (entry_count - sbytes[i] < pos_in_item && pos_in_item <= entry_count) { /* new directory entry falls into S_new[i] */ - RFALSE(!tb-> - insert_size[0], - "PAP-12215: insert_size is already 0"); - RFALSE(sbytes[i] - 1 >= - entry_count, + RFALSE(!tb->insert_size[0], "PAP-12215: insert_size is already 0"); + 
RFALSE(sbytes[i] - 1 >= entry_count, "PAP-12220: there are no so much entries (%d), only %d", - sbytes[i] - 1, - entry_count); + sbytes[i] - 1, entry_count); /* Shift snum[i]-1 items in whole. Shift sbytes[i] directory entries from directory item number snum[i] */ - leaf_move_items - (LEAF_FROM_S_TO_SNEW, - tb, snum[i], - sbytes[i] - 1, - S_new[i]); + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i] - 1, S_new[i]); /* Paste given directory entry to directory item */ buffer_info_init_bh(tb, &bi, S_new[i]); - leaf_paste_in_buffer - (&bi, 0, - pos_in_item - - entry_count + - sbytes[i] - 1, - tb->insert_size[0], - body, zeros_num); + leaf_paste_in_buffer(&bi, 0, pos_in_item - entry_count + sbytes[i] - 1, + tb->insert_size[0], body, zeros_num); /* paste new directory entry */ - leaf_paste_entries(&bi, - 0, - pos_in_item - - - entry_count - + - sbytes - [i] - - 1, 1, - (struct - reiserfs_de_head - *) - body, - body - + - DEH_SIZE, - tb-> - insert_size - [0] - ); + leaf_paste_entries(&bi, 0, pos_in_item - entry_count + sbytes[i] - 1, 1, + (struct reiserfs_de_head *) body, + body + DEH_SIZE, tb->insert_size[0]); tb->insert_size[0] = 0; pos_in_item++; } else { /* new directory entry doesn't fall into S_new[i] */ - leaf_move_items - (LEAF_FROM_S_TO_SNEW, - tb, snum[i], - sbytes[i], - S_new[i]); + leaf_move_items(LEAF_FROM_S_TO_SNEW,tb, snum[i], sbytes[i], S_new[i]); } } else { /* regular object */ - int n_shift, n_rem, - r_zeros_number; + int n_shift, n_rem, r_zeros_number; const char *r_body; - RFALSE(pos_in_item != - ih_item_len - (B_N_PITEM_HEAD - (tbS0, item_pos)) - || tb->insert_size[0] <= - 0, + RFALSE(pos_in_item != ih_item_len(B_N_PITEM_HEAD(tbS0, item_pos)) || tb->insert_size[0] <= 0, "PAP-12225: item too short or insert_size <= 0"); /* Calculate number of bytes which must be shifted from appended item */ - n_shift = - sbytes[i] - - tb->insert_size[0]; + n_shift = sbytes[i] - tb->insert_size[0]; if (n_shift < 0) n_shift = 0; - leaf_move_items - 
(LEAF_FROM_S_TO_SNEW, tb, - snum[i], n_shift, - S_new[i]); + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, snum[i], n_shift, S_new[i]); /* Calculate number of bytes which must remain in body after append to S_new[i] */ - n_rem = - tb->insert_size[0] - - sbytes[i]; + n_rem = tb->insert_size[0] - sbytes[i]; if (n_rem < 0) n_rem = 0; /* Append part of body into S_new[0] */ buffer_info_init_bh(tb, &bi, S_new[i]); if (n_rem > zeros_num) { r_zeros_number = 0; - r_body = - body + n_rem - - zeros_num; + r_body = body + n_rem - zeros_num; } else { r_body = body; - r_zeros_number = - zeros_num - n_rem; - zeros_num -= - r_zeros_number; + r_zeros_number = zeros_num - n_rem; + zeros_num -= r_zeros_number; } - leaf_paste_in_buffer(&bi, 0, - n_shift, - tb-> - insert_size - [0] - - n_rem, - r_body, - r_zeros_number); + leaf_paste_in_buffer(&bi, 0, n_shift, + tb->insert_size[0] - n_rem, + r_body, r_zeros_number); { struct item_head *tmp; - tmp = - B_N_PITEM_HEAD(S_new - [i], - 0); + tmp = B_N_PITEM_HEAD(S_new[i], 0); if (is_indirect_le_ih (tmp)) { - set_ih_free_space - (tmp, 0); - set_le_ih_k_offset - (tmp, - le_ih_k_offset - (tmp) + - (n_rem << - (tb-> - tb_sb-> - s_blocksize_bits - - - UNFM_P_SHIFT))); + set_ih_free_space(tmp, 0); + set_le_ih_k_offset(tmp, le_ih_k_offset(tmp) + (n_rem << (tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT))); } else { - set_le_ih_k_offset - (tmp, - le_ih_k_offset - (tmp) + - n_rem); + set_le_ih_k_offset(tmp, le_ih_k_offset(tmp) + n_rem); } } @@ -1426,8 +973,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h struct item_head *pasted; #ifdef CONFIG_REISERFS_CHECK - struct item_head *ih_check = - B_N_PITEM_HEAD(tbS0, item_pos); + struct item_head *ih_check = B_N_PITEM_HEAD(tbS0, item_pos); if (!is_direntry_le_ih(ih_check) && (pos_in_item != ih_item_len(ih_check) @@ -1439,8 +985,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h "to ih_item_len"); #endif /* CONFIG_REISERFS_CHECK */ - leaf_mi = 
- leaf_move_items(LEAF_FROM_S_TO_SNEW, + leaf_mi = leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i], S_new[i]); @@ -1452,30 +997,19 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h /* paste into item */ buffer_info_init_bh(tb, &bi, S_new[i]); leaf_paste_in_buffer(&bi, - item_pos - n + - snum[i], + item_pos - n + snum[i], pos_in_item, tb->insert_size[0], body, zeros_num); - pasted = - B_N_PITEM_HEAD(S_new[i], - item_pos - n + - snum[i]); + pasted = B_N_PITEM_HEAD(S_new[i], item_pos - n + snum[i]); if (is_direntry_le_ih(pasted)) { leaf_paste_entries(&bi, - item_pos - - n + snum[i], - pos_in_item, - 1, - (struct - reiserfs_de_head - *)body, - body + - DEH_SIZE, - tb-> - insert_size - [0] + item_pos - n + snum[i], + pos_in_item, 1, + (struct reiserfs_de_head *)body, + body + DEH_SIZE, + tb->insert_size[0] ); } @@ -1495,11 +1029,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h default: /* cases d and t */ reiserfs_panic(tb->tb_sb, "PAP-12245", "blknum > 2: unexpected mode: %s(%d)", - (flag == - M_DELETE) ? "DELETE" : ((flag == - M_CUT) ? "CUT" - : "UNKNOWN"), - flag); + (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? 
"CUT" : "UNKNOWN"), flag); } memcpy(insert_key + i, B_N_PKEY(S_new[i], 0), KEY_SIZE); @@ -1524,9 +1054,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h /* If we insert the first key change the delimiting key */ if (item_pos == 0) { if (tb->CFL[0]) /* can be 0 in reiserfsck */ - replace_key(tb, tb->CFL[0], tb->lkey[0], - tbS0, 0); - + replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0); } break; @@ -1536,53 +1064,27 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h pasted = B_N_PITEM_HEAD(tbS0, item_pos); /* when directory, may be new entry already pasted */ if (is_direntry_le_ih(pasted)) { - if (pos_in_item >= 0 && - pos_in_item <= - ih_entry_count(pasted)) { + if (pos_in_item >= 0 && pos_in_item <= ih_entry_count(pasted)) { RFALSE(!tb->insert_size[0], "PAP-12260: insert_size is 0 already"); /* prepare space */ buffer_info_init_tbS0(tb, &bi); - leaf_paste_in_buffer(&bi, - item_pos, - pos_in_item, - tb-> - insert_size - [0], body, + leaf_paste_in_buffer(&bi, item_pos, pos_in_item, + tb->insert_size[0], body, zeros_num); /* paste entry */ - leaf_paste_entries(&bi, - item_pos, - pos_in_item, - 1, - (struct - reiserfs_de_head - *)body, - body + - DEH_SIZE, - tb-> - insert_size - [0] - ); + leaf_paste_entries(&bi, item_pos, pos_in_item, 1, + (struct reiserfs_de_head *)body, + body + DEH_SIZE, + tb->insert_size[0]); if (!item_pos && !pos_in_item) { - RFALSE(!tb->CFL[0] - || !tb->L[0], + RFALSE(!tb->CFL[0] || !tb->L[0], "PAP-12270: CFL[0]/L[0] must be specified"); - if (tb->CFL[0]) { - replace_key(tb, - tb-> - CFL - [0], - tb-> - lkey - [0], - tbS0, - 0); - - } + if (tb->CFL[0]) + replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0); } tb->insert_size[0] = 0; } @@ -1593,13 +1095,8 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h "PAP-12275: insert size must not be %d", tb->insert_size[0]); buffer_info_init_tbS0(tb, &bi); - leaf_paste_in_buffer(&bi, - item_pos, - 
pos_in_item, - tb-> - insert_size - [0], body, - zeros_num); + leaf_paste_in_buffer(&bi, item_pos, pos_in_item, + tb->insert_size[0], body, zeros_num); if (is_indirect_le_ih(pasted)) { #if 0 @@ -1611,8 +1108,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h tb-> insert_size[0]); #endif - set_ih_free_space - (pasted, 0); + set_ih_free_space(pasted, 0); } tb->insert_size[0] = 0; } @@ -1620,8 +1116,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h else { if (tb->insert_size[0]) { print_cur_tb("12285"); - reiserfs_panic(tb-> - tb_sb, + reiserfs_panic(tb->tb_sb, "PAP-12285", "insert_size " "must be 0 " diff --git a/fs/super.c b/fs/super.c index cecd780e0f44..80d5cf2ca765 100644 --- a/fs/super.c +++ b/fs/super.c @@ -703,7 +703,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) if (flags & MS_RDONLY) acct_auto_close(sb); shrink_dcache_sb(sb); - sync_filesystem(sb); remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); @@ -720,6 +719,8 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) } } + sync_filesystem(sb); + if (sb->s_op->remount_fs) { retval = sb->s_op->remount_fs(sb, &flags, data); if (retval) { diff --git a/fs/sync.c b/fs/sync.c index f15537452231..b28d1dd10e8b 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -27,11 +27,10 @@ * wait == 1 case since in that case write_inode() functions do * sync_dirty_buffer() and thus effectively write one block at a time. 
*/ -static int __sync_filesystem(struct super_block *sb, int wait, - unsigned long start) +static int __sync_filesystem(struct super_block *sb, int wait) { if (wait) - sync_inodes_sb(sb, start); + sync_inodes_sb(sb); else writeback_inodes_sb(sb, WB_REASON_SYNC); @@ -48,7 +47,6 @@ static int __sync_filesystem(struct super_block *sb, int wait, int sync_filesystem(struct super_block *sb) { int ret; - unsigned long start = jiffies; /* * We need to be protected against the filesystem going from @@ -62,17 +60,17 @@ int sync_filesystem(struct super_block *sb) if (sb->s_flags & MS_RDONLY) return 0; - ret = __sync_filesystem(sb, 0, start); + ret = __sync_filesystem(sb, 0); if (ret < 0) return ret; - return __sync_filesystem(sb, 1, start); + return __sync_filesystem(sb, 1); } EXPORT_SYMBOL_GPL(sync_filesystem); static void sync_inodes_one_sb(struct super_block *sb, void *arg) { if (!(sb->s_flags & MS_RDONLY)) - sync_inodes_sb(sb, *((unsigned long *)arg)); + sync_inodes_sb(sb); } static void sync_fs_one_sb(struct super_block *sb, void *arg) @@ -104,10 +102,9 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg) SYSCALL_DEFINE0(sync) { int nowait = 0, wait = 1; - unsigned long start = jiffies; wakeup_flusher_threads(0, WB_REASON_SYNC); - iterate_supers(sync_inodes_one_sb, &start); + iterate_supers(sync_inodes_one_sb, NULL); iterate_supers(sync_fs_one_sb, &nowait); iterate_supers(sync_fs_one_sb, &wait); iterate_bdevs(fdatawrite_one_bdev, NULL); @@ -222,23 +219,6 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd) return do_fsync(fd, 1); } -/** - * generic_write_sync - perform syncing after a write if file / inode is sync - * @file: file to which the write happened - * @pos: offset where the write started - * @count: length of the write - * - * This is just a simple wrapper about our general syncing function. 
- */ -int generic_write_sync(struct file *file, loff_t pos, loff_t count) -{ - if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host)) - return 0; - return vfs_fsync_range(file, pos, pos + count - 1, - (file->f_flags & __O_SYNC) ? 0 : 1); -} -EXPORT_SYMBOL(generic_write_sync); - /* * sys_sync_file_range() permits finely controlled syncing over a segment of * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 6211230814fd..3eaf5c6622eb 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -27,6 +27,7 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, { struct dentry *root; void *ns; + bool new_sb; if (!(flags & MS_KERNMOUNT)) { if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type)) @@ -37,8 +38,8 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, } ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); - root = kernfs_mount_ns(fs_type, flags, sysfs_root, ns); - if (IS_ERR(root)) + root = kernfs_mount_ns(fs_type, flags, sysfs_root, &new_sb, ns); + if (IS_ERR(root) || !new_sb) kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); return root; } diff --git a/fs/udf/file.c b/fs/udf/file.c index c02a27a19c6d..1037637957c7 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -144,6 +144,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, size_t count = iocb->ki_nbytes; struct udf_inode_info *iinfo = UDF_I(inode); + mutex_lock(&inode->i_mutex); down_write(&iinfo->i_data_sem); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { if (file->f_flags & O_APPEND) @@ -156,6 +157,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, pos + count)) { err = udf_expand_file_adinicb(inode); if (err) { + mutex_unlock(&inode->i_mutex); udf_debug("udf_expand_adinicb: err=%d\n", err); return err; } @@ -169,9 +171,17 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, } else 
up_write(&iinfo->i_data_sem); - retval = generic_file_aio_write(iocb, iov, nr_segs, ppos); - if (retval > 0) + retval = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); + mutex_unlock(&inode->i_mutex); + + if (retval > 0) { + ssize_t err; + mark_inode_dirty(inode); + err = generic_write_sync(file, iocb->ki_pos - retval, retval); + if (err < 0) + retval = err; + } return retval; } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 062b7925bca0..982ce05c87ed 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -265,6 +265,7 @@ int udf_expand_file_adinicb(struct inode *inode) .nr_to_write = 1, }; + WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); if (!iinfo->i_lenAlloc) { if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index a26739451b53..db2cfb067d0b 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -407,7 +407,7 @@ xfs_alloc_ioend_bio( struct bio *bio = bio_alloc(GFP_NOIO, nvecs); ASSERT(bio->bi_private == NULL); - bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; return bio; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 51757113a822..9c061ef2b0d9 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1240,7 +1240,7 @@ next_chunk: bio = bio_alloc(GFP_NOIO, nr_pages); bio->bi_bdev = bp->b_target->bt_bdev; - bio->bi_sector = sector; + bio->bi_iter.bi_sector = sector; bio->bi_end_io = xfs_buf_bio_end_io; bio->bi_private = bp; @@ -1262,7 +1262,7 @@ next_chunk: total_nr_pages--; } - if (likely(bio->bi_size)) { + if (likely(bio->bi_iter.bi_size)) { if (xfs_buf_is_vmapped(bp)) { flush_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 2e7989e3a2d6..64b48eade91d 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -799,7 +799,7 @@ xfs_file_aio_write( XFS_STATS_ADD(xs_write_bytes, ret); /* 
Handle various SYNC-type writes */ - err = generic_write_sync(file, pos, ret); + err = generic_write_sync(file, iocb->ki_pos - ret, ret); if (err < 0) ret = err; } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index f35d5c953ff9..9ddfb8190ca1 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -705,7 +705,6 @@ xfs_setattr_size( { struct xfs_mount *mp = ip->i_mount; struct inode *inode = VFS_I(ip); - int mask = iattr->ia_valid; xfs_off_t oldsize, newsize; struct xfs_trans *tp; int error; @@ -726,8 +725,8 @@ xfs_setattr_size( ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(S_ISREG(ip->i_d.di_mode)); - ASSERT((mask & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| - ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); + ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| + ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); oldsize = inode->i_size; newsize = iattr->ia_size; @@ -736,7 +735,7 @@ xfs_setattr_size( * Short circuit the truncate case for zero length files. */ if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) { - if (!(mask & (ATTR_CTIME|ATTR_MTIME))) + if (!(iattr->ia_valid & (ATTR_CTIME|ATTR_MTIME))) return 0; /* @@ -824,10 +823,11 @@ xfs_setattr_size( * these flags set. For all other operations the VFS set these flags * explicitly if it wants a timestamp update. 
*/ - if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) { + if (newsize != oldsize && + !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) { iattr->ia_ctime = iattr->ia_mtime = current_fs_time(inode->i_sb); - mask |= ATTR_CTIME | ATTR_MTIME; + iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME; } /* @@ -863,9 +863,9 @@ xfs_setattr_size( xfs_inode_clear_eofblocks_tag(ip); } - if (mask & ATTR_MODE) + if (iattr->ia_valid & ATTR_MODE) xfs_setattr_mode(ip, iattr); - if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) + if (iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) xfs_setattr_time(ip, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index cdebd832c3db..4ef6fdbced78 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -205,16 +205,25 @@ xlog_cil_insert_format_items( /* * We 64-bit align the length of each iovec so that the start * of the next one is naturally aligned. We'll need to - * account for that slack space here. + * account for that slack space here. Then round nbytes up + * to 64-bit alignment so that the initial buffer alignment is + * easy to calculate and verify. */ nbytes += niovecs * sizeof(uint64_t); + nbytes = round_up(nbytes, sizeof(uint64_t)); /* grab the old item if it exists for reservation accounting */ old_lv = lip->li_lv; - /* calc buffer size */ - buf_size = sizeof(struct xfs_log_vec) + nbytes + - niovecs * sizeof(struct xfs_log_iovec); + /* + * The data buffer needs to start 64-bit aligned, so round up + * that space to ensure we can align it appropriately and not + * overrun the buffer. 
+ */ + buf_size = nbytes + + round_up((sizeof(struct xfs_log_vec) + + niovecs * sizeof(struct xfs_log_iovec)), + sizeof(uint64_t)); /* compare to existing item size */ if (lip->li_lv && buf_size <= lip->li_lv->lv_size) { @@ -251,6 +260,8 @@ xlog_cil_insert_format_items( /* The allocated data region lies beyond the iovec region */ lv->lv_buf_len = 0; lv->lv_buf = (char *)lv + buf_size - nbytes; + ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t))); + lip->li_ops->iop_format(lip, lv); insert: ASSERT(lv->lv_buf_len <= nbytes); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 02df7b408a26..f96c05669a9e 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -282,22 +282,29 @@ xfs_readsb( struct xfs_sb *sbp = &mp->m_sb; int error; int loud = !(flags & XFS_MFSI_QUIET); + const struct xfs_buf_ops *buf_ops; ASSERT(mp->m_sb_bp == NULL); ASSERT(mp->m_ddev_targp != NULL); /* + * For the initial read, we must guess at the sector + * size based on the block device. It's enough to + * get the sb_sectsize out of the superblock and + * then reread with the proper length. + * We don't verify it yet, because it may not be complete. + */ + sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); + buf_ops = NULL; + + /* * Allocate a (locked) buffer to hold the superblock. * This will be kept around at all times to optimize * access to the superblock. */ - sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); - reread: bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, - BTOBB(sector_size), 0, - loud ? &xfs_sb_buf_ops - : &xfs_sb_quiet_buf_ops); + BTOBB(sector_size), 0, buf_ops); if (!bp) { if (loud) xfs_warn(mp, "SB buffer read failed"); @@ -328,12 +335,13 @@ reread: } /* - * If device sector size is smaller than the superblock size, - * re-read the superblock so the buffer is correctly sized. + * Re-read the superblock so the buffer is correctly sized, + * and properly verified. 
*/ - if (sector_size < sbp->sb_sectsize) { + if (buf_ops == NULL) { xfs_buf_relse(bp); sector_size = sbp->sb_sectsize; + buf_ops = loud ? &xfs_sb_buf_ops : &xfs_sb_quiet_buf_ops; goto reread; } diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c index b7c9aea77f8f..1e116794bb66 100644 --- a/fs/xfs/xfs_sb.c +++ b/fs/xfs/xfs_sb.c @@ -295,8 +295,7 @@ xfs_mount_validate_sb( sbp->sb_dblocks == 0 || sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) || sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) { - XFS_CORRUPTION_ERROR("SB sanity check failed", - XFS_ERRLEVEL_LOW, mp, sbp); + xfs_notice(mp, "SB sanity check failed"); return XFS_ERROR(EFSCORRUPTED); } @@ -611,10 +610,10 @@ xfs_sb_read_verify( XFS_SB_VERSION_5) || dsb->sb_crc != 0)) { - if (!xfs_verify_cksum(bp->b_addr, be16_to_cpu(dsb->sb_sectsize), + if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), offsetof(struct xfs_sb, sb_crc))) { /* Only fail bad secondaries on a known V5 filesystem */ - if (bp->b_bn != XFS_SB_DADDR && + if (bp->b_bn == XFS_SB_DADDR || xfs_sb_version_hascrc(&mp->m_sb)) { error = EFSCORRUPTED; goto out_error; @@ -625,7 +624,7 @@ xfs_sb_read_verify( out_error: if (error) { - if (error != EWRONGFS) + if (error == EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, error); @@ -644,7 +643,6 @@ xfs_sb_quiet_read_verify( { struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); - if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) { /* XFS filesystem, verify noisily! */ xfs_sb_read_verify(bp); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index f317488263dd..d971f4932b5d 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -913,7 +913,7 @@ xfs_flush_inodes( struct super_block *sb = mp->m_super; if (down_read_trylock(&sb->s_umount)) { - sync_inodes_sb(sb, jiffies); + sync_inodes_sb(sb); up_read(&sb->s_umount); } } |