Merge tag 'fsverity-for-linus' of git://git.kernel.org/pub/scm/fs/fsverity/linux

Pull fsverity updates from Eric Biggers: "Fix the longstanding implementation limitation that fsverity was only supported when the Merkle tree block size, filesystem block size, and PAGE_SIZE were all equal. Specifically, add support for Merkle tree block sizes less than PAGE_SIZE, and make ext4 support fsverity on filesystems where the filesystem block size is less than PAGE_SIZE. Effectively, this means that fsverity can now be used on systems with non-4K pages, at least on ext4. These changes have been tested using the verity group of xfstests, newly updated to cover the new code paths. Also update fs/verity/ to support verifying data from large folios. There's also a similar patch for fs/crypto/, to support decrypting data from large folios, which I'm including in here to avoid a merge conflict between the fscrypt and fsverity branches" * tag 'fsverity-for-linus' of git://git.kernel.org/pub/scm/fs/fsverity/linux: fscrypt: support decrypting data from large folios fsverity: support verifying data from large folios fsverity.rst: update git repo URL for fsverity-utils ext4: allow verity with fs block size < PAGE_SIZE fs/buffer.c: support fsverity in block_read_full_folio() f2fs: simplify f2fs_readpage_limit() ext4: simplify ext4_readpage_limit() fsverity: support enabling with tree block size < PAGE_SIZE fsverity: support verification with tree block size < PAGE_SIZE fsverity: replace fsverity_hash_page() with fsverity_hash_block() fsverity: use EFBIG for file too large to enable verity fsverity: store log2(digest_size) precomputed fsverity: simplify Merkle tree readahead size calculation fsverity: use unsigned long for level_start fsverity: remove debug messages and CONFIG_FS_VERITY_DEBUG fsverity: pass pos and size to ->write_merkle_tree_block fsverity: optimize fsverity_cleanup_inode() on non-verity files fsverity: optimize fsverity_prepare_setattr() on non-verity files fsverity: optimize fsverity_file_open() on non-verity files
author: Linus Torvalds <torvalds@linux-foundation.org> 2023-02-20 12:33:41 -0800
committer: Linus Torvalds <torvalds@linux-foundation.org> 2023-02-20 12:33:41 -0800
commit: 6639c3ce7fd217c22b26aa9f2a3cb69dc19221f8 (patch)
tree: 743eadc88bc0422c227484805f97d2b23b21fb3b /fs
parent: f18f9845f2f10d3d1fc63e4ad16ee52d2d9292fa (diff)
parent: 51e4e3153ebc32d3280d5d17418ae6f1a44f1ec1 (diff)
18 files changed, 569 insertions, 428 deletions
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index bf9eb693a6a7..c5ff16f9e9fa 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -783,30 +783,25 @@ again:
 /*
  * fsverity op that writes a Merkle tree block into the btree.
  *
- * @inode:          inode to write a Merkle tree block for
- * @buf:            Merkle tree data block to write
- * @index:          index of the block in the Merkle tree
- * @log_blocksize:  log base 2 of the Merkle tree block size
- *
- * Note that the block size could be different from the page size, so it is not
- * safe to assume that index is a page index.
+ * @inode:	inode to write a Merkle tree block for
+ * @buf:	Merkle tree block to write
+ * @pos:	the position of the block in the Merkle tree (in bytes)
+ * @size:	the Merkle tree block size (in bytes)
  *
  * Returns 0 on success or negative error code on failure
  */
 static int btrfs_write_merkle_tree_block(struct inode *inode, const void *buf,
-					u64 index, int log_blocksize)
+					 u64 pos, unsigned int size)
 {
-	u64 off = index << log_blocksize;
-	u64 len = 1ULL << log_blocksize;
 	loff_t merkle_pos = merkle_file_pos(inode);
 
 	if (merkle_pos < 0)
 		return merkle_pos;
-	if (merkle_pos > inode->i_sb->s_maxbytes - off - len)
+	if (merkle_pos > inode->i_sb->s_maxbytes - pos - size)
 		return -EFBIG;
 
 	return write_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY,
-			       off, buf, len);
+			       pos, buf, size);
 }
 
 const struct fsverity_operations btrfs_verityops = {
diff --git a/fs/buffer.c b/fs/buffer.c
index d9c6d1fbb6dd..623e77d6ef77 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -48,6 +48,7 @@
 #include <linux/sched/mm.h>
 #include <trace/events/block.h>
 #include <linux/fscrypt.h>
+#include <linux/fsverity.h>
 
 #include "internal.h"
 
@@ -295,20 +296,53 @@ still_busy:
 	return;
 }
 
-struct decrypt_bh_ctx {
+struct postprocess_bh_ctx {
 	struct work_struct work;
 	struct buffer_head *bh;
 };
 
+static void verify_bh(struct work_struct *work)
+{
+	struct postprocess_bh_ctx *ctx =
+		container_of(work, struct postprocess_bh_ctx, work);
+	struct buffer_head *bh = ctx->bh;
+	bool valid;
+
+	valid = fsverity_verify_blocks(page_folio(bh->b_page), bh->b_size,
+				       bh_offset(bh));
+	end_buffer_async_read(bh, valid);
+	kfree(ctx);
+}
+
+static bool need_fsverity(struct buffer_head *bh)
+{
+	struct page *page = bh->b_page;
+	struct inode *inode = page->mapping->host;
+
+	return fsverity_active(inode) &&
+		/* needed by ext4 */
+		page->index < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
+}
+
 static void decrypt_bh(struct work_struct *work)
 {
-	struct decrypt_bh_ctx *ctx =
-		container_of(work, struct decrypt_bh_ctx, work);
+	struct postprocess_bh_ctx *ctx =
+		container_of(work, struct postprocess_bh_ctx, work);
 	struct buffer_head *bh = ctx->bh;
 	int err;
 
-	err = fscrypt_decrypt_pagecache_blocks(bh->b_page, bh->b_size,
-					       bh_offset(bh));
+	err = fscrypt_decrypt_pagecache_blocks(page_folio(bh->b_page),
+					       bh->b_size, bh_offset(bh));
+	if (err == 0 && need_fsverity(bh)) {
+		/*
+		 * We use different work queues for decryption and for verity
+		 * because verity may require reading metadata pages that need
+		 * decryption, and we shouldn't recurse to the same workqueue.
+		 */
+		INIT_WORK(&ctx->work, verify_bh);
+		fsverity_enqueue_verify_work(&ctx->work);
+		return;
+	}
 	end_buffer_async_read(bh, err == 0);
 	kfree(ctx);
 }
@@ -319,15 +353,24 @@ static void decrypt_bh(struct work_struct *work)
  */
 static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
 {
-	/* Decrypt if needed */
-	if (uptodate &&
-	    fscrypt_inode_uses_fs_layer_crypto(bh->b_page->mapping->host)) {
-		struct decrypt_bh_ctx *ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC);
+	struct inode *inode = bh->b_page->mapping->host;
+	bool decrypt = fscrypt_inode_uses_fs_layer_crypto(inode);
+	bool verify = need_fsverity(bh);
+
+	/* Decrypt (with fscrypt) and/or verify (with fsverity) if needed. */
+	if (uptodate && (decrypt || verify)) {
+		struct postprocess_bh_ctx *ctx =
+			kmalloc(sizeof(*ctx), GFP_ATOMIC);
 
 		if (ctx) {
-			INIT_WORK(&ctx->work, decrypt_bh);
 			ctx->bh = bh;
-			fscrypt_enqueue_decrypt_work(&ctx->work);
+			if (decrypt) {
+				INIT_WORK(&ctx->work, decrypt_bh);
+				fscrypt_enqueue_decrypt_work(&ctx->work);
+			} else {
+				INIT_WORK(&ctx->work, verify_bh);
+				fsverity_enqueue_verify_work(&ctx->work);
+			}
 			return;
 		}
 		uptodate = 0;
@@ -2245,6 +2288,11 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
 	int nr, i;
 	int fully_mapped = 1;
 	bool page_error = false;
+	loff_t limit = i_size_read(inode);
+
+	/* This is needed for ext4. */
+	if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode))
+		limit = inode->i_sb->s_maxbytes;
 
 	VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
 
@@ -2253,7 +2301,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
 	bbits = block_size_bits(blocksize);
 
 	iblock = (sector_t)folio->index << (PAGE_SHIFT - bbits);
-	lblock = (i_size_read(inode)+blocksize-1) >> bbits;
+	lblock = (limit+blocksize-1) >> bbits;
 	bh = head;
 	nr = 0;
 	i = 0;
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 1b4403136d05..d57d0a020f71 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -30,13 +30,11 @@
  */
 bool fscrypt_decrypt_bio(struct bio *bio)
 {
-	struct bio_vec *bv;
-	struct bvec_iter_all iter_all;
+	struct folio_iter fi;
 
-	bio_for_each_segment_all(bv, bio, iter_all) {
-		struct page *page = bv->bv_page;
-		int err = fscrypt_decrypt_pagecache_blocks(page, bv->bv_len,
-							   bv->bv_offset);
+	bio_for_each_folio_all(fi, bio) {
+		int err = fscrypt_decrypt_pagecache_blocks(fi.folio, fi.length,
+							   fi.offset);
 
 		if (err) {
 			bio->bi_status = errno_to_blk_status(err);
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index e78be66bbf01..bf642479269a 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -237,41 +237,43 @@ EXPORT_SYMBOL(fscrypt_encrypt_block_inplace);
 
 /**
  * fscrypt_decrypt_pagecache_blocks() - Decrypt filesystem blocks in a
- *					pagecache page
- * @page:      The locked pagecache page containing the block(s) to decrypt
+ *					pagecache folio
+ * @folio:     The locked pagecache folio containing the block(s) to decrypt
  * @len:       Total size of the block(s) to decrypt.  Must be a nonzero
  *		multiple of the filesystem's block size.
- * @offs:      Byte offset within @page of the first block to decrypt.  Must be
+ * @offs:      Byte offset within @folio of the first block to decrypt.  Must be
  *		a multiple of the filesystem's block size.
  *
- * The specified block(s) are decrypted in-place within the pagecache page,
- * which must still be locked and not uptodate.  Normally, blocksize ==
- * PAGE_SIZE and the whole page is decrypted at once.
+ * The specified block(s) are decrypted in-place within the pagecache folio,
+ * which must still be locked and not uptodate.
  *
  * This is for use by the filesystem's ->readahead() method.
  *
  * Return: 0 on success; -errno on failure
  */
-int fscrypt_decrypt_pagecache_blocks(struct page *page, unsigned int len,
-				     unsigned int offs)
+int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len,
+				     size_t offs)
 {
-	const struct inode *inode = page->mapping->host;
+	const struct inode *inode = folio->mapping->host;
 	const unsigned int blockbits = inode->i_blkbits;
 	const unsigned int blocksize = 1 << blockbits;
-	u64 lblk_num = ((u64)page->index << (PAGE_SHIFT - blockbits)) +
+	u64 lblk_num = ((u64)folio->index << (PAGE_SHIFT - blockbits)) +
 		       (offs >> blockbits);
-	unsigned int i;
+	size_t i;
 	int err;
 
-	if (WARN_ON_ONCE(!PageLocked(page)))
+	if (WARN_ON_ONCE(!folio_test_locked(folio)))
 		return -EINVAL;
 
 	if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, blocksize)))
 		return -EINVAL;
 
 	for (i = offs; i < offs + len; i += blocksize, lblk_num++) {
+		struct page *page = folio_page(folio, i >> PAGE_SHIFT);
+
 		err = fscrypt_crypt_block(inode, FS_DECRYPT, lblk_num, page,
-					  page, blocksize, i, GFP_NOFS);
+					  page, blocksize, i & ~PAGE_MASK,
+					  GFP_NOFS);
 		if (err)
 			return err;
 	}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index be664dc9b991..b936ee3af51e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1136,7 +1136,8 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
 		for (i = 0; i < nr_wait; i++) {
 			int err2;
 
-			err2 = fscrypt_decrypt_pagecache_blocks(page, blocksize,
+			err2 = fscrypt_decrypt_pagecache_blocks(page_folio(page),
+								blocksize,
 								bh_offset(wait[i]));
 			if (err2) {
 				clear_buffer_uptodate(wait[i]);
@@ -3858,7 +3859,8 @@ static int __ext4_block_zero_page_range(handle_t *handle,
 		if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
 			/* We expect the key to be set. */
 			BUG_ON(!fscrypt_has_encryption_key(inode));
-			err = fscrypt_decrypt_pagecache_blocks(page, blocksize,
+			err = fscrypt_decrypt_pagecache_blocks(page_folio(page),
+							       blocksize,
 							       bh_offset(bh));
 			if (err) {
 				clear_buffer_uptodate(bh);
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index d5266932ce6c..c61dc8a7c014 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -211,8 +211,7 @@ static void ext4_set_bio_post_read_ctx(struct bio *bio,
 
 static inline loff_t ext4_readpage_limit(struct inode *inode)
 {
-	if (IS_ENABLED(CONFIG_FS_VERITY) &&
-	    (IS_VERITY(inode) || ext4_verity_in_progress(inode)))
+	if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode))
 		return inode->i_sb->s_maxbytes;
 
 	return i_size_read(inode);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 260bbab25db3..2ae46d11aa30 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5325,11 +5325,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 		}
 	}
 
-	if (ext4_has_feature_verity(sb) && sb->s_blocksize != PAGE_SIZE) {
-		ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity");
-		goto failed_mount_wq;
-	}
-
 	/*
 	 * Get the # of file system overhead blocks from the
 	 * superblock if present.
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index 30e3b65798b5..e4da1704438e 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -381,11 +381,11 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode,
 }
 
 static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf,
-					u64 index, int log_blocksize)
+					u64 pos, unsigned int size)
 {
-	loff_t pos = ext4_verity_metadata_pos(inode) + (index << log_blocksize);
+	pos += ext4_verity_metadata_pos(inode);
 
-	return pagecache_write(inode, buf, 1 << log_blocksize, pos);
+	return pagecache_write(inode, buf, size, pos);
 }
 
 const struct fsverity_operations ext4_verityops = {
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 97e816590cd9..8630df80fedb 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -2053,8 +2053,7 @@ out:
 
 static inline loff_t f2fs_readpage_limit(struct inode *inode)
 {
-	if (IS_ENABLED(CONFIG_FS_VERITY) &&
-	    (IS_VERITY(inode) || f2fs_verity_in_progress(inode)))
+	if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode))
 		return inode->i_sb->s_maxbytes;
 
 	return i_size_read(inode);
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index c352fff88a5e..f320ed8172ec 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -276,11 +276,11 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode,
 }
 
 static int f2fs_write_merkle_tree_block(struct inode *inode, const void *buf,
-					u64 index, int log_blocksize)
+					u64 pos, unsigned int size)
 {
-	loff_t pos = f2fs_verity_metadata_pos(inode) + (index << log_blocksize);
+	pos += f2fs_verity_metadata_pos(inode);
 
-	return pagecache_write(inode, buf, 1 << log_blocksize, pos);
+	return pagecache_write(inode, buf, size, pos);
 }
 
 const struct fsverity_operations f2fs_verityops = {
diff --git a/fs/verity/Kconfig b/fs/verity/Kconfig
index aad1f1d998b9..a7ffd718f171 100644
--- a/fs/verity/Kconfig
+++ b/fs/verity/Kconfig
@@ -34,14 +34,6 @@ config FS_VERITY
 
 	  If unsure, say N.
 
-config FS_VERITY_DEBUG
-	bool "FS Verity debugging"
-	depends on FS_VERITY
-	help
-	  Enable debugging messages related to fs-verity by default.
-
-	  Say N unless you are an fs-verity developer.
-
 config FS_VERITY_BUILTIN_SIGNATURES
 	bool "FS Verity builtin signature support"
 	depends on FS_VERITY
diff --git a/fs/verity/enable.c b/fs/verity/enable.c
index df6b499bf6a1..e13db6507b38 100644
--- a/fs/verity/enable.c
+++ b/fs/verity/enable.c
@@ -7,136 +7,50 @@
 
 #include "fsverity_private.h"
 
-#include <crypto/hash.h>
-#include <linux/backing-dev.h>
 #include <linux/mount.h>
 #include <linux/pagemap.h>
 #include <linux/sched/signal.h>
 #include <linux/uaccess.h>
 
-/*
- * Read a file data page for Merkle tree construction.  Do aggressive readahead,
- * since we're sequentially reading the entire file.
- */
-static struct page *read_file_data_page(struct file *file, pgoff_t index,
-					struct file_ra_state *ra,
-					unsigned long remaining_pages)
-{
-	DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, index);
-	struct folio *folio;
-
-	folio = __filemap_get_folio(ractl.mapping, index, FGP_ACCESSED, 0);
-	if (!folio || !folio_test_uptodate(folio)) {
-		if (folio)
-			folio_put(folio);
-		else
-			page_cache_sync_ra(&ractl, remaining_pages);
-		folio = read_cache_folio(ractl.mapping, index, NULL, file);
-		if (IS_ERR(folio))
-			return &folio->page;
-	}
-	if (folio_test_readahead(folio))
-		page_cache_async_ra(&ractl, folio, remaining_pages);
-	return folio_file_page(folio, index);
-}
+struct block_buffer {
+	u32 filled;
+	u8 *data;
+};
 
-static int build_merkle_tree_level(struct file *filp, unsigned int level,
-				   u64 num_blocks_to_hash,
-				   const struct merkle_tree_params *params,
-				   u8 *pending_hashes,
-				   struct ahash_request *req)
+/* Hash a block, writing the result to the next level's pending block buffer. */
+static int hash_one_block(struct inode *inode,
+			  const struct merkle_tree_params *params,
+			  struct ahash_request *req, struct block_buffer *cur)
 {
-	struct inode *inode = file_inode(filp);
-	const struct fsverity_operations *vops = inode->i_sb->s_vop;
-	struct file_ra_state ra = { 0 };
-	unsigned int pending_size = 0;
-	u64 dst_block_num;
-	u64 i;
+	struct block_buffer *next = cur + 1;
 	int err;
 
-	if (WARN_ON(params->block_size != PAGE_SIZE)) /* checked earlier too */
-		return -EINVAL;
-
-	if (level < params->num_levels) {
-		dst_block_num = params->level_start[level];
-	} else {
-		if (WARN_ON(num_blocks_to_hash != 1))
-			return -EINVAL;
-		dst_block_num = 0; /* unused */
-	}
+	/* Zero-pad the block if it's shorter than the block size. */
+	memset(&cur->data[cur->filled], 0, params->block_size - cur->filled);
 
-	file_ra_state_init(&ra, filp->f_mapping);
-
-	for (i = 0; i < num_blocks_to_hash; i++) {
-		struct page *src_page;
-
-		if ((pgoff_t)i % 10000 == 0 || i + 1 == num_blocks_to_hash)
-			pr_debug("Hashing block %llu of %llu for level %u\n",
-				 i + 1, num_blocks_to_hash, level);
-
-		if (level == 0) {
-			/* Leaf: hashing a data block */
-			src_page = read_file_data_page(filp, i, &ra,
-						       num_blocks_to_hash - i);
-			if (IS_ERR(src_page)) {
-				err = PTR_ERR(src_page);
-				fsverity_err(inode,
-					     "Error %d reading data page %llu",
-					     err, i);
-				return err;
-			}
-		} else {
-			unsigned long num_ra_pages =
-				min_t(unsigned long, num_blocks_to_hash - i,
-				      inode->i_sb->s_bdi->io_pages);
-
-			/* Non-leaf: hashing hash block from level below */
-			src_page = vops->read_merkle_tree_page(inode,
-					params->level_start[level - 1] + i,
-					num_ra_pages);
-			if (IS_ERR(src_page)) {
-				err = PTR_ERR(src_page);
-				fsverity_err(inode,
-					     "Error %d reading Merkle tree page %llu",
-					     err, params->level_start[level - 1] + i);
-				return err;
-			}
-		}
+	err = fsverity_hash_block(params, inode, req, virt_to_page(cur->data),
+				  offset_in_page(cur->data),
+				  &next->data[next->filled]);
+	if (err)
+		return err;
+	next->filled += params->digest_size;
+	cur->filled = 0;
+	return 0;
+}
 
-		err = fsverity_hash_page(params, inode, req, src_page,
-					 &pending_hashes[pending_size]);
-		put_page(src_page);
-		if (err)
-			return err;
-		pending_size += params->digest_size;
-
-		if (level == params->num_levels) /* Root hash? */
-			return 0;
-
-		if (pending_size + params->digest_size > params->block_size ||
-		    i + 1 == num_blocks_to_hash) {
-			/* Flush the pending hash block */
-			memset(&pending_hashes[pending_size], 0,
-			       params->block_size - pending_size);
-			err = vops->write_merkle_tree_block(inode,
-					pending_hashes,
-					dst_block_num,
-					params->log_blocksize);
-			if (err) {
-				fsverity_err(inode,
-					     "Error %d writing Merkle tree block %llu",
-					     err, dst_block_num);
-				return err;
-			}
-			dst_block_num++;
-			pending_size = 0;
-		}
+static int write_merkle_tree_block(struct inode *inode, const u8 *buf,
+				   unsigned long index,
+				   const struct merkle_tree_params *params)
+{
+	u64 pos = (u64)index << params->log_blocksize;
+	int err;
 
-		if (fatal_signal_pending(current))
-			return -EINTR;
-		cond_resched();
-	}
-	return 0;
+	err = inode->i_sb->s_vop->write_merkle_tree_block(inode, buf, pos,
+							  params->block_size);
+	if (err)
+		fsverity_err(inode, "Error %d writing Merkle tree block %lu",
+			     err, index);
+	return err;
 }
 
 /*
@@ -152,13 +66,17 @@ static int build_merkle_tree(struct file *filp,
 			     u8 *root_hash)
 {
 	struct inode *inode = file_inode(filp);
-	u8 *pending_hashes;
+	const u64 data_size = inode->i_size;
+	const int num_levels = params->num_levels;
 	struct ahash_request *req;
-	u64 blocks;
-	unsigned int level;
-	int err = -ENOMEM;
+	struct block_buffer _buffers[1 + FS_VERITY_MAX_LEVELS + 1] = {};
+	struct block_buffer *buffers = &_buffers[1];
+	unsigned long level_offset[FS_VERITY_MAX_LEVELS];
+	int level;
+	u64 offset;
+	int err;
 
-	if (inode->i_size == 0) {
+	if (data_size == 0) {
 		/* Empty file is a special case; root hash is all 0's */
 		memset(root_hash, 0, params->digest_size);
 		return 0;
@@ -167,29 +85,95 @@ static int build_merkle_tree(struct file *filp,
 	/* This allocation never fails, since it's mempool-backed. */
 	req = fsverity_alloc_hash_request(params->hash_alg, GFP_KERNEL);
 
-	pending_hashes = kmalloc(params->block_size, GFP_KERNEL);
-	if (!pending_hashes)
-		goto out;
-
 	/*
-	 * Build each level of the Merkle tree, starting at the leaf level
-	 * (level 0) and ascending to the root node (level 'num_levels - 1').
-	 * Then at the end (level 'num_levels'), calculate the root hash.
+	 * Allocate the block buffers.  Buffer "-1" is for data blocks.
+	 * Buffers 0 <= level < num_levels are for the actual tree levels.
+	 * Buffer 'num_levels' is for the root hash.
 	 */
-	blocks = ((u64)inode->i_size + params->block_size - 1) >>
-		 params->log_blocksize;
-	for (level = 0; level <= params->num_levels; level++) {
-		err = build_merkle_tree_level(filp, level, blocks, params,
-					      pending_hashes, req);
+	for (level = -1; level < num_levels; level++) {
+		buffers[level].data = kzalloc(params->block_size, GFP_KERNEL);
+		if (!buffers[level].data) {
+			err = -ENOMEM;
+			goto out;
+		}
+	}
+	buffers[num_levels].data = root_hash;
+
+	BUILD_BUG_ON(sizeof(level_offset) != sizeof(params->level_start));
+	memcpy(level_offset, params->level_start, sizeof(level_offset));
+
+	/* Hash each data block, also hashing the tree blocks as they fill up */
+	for (offset = 0; offset < data_size; offset += params->block_size) {
+		ssize_t bytes_read;
+		loff_t pos = offset;
+
+		buffers[-1].filled = min_t(u64, params->block_size,
+					   data_size - offset);
+		bytes_read = __kernel_read(filp, buffers[-1].data,
+					   buffers[-1].filled, &pos);
+		if (bytes_read < 0) {
+			err = bytes_read;
+			fsverity_err(inode, "Error %d reading file data", err);
+			goto out;
+		}
+		if (bytes_read != buffers[-1].filled) {
+			err = -EINVAL;
+			fsverity_err(inode, "Short read of file data");
+			goto out;
+		}
+		err = hash_one_block(inode, params, req, &buffers[-1]);
 		if (err)
 			goto out;
-		blocks = (blocks + params->hashes_per_block - 1) >>
-			 params->log_arity;
+		for (level = 0; level < num_levels; level++) {
+			if (buffers[level].filled + params->digest_size <=
+			    params->block_size) {
+				/* Next block at @level isn't full yet */
+				break;
+			}
+			/* Next block at @level is full */
+
+			err = hash_one_block(inode, params, req,
+					     &buffers[level]);
+			if (err)
+				goto out;
+			err = write_merkle_tree_block(inode,
+						      buffers[level].data,
+						      level_offset[level],
+						      params);
+			if (err)
+				goto out;
+			level_offset[level]++;
+		}
+		if (fatal_signal_pending(current)) {
+			err = -EINTR;
+			goto out;
+		}
+		cond_resched();
+	}
+	/* Finish all nonempty pending tree blocks. */
+	for (level = 0; level < num_levels; level++) {
+		if (buffers[level].filled != 0) {
+			err = hash_one_block(inode, params, req,
+					     &buffers[level]);
+			if (err)
+				goto out;
+			err = write_merkle_tree_block(inode,
+						      buffers[level].data,
+						      level_offset[level],
+						      params);
+			if (err)
+				goto out;
+		}
+	}
+	/* The root hash was filled by the last call to hash_one_block(). */
+	if (WARN_ON(buffers[num_levels].filled != params->digest_size)) {
+		err = -EINVAL;
+		goto out;
 	}
-	memcpy(root_hash, pending_hashes, params->digest_size);
 	err = 0;
 out:
-	kfree(pending_hashes);
+	for (level = -1; level < num_levels; level++)
+		kfree(buffers[level].data);
 	fsverity_free_hash_request(params->hash_alg, req);
 	return err;
 }
@@ -263,15 +247,12 @@ static int enable_verity(struct file *filp,
 	 * ->begin_enable_verity() and ->end_enable_verity() using the inode
 	 * lock and only allow one process to be here at a time on a given file.
 	 */
-	pr_debug("Building Merkle tree...\n");
 	BUILD_BUG_ON(sizeof(desc->root_hash) < FS_VERITY_MAX_DIGEST_SIZE);
 	err = build_merkle_tree(filp, &params, desc->root_hash);
 	if (err) {
 		fsverity_err(inode, "Error %d building Merkle tree", err);
 		goto rollback;
 	}
-	pr_debug("Done building Merkle tree.  Root hash is %s:%*phN\n",
-		 params.hash_alg->name, params.digest_size, desc->root_hash);
 
 	/*
 	 * Create the fsverity_info.  Don't bother trying to save work by
@@ -286,10 +267,6 @@ static int enable_verity(struct file *filp,
 		goto rollback;
 	}
 
-	if (arg->sig_size)
-		pr_debug("Storing a %u-byte PKCS#7 signature alongside the file\n",
-			 arg->sig_size);
-
 	/*
 	 * Tell the filesystem to finish enabling verity on the file.
 	 * Serialized with ->begin_enable_verity() by the inode lock.
@@ -352,7 +329,7 @@ int fsverity_ioctl_enable(struct file *filp, const void __user *uarg)
 	    memchr_inv(arg.__reserved2, 0, sizeof(arg.__reserved2)))
 		return -EINVAL;
 
-	if (arg.block_size != PAGE_SIZE)
+	if (!is_power_of_2(arg.block_size))
 		return -EINVAL;
 
 	if (arg.salt_size > sizeof_field(struct fsverity_descriptor, salt))
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index c7fcb855e068..d34dcc033d72 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -8,10 +8,6 @@
 #ifndef _FSVERITY_PRIVATE_H
 #define _FSVERITY_PRIVATE_H
 
-#ifdef CONFIG_FS_VERITY_DEBUG
-#define DEBUG
-#endif
-
 #define pr_fmt(fmt) "fs-verity: " fmt
 
 #include <linux/fsverity.h>
@@ -46,17 +42,20 @@ struct merkle_tree_params {
 	unsigned int digest_size;	/* same as hash_alg->digest_size */
 	unsigned int block_size;	/* size of data and tree blocks */
 	unsigned int hashes_per_block;	/* number of hashes per tree block */
-	unsigned int log_blocksize;	/* log2(block_size) */
-	unsigned int log_arity;		/* log2(hashes_per_block) */
+	unsigned int blocks_per_page;	/* PAGE_SIZE / block_size */
+	u8 log_digestsize;		/* log2(digest_size) */
+	u8 log_blocksize;		/* log2(block_size) */
+	u8 log_arity;			/* log2(hashes_per_block) */
+	u8 log_blocks_per_page;		/* log2(blocks_per_page) */
 	unsigned int num_levels;	/* number of levels in Merkle tree */
 	u64 tree_size;			/* Merkle tree size in bytes */
-	unsigned long level0_blocks;	/* number of blocks in tree level 0 */
+	unsigned long tree_pages;	/* Merkle tree size in pages */
 
 	/*
 	 * Starting block index for each tree level, ordered from leaf level (0)
 	 * to root level ('num_levels - 1')
 	 */
-	u64 level_start[FS_VERITY_MAX_LEVELS];
+	unsigned long level_start[FS_VERITY_MAX_LEVELS];
 };
 
 /*
@@ -73,9 +72,10 @@ struct fsverity_info {
 	u8 root_hash[FS_VERITY_MAX_DIGEST_SIZE];
 	u8 file_digest[FS_VERITY_MAX_DIGEST_SIZE];
 	const struct inode *inode;
+	unsigned long *hash_block_verified;
+	spinlock_t hash_page_init_lock;
 };
 
-
 #define FS_VERITY_MAX_SIGNATURE_SIZE	(FS_VERITY_MAX_DESCRIPTOR_SIZE - \
 					 sizeof(struct fsverity_descriptor))
 
@@ -91,9 +91,9 @@ void fsverity_free_hash_request(struct fsverity_hash_alg *alg,
 				struct ahash_request *req);
 const u8 *fsverity_prepare_hash_state(struct fsverity_hash_alg *alg,
 				      const u8 *salt, size_t salt_size);
-int fsverity_hash_page(const struct merkle_tree_params *params,
-		       const struct inode *inode,
-		       struct ahash_request *req, struct page *page, u8 *out);
+int fsverity_hash_block(const struct merkle_tree_params *params,
+			const struct inode *inode, struct ahash_request *req,
+			struct page *page, unsigned int offset, u8 *out);
 int fsverity_hash_buffer(struct fsverity_hash_alg *alg,
 			 const void *data, size_t size, u8 *out);
 void __init fsverity_check_hash_algs(void);
diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c
index 6f8170cf4ae7..13fcf31be844 100644
--- a/fs/verity/hash_algs.c
+++ b/fs/verity/hash_algs.c
@@ -220,35 +220,33 @@ err_free:
 }
 
 /**
- * fsverity_hash_page() - hash a single data or hash page
+ * fsverity_hash_block() - hash a single data or hash block
  * @params: the Merkle tree's parameters
  * @inode: inode for which the hashing is being done
  * @req: preallocated hash request
- * @page: the page to hash
+ * @page: the page containing the block to hash
+ * @offset: the offset of the block within @page
  * @out: output digest, size 'params->digest_size' bytes
  *
- * Hash a single data or hash block, assuming block_size == PAGE_SIZE.
- * The hash is salted if a salt is specified in the Merkle tree parameters.
+ * Hash a single data or hash block.  The hash is salted if a salt is specified
+ * in the Merkle tree parameters.
  *
  * Return: 0 on success, -errno on failure
  */
-int fsverity_hash_page(const struct merkle_tree_params *params,
-		       const struct inode *inode,
-		       struct ahash_request *req, struct page *page, u8 *out)
+int fsverity_hash_block(const struct merkle_tree_params *params,
+			const struct inode *inode, struct ahash_request *req,
+			struct page *page, unsigned int offset, u8 *out)
 {
 	struct scatterlist sg;
 	DECLARE_CRYPTO_WAIT(wait);
 	int err;
 
-	if (WARN_ON(params->block_size != PAGE_SIZE))
-		return -EINVAL;
-
 	sg_init_table(&sg, 1);
-	sg_set_page(&sg, page, PAGE_SIZE, 0);
+	sg_set_page(&sg, page, params->block_size, offset);
 	ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
 					CRYPTO_TFM_REQ_MAY_BACKLOG,
 				   crypto_req_done, &wait);
-	ahash_request_set_crypt(req, &sg, out, PAGE_SIZE);
+	ahash_request_set_crypt(req, &sg, out, params->block_size);
 
 	if (params->hashstate) {
 		err = crypto_ahash_import(req, params->hashstate);
@@ -264,7 +262,7 @@ int fsverity_hash_page(const struct merkle_tree_params *params,
 
 	err = crypto_wait_req(err, &wait);
 	if (err)
-		fsverity_err(inode, "Error %d computing page hash", err);
+		fsverity_err(inode, "Error %d computing block hash", err);
 	return err;
 }
 
diff --git a/fs/verity/init.c b/fs/verity/init.c
index c98b7016f446..023905151035 100644
--- a/fs/verity/init.c
+++ b/fs/verity/init.c
@@ -49,7 +49,6 @@ static int __init fsverity_init(void)
 	if (err)
 		goto err_exit_workqueue;
 
-	pr_debug("Initialized fs-verity\n");
 	return 0;
 
 err_exit_workqueue:
diff --git a/fs/verity/open.c b/fs/verity/open.c
index 81ff94442f7b..9366b441d01c 100644
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@@ -7,6 +7,7 @@
 
 #include "fsverity_private.h"
 
+#include <linux/mm.h>
 #include <linux/slab.h>
 
 static struct kmem_cache *fsverity_info_cachep;
@@ -34,6 +35,7 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
 	struct fsverity_hash_alg *hash_alg;
 	int err;
 	u64 blocks;
+	u64 blocks_in_level[FS_VERITY_MAX_LEVELS];
 	u64 offset;
 	int level;
 
@@ -54,7 +56,23 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
 		goto out_err;
 	}
 
-	if (log_blocksize != PAGE_SHIFT) {
+	/*
+	 * fs/verity/ directly assumes that the Merkle tree block size is a
+	 * power of 2 less than or equal to PAGE_SIZE.  Another restriction
+	 * arises from the interaction between fs/verity/ and the filesystems
+	 * themselves: filesystems expect to be able to verify a single
+	 * filesystem block of data at a time.  Therefore, the Merkle tree block
+	 * size must also be less than or equal to the filesystem block size.
+	 *
+	 * The above are the only hard limitations, so in theory the Merkle tree
+	 * block size could be as small as twice the digest size.  However,
+	 * that's not useful, and it would result in some unusually deep and
+	 * large Merkle trees.  So we currently require that the Merkle tree
+	 * block size be at least 1024 bytes.  That's small enough to test the
+	 * sub-page block case on systems with 4K pages, but not too small.
+	 */
+	if (log_blocksize < 10 || log_blocksize > PAGE_SHIFT ||
+	    log_blocksize > inode->i_blkbits) {
 		fsverity_warn(inode, "Unsupported log_blocksize: %u",
 			      log_blocksize);
 		err = -EINVAL;
@@ -62,6 +80,8 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
 	}
 	params->log_blocksize = log_blocksize;
 	params->block_size = 1 << log_blocksize;
+	params->log_blocks_per_page = PAGE_SHIFT - log_blocksize;
+	params->blocks_per_page = 1 << params->log_blocks_per_page;
 
 	if (WARN_ON(!is_power_of_2(params->digest_size))) {
 		err = -EINVAL;
@@ -74,13 +94,10 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
 		err = -EINVAL;
 		goto out_err;
 	}
-	params->log_arity = params->log_blocksize - ilog2(params->digest_size);
+	params->log_digestsize = ilog2(params->digest_size);
+	params->log_arity = log_blocksize - params->log_digestsize;
 	params->hashes_per_block = 1 << params->log_arity;
 
-	pr_debug("Merkle tree uses %s with %u-byte blocks (%u hashes/block), salt=%*phN\n",
-		 hash_alg->name, params->block_size, params->hashes_per_block,
-		 (int)salt_size, salt);
-
 	/*
 	 * Compute the number of levels in the Merkle tree and create a map from
 	 * level to the starting block of that level.  Level 'num_levels - 1' is
@@ -90,31 +107,45 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
 
 	/* Compute number of levels and the number of blocks in each level */
 	blocks = ((u64)inode->i_size + params->block_size - 1) >> log_blocksize;
-	pr_debug("Data is %lld bytes (%llu blocks)\n", inode->i_size, blocks);
 	while (blocks > 1) {
 		if (params->num_levels >= FS_VERITY_MAX_LEVELS) {
 			fsverity_err(inode, "Too many levels in Merkle tree");
-			err = -EINVAL;
+			err = -EFBIG;
 			goto out_err;
 		}
 		blocks = (blocks + params->hashes_per_block - 1) >>
 			 params->log_arity;
-		/* temporarily using level_start[] to store blocks in level */
-		params->level_start[params->num_levels++] = blocks;
+		blocks_in_level[params->num_levels++] = blocks;
 	}
-	params->level0_blocks = params->level_start[0];
 
 	/* Compute the starting block of each level */
 	offset = 0;
 	for (level = (int)params->num_levels - 1; level >= 0; level--) {
-		blocks = params->level_start[level];
 		params->level_start[level] = offset;
-		pr_debug("Level %d is %llu blocks starting at index %llu\n",
-			 level, blocks, offset);
-		offset += blocks;
+		offset += blocks_in_level[level];
+	}
+
+	/*
+	 * With block_size != PAGE_SIZE, an in-memory bitmap will need to be
+	 * allocated to track the "verified" status of hash blocks.  Don't allow
+	 * this bitmap to get too large.  For now, limit it to 1 MiB, which
+	 * limits the file size to about 4.4 TB with SHA-256 and 4K blocks.
+	 *
+	 * Together with the fact that the data, and thus also the Merkle tree,
+	 * cannot have more than ULONG_MAX pages, this implies that hash block
+	 * indices can always fit in an 'unsigned long'.  But to be safe, we
+	 * explicitly check for that too.  Note, this is only for hash block
+	 * indices; data block indices might not fit in an 'unsigned long'.
+	 */
+	if ((params->block_size != PAGE_SIZE && offset > 1 << 23) ||
+	    offset > ULONG_MAX) {
+		fsverity_err(inode, "Too many blocks in Merkle tree");
+		err = -EFBIG;
+		goto out_err;
 	}
 
 	params->tree_size = offset << log_blocksize;
+	params->tree_pages = PAGE_ALIGN(params->tree_size) >> PAGE_SHIFT;
 	return 0;
 
 out_err:
@@ -165,7 +196,7 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode,
 		fsverity_err(inode,
 			     "Error %d initializing Merkle tree parameters",
 			     err);
-		goto out;
+		goto fail;
 	}
 
 	memcpy(vi->root_hash, desc->root_hash, vi->tree_params.digest_size);
@@ -174,20 +205,48 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode,
 				  vi->file_digest);
 	if (err) {
 		fsverity_err(inode, "Error %d computing file digest", err);
-		goto out;
+		goto fail;
 	}
-	pr_debug("Computed file digest: %s:%*phN\n",
-		 vi->tree_params.hash_alg->name,
-		 vi->tree_params.digest_size, vi->file_digest);
 
 	err = fsverity_verify_signature(vi, desc->signature,
 					le32_to_cpu(desc->sig_size));
-out:
-	if (err) {
-		fsverity_free_info(vi);
-		vi = ERR_PTR(err);
+	if (err)
+		goto fail;
+
+	if (vi->tree_params.block_size != PAGE_SIZE) {
+		/*
+		 * When the Merkle tree block size and page size differ, we use
+		 * a bitmap to keep track of which hash blocks have been
+		 * verified.  This bitmap must contain one bit per hash block,
+		 * including alignment to a page boundary at the end.
+		 *
+		 * Eventually, to support extremely large files in an efficient
+		 * way, it might be necessary to make pages of this bitmap
+		 * reclaimable.  But for now, simply allocating the whole bitmap
+		 * is a simple solution that works well on the files on which
+		 * fsverity is realistically used.  E.g., with SHA-256 and 4K
+		 * blocks, a 100MB file only needs a 24-byte bitmap, and the
+		 * bitmap for any file under 17GB fits in a 4K page.
+		 */
+		unsigned long num_bits =
+			vi->tree_params.tree_pages <<
+			vi->tree_params.log_blocks_per_page;
+
+		vi->hash_block_verified = kvcalloc(BITS_TO_LONGS(num_bits),
+						   sizeof(unsigned long),
+						   GFP_KERNEL);
+		if (!vi->hash_block_verified) {
+			err = -ENOMEM;
+			goto fail;
+		}
+		spin_lock_init(&vi->hash_page_init_lock);
 	}
+
 	return vi;
+
+fail:
+	fsverity_free_info(vi);
+	return ERR_PTR(err);
 }
 
 void fsverity_set_info(struct inode *inode, struct fsverity_info *vi)
@@ -214,6 +273,7 @@ void fsverity_free_info(struct fsverity_info *vi)
 	if (!vi)
 		return;
 	kfree(vi->tree_params.hashstate);
+	kvfree(vi->hash_block_verified);
 	kmem_cache_free(fsverity_info_cachep, vi);
 }
 
@@ -325,67 +385,28 @@ out_free_desc:
 	return err;
 }
 
-/**
- * fsverity_file_open() - prepare to open a verity file
- * @inode: the inode being opened
- * @filp: the struct file being set up
- *
- * When opening a verity file, deny the open if it is for writing.  Otherwise,
- * set up the inode's ->i_verity_info if not already done.
- *
- * When combined with fscrypt, this must be called after fscrypt_file_open().
- * Otherwise, we won't have the key set up to decrypt the verity metadata.
- *
- * Return: 0 on success, -errno on failure
- */
-int fsverity_file_open(struct inode *inode, struct file *filp)
+int __fsverity_file_open(struct inode *inode, struct file *filp)
 {
-	if (!IS_VERITY(inode))
-		return 0;
-
-	if (filp->f_mode & FMODE_WRITE) {
-		pr_debug("Denying opening verity file (ino %lu) for write\n",
-			 inode->i_ino);
+	if (filp->f_mode & FMODE_WRITE)
 		return -EPERM;
-	}
-
 	return ensure_verity_info(inode);
 }
-EXPORT_SYMBOL_GPL(fsverity_file_open);
+EXPORT_SYMBOL_GPL(__fsverity_file_open);
 
-/**
- * fsverity_prepare_setattr() - prepare to change a verity inode's attributes
- * @dentry: dentry through which the inode is being changed
- * @attr: attributes to change
- *
- * Verity files are immutable, so deny truncates.  This isn't covered by the
- * open-time check because sys_truncate() takes a path, not a file descriptor.
- *
- * Return: 0 on success, -errno on failure
- */
-int fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr)
+int __fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr)
 {
-	if (IS_VERITY(d_inode(dentry)) && (attr->ia_valid & ATTR_SIZE)) {
-		pr_debug("Denying truncate of verity file (ino %lu)\n",
-			 d_inode(dentry)->i_ino);
+	if (attr->ia_valid & ATTR_SIZE)
 		return -EPERM;
-	}
 	return 0;
 }
-EXPORT_SYMBOL_GPL(fsverity_prepare_setattr);
+EXPORT_SYMBOL_GPL(__fsverity_prepare_setattr);
 
-/**
- * fsverity_cleanup_inode() - free the inode's verity info, if present
- * @inode: an inode being evicted
- *
- * Filesystems must call this on inode eviction to free ->i_verity_info.
- */
-void fsverity_cleanup_inode(struct inode *inode)
+void __fsverity_cleanup_inode(struct inode *inode)
 {
 	fsverity_free_info(inode->i_verity_info);
 	inode->i_verity_info = NULL;
 }
-EXPORT_SYMBOL_GPL(fsverity_cleanup_inode);
+EXPORT_SYMBOL_GPL(__fsverity_cleanup_inode);
 
 int __init fsverity_init_info_cache(void)
 {
diff --git a/fs/verity/signature.c b/fs/verity/signature.c
index 143a530a8008..e7d3ca919a1e 100644
--- a/fs/verity/signature.c
+++ b/fs/verity/signature.c
@@ -82,8 +82,6 @@ int fsverity_verify_signature(const struct fsverity_info *vi,
 		return err;
 	}
 
-	pr_debug("Valid signature for file digest %s:%*phN\n",
-		 hash_alg->name, hash_alg->digest_size, vi->file_digest);
 	return 0;
 }
 
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index 961ba248021f..f50e3b5b52c9 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -9,39 +9,12 @@
 
 #include <crypto/hash.h>
 #include <linux/bio.h>
-#include <linux/ratelimit.h>
 
 static struct workqueue_struct *fsverity_read_workqueue;
 
-/**
- * hash_at_level() - compute the location of the block's hash at the given level
- *
- * @params:	(in) the Merkle tree parameters
- * @dindex:	(in) the index of the data block being verified
- * @level:	(in) the level of hash we want (0 is leaf level)
- * @hindex:	(out) the index of the hash block containing the wanted hash
- * @hoffset:	(out) the byte offset to the wanted hash within the hash block
- */
-static void hash_at_level(const struct merkle_tree_params *params,
-			  pgoff_t dindex, unsigned int level, pgoff_t *hindex,
-			  unsigned int *hoffset)
-{
-	pgoff_t position;
-
-	/* Offset of the hash within the level's region, in hashes */
-	position = dindex >> (level * params->log_arity);
-
-	/* Index of the hash block in the tree overall */
-	*hindex = params->level_start[level] + (position >> params->log_arity);
-
-	/* Offset of the wanted hash (in bytes) within the hash block */
-	*hoffset = (position & ((1 << params->log_arity) - 1)) <<
-		   (params->log_blocksize - params->log_arity);
-}
-
 static inline int cmp_hashes(const struct fsverity_info *vi,
 			     const u8 *want_hash, const u8 *real_hash,
-			     pgoff_t index, int level)
+			     u64 data_pos, int level)
 {
 	const unsigned int hsize = vi->tree_params.digest_size;
 
@@ -49,159 +22,312 @@ static inline int cmp_hashes(const struct fsverity_info *vi,
 		return 0;
 
 	fsverity_err(vi->inode,
-		     "FILE CORRUPTED! index=%lu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN",
-		     index, level,
+		     "FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN",
+		     data_pos, level,
 		     vi->tree_params.hash_alg->name, hsize, want_hash,
 		     vi->tree_params.hash_alg->name, hsize, real_hash);
 	return -EBADMSG;
 }
 
+static bool data_is_zeroed(struct inode *inode, struct page *page,
+			   unsigned int len, unsigned int offset)
+{
+	void *virt = kmap_local_page(page);
+
+	if (memchr_inv(virt + offset, 0, len)) {
+		kunmap_local(virt);
+		fsverity_err(inode,
+			     "FILE CORRUPTED!  Data past EOF is not zeroed");
+		return false;
+	}
+	kunmap_local(virt);
+	return true;
+}
+
+/*
+ * Returns true if the hash block with index @hblock_idx in the tree, located in
+ * @hpage, has already been verified.
+ */
+static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage,
+				   unsigned long hblock_idx)
+{
+	bool verified;
+	unsigned int blocks_per_page;
+	unsigned int i;
+
+	/*
+	 * When the Merkle tree block size and page size are the same, then the
+	 * ->hash_block_verified bitmap isn't allocated, and we use PG_checked
+	 * to directly indicate whether the page's block has been verified.
+	 *
+	 * Using PG_checked also guarantees that we re-verify hash pages that
+	 * get evicted and re-instantiated from the backing storage, as new
+	 * pages always start out with PG_checked cleared.
+	 */
+	if (!vi->hash_block_verified)
+		return PageChecked(hpage);
+
+	/*
+	 * When the Merkle tree block size and page size differ, we use a bitmap
+	 * to indicate whether each hash block has been verified.
+	 *
+	 * However, we still need to ensure that hash pages that get evicted and
+	 * re-instantiated from the backing storage are re-verified.  To do
+	 * this, we use PG_checked again, but now it doesn't really mean
+	 * "checked".  Instead, now it just serves as an indicator for whether
+	 * the hash page is newly instantiated or not.
+	 *
+	 * The first thread that sees PG_checked=0 must clear the corresponding
+	 * bitmap bits, then set PG_checked=1.  This requires a spinlock.  To
+	 * avoid having to take this spinlock in the common case of
+	 * PG_checked=1, we start with an opportunistic lockless read.
+	 */
+	if (PageChecked(hpage)) {
+		/*
+		 * A read memory barrier is needed here to give ACQUIRE
+		 * semantics to the above PageChecked() test.
+		 */
+		smp_rmb();
+		return test_bit(hblock_idx, vi->hash_block_verified);
+	}
+	spin_lock(&vi->hash_page_init_lock);
+	if (PageChecked(hpage)) {
+		verified = test_bit(hblock_idx, vi->hash_block_verified);
+	} else {
+		blocks_per_page = vi->tree_params.blocks_per_page;
+		hblock_idx = round_down(hblock_idx, blocks_per_page);
+		for (i = 0; i < blocks_per_page; i++)
+			clear_bit(hblock_idx + i, vi->hash_block_verified);
+		/*
+		 * A write memory barrier is needed here to give RELEASE
+		 * semantics to the below SetPageChecked() operation.
+		 */
+		smp_wmb();
+		SetPageChecked(hpage);
+		verified = false;
+	}
+	spin_unlock(&vi->hash_page_init_lock);
+	return verified;
+}
+
 /*
- * Verify a single data page against the file's Merkle tree.
+ * Verify a single data block against the file's Merkle tree.
  *
  * In principle, we need to verify the entire path to the root node.  However,
- * for efficiency the filesystem may cache the hash pages.  Therefore we need
- * only ascend the tree until an already-verified page is seen, as indicated by
- * the PageChecked bit being set; then verify the path to that page.
- *
- * This code currently only supports the case where the verity block size is
- * equal to PAGE_SIZE.  Doing otherwise would be possible but tricky, since we
- * wouldn't be able to use the PageChecked bit.
- *
- * Note that multiple processes may race to verify a hash page and mark it
- * Checked, but it doesn't matter; the result will be the same either way.
+ * for efficiency the filesystem may cache the hash blocks.  Therefore we need
+ * only ascend the tree until an already-verified hash block is seen, and then
+ * verify the path to that block.
  *
- * Return: true if the page is valid, else false.
+ * Return: %true if the data block is valid, else %false.
  */
-static bool verify_page(struct inode *inode, const struct fsverity_info *vi,
-			struct ahash_request *req, struct page *data_page,
-			unsigned long level0_ra_pages)
+static bool
+verify_data_block(struct inode *inode, struct fsverity_info *vi,
+		  struct ahash_request *req, struct page *data_page,
+		  u64 data_pos, unsigned int dblock_offset_in_page,
+		  unsigned long max_ra_pages)
 {
 	const struct merkle_tree_params *params = &vi->tree_params;
 	const unsigned int hsize = params->digest_size;
-	const pgoff_t index = data_page->index;
 	int level;
 	u8 _want_hash[FS_VERITY_MAX_DIGEST_SIZE];
 	const u8 *want_hash;
 	u8 real_hash[FS_VERITY_MAX_DIGEST_SIZE];
-	struct page *hpages[FS_VERITY_MAX_LEVELS];
-	unsigned int hoffsets[FS_VERITY_MAX_LEVELS];
+	/* The hash blocks that are traversed, indexed by level */
+	struct {
+		/* Page containing the hash block */
+		struct page *page;
+		/* Index of the hash block in the tree overall */
+		unsigned long index;
+		/* Byte offset of the hash block within @page */
+		unsigned int offset_in_page;
+		/* Byte offset of the wanted hash within @page */
+		unsigned int hoffset;
+	} hblocks[FS_VERITY_MAX_LEVELS];
+	/*
+	 * The index of the previous level's block within that level; also the
+	 * index of that block's hash within the current level.
+	 */
+	u64 hidx = data_pos >> params->log_blocksize;
 	int err;
 
-	if (WARN_ON_ONCE(!PageLocked(data_page) || PageUptodate(data_page)))
-		return false;
-
-	pr_debug_ratelimited("Verifying data page %lu...\n", index);
+	if (unlikely(data_pos >= inode->i_size)) {
+		/*
+		 * This can happen in the data page spanning EOF when the Merkle
+		 * tree block size is less than the page size.  The Merkle tree
+		 * doesn't cover data blocks fully past EOF.  But the entire
+		 * page spanning EOF can be visible to userspace via a mmap, and
+		 * any part past EOF should be all zeroes.  Therefore, we need
+		 * to verify that any data blocks fully past EOF are all zeroes.
+		 */
+		return data_is_zeroed(inode, data_page, params->block_size,
+				      dblock_offset_in_page);
+	}
 
 	/*
-	 * Starting at the leaf level, ascend the tree saving hash pages along
-	 * the way until we find a verified hash page, indicated by PageChecked;
-	 * or until we reach the root.
+	 * Starting at the leaf level, ascend the tree saving hash blocks along
+	 * the way until we find a hash block that has already been verified, or
+	 * until we reach the root.
 	 */
 	for (level = 0; level < params->num_levels; level++) {
-		pgoff_t hindex;
+		unsigned long next_hidx;
+		unsigned long hblock_idx;
+		pgoff_t hpage_idx;
+		unsigned int hblock_offset_in_page;
 		unsigned int hoffset;
 		struct page *hpage;
 
-		hash_at_level(params, index, level, &hindex, &hoffset);
+		/*
+		 * The index of the block in the current level; also the index
+		 * of that block's hash within the next level.
+		 */
+		next_hidx = hidx >> params->log_arity;
+
+		/* Index of the hash block in the tree overall */
+		hblock_idx = params->level_start[level] + next_hidx;
+
+		/* Index of the hash page in the tree overall */
+		hpage_idx = hblock_idx >> params->log_blocks_per_page;
 
-		pr_debug_ratelimited("Level %d: hindex=%lu, hoffset=%u\n",
-				     level, hindex, hoffset);
+		/* Byte offset of the hash block within the page */
+		hblock_offset_in_page =
+			(hblock_idx << params->log_blocksize) & ~PAGE_MASK;
 
-		hpage = inode->i_sb->s_vop->read_merkle_tree_page(inode, hindex,
-				level == 0 ? level0_ra_pages : 0);
+		/* Byte offset of the hash within the page */
+		hoffset = hblock_offset_in_page +
+			  ((hidx << params->log_digestsize) &
+			   (params->block_size - 1));
+
+		hpage = inode->i_sb->s_vop->read_merkle_tree_page(inode,
+				hpage_idx, level == 0 ? min(max_ra_pages,
+					params->tree_pages - hpage_idx) : 0);
 		if (IS_ERR(hpage)) {
 			err = PTR_ERR(hpage);
 			fsverity_err(inode,
 				     "Error %d reading Merkle tree page %lu",
-				     err, hindex);
+				     err, hpage_idx);
 			goto out;
 		}
-
-		if (PageChecked(hpage)) {
+		if (is_hash_block_verified(vi, hpage, hblock_idx)) {
 			memcpy_from_page(_want_hash, hpage, hoffset, hsize);
 			want_hash = _want_hash;
 			put_page(hpage);
-			pr_debug_ratelimited("Hash page already checked, want %s:%*phN\n",
-					     params->hash_alg->name,
-					     hsize, want_hash);
 			goto descend;
 		}
-		pr_debug_ratelimited("Hash page not yet checked\n");
-		hpages[level] = hpage;
-		hoffsets[level] = hoffset;
+		hblocks[level].page = hpage;
+		hblocks[level].index = hblock_idx;
+		hblocks[level].offset_in_page = hblock_offset_in_page;
+		hblocks[level].hoffset = hoffset;
+		hidx = next_hidx;
 	}
 
 	want_hash = vi->root_hash;
-	pr_debug("Want root hash: %s:%*phN\n",
-		 params->hash_alg->name, hsize, want_hash);
 descend:
-	/* Descend the tree verifying hash pages */
+	/* Descend the tree verifying hash blocks. */
 	for (; level > 0; level--) {
-		struct page *hpage = hpages[level - 1];
-		unsigned int hoffset = hoffsets[level - 1];
-
-		err = fsverity_hash_page(params, inode, req, hpage, real_hash);
+		struct page *hpage = hblocks[level - 1].page;
+		unsigned long hblock_idx = hblocks[level - 1].index;
+		unsigned int hblock_offset_in_page =
+			hblocks[level - 1].offset_in_page;
+		unsigned int hoffset = hblocks[level - 1].hoffset;
+
+		err = fsverity_hash_block(params, inode, req, hpage,
+					  hblock_offset_in_page, real_hash);
 		if (err)
 			goto out;
-		err = cmp_hashes(vi, want_hash, real_hash, index, level - 1);
+		err = cmp_hashes(vi, want_hash, real_hash, data_pos, level - 1);
 		if (err)
 			goto out;
-		SetPageChecked(hpage);
+		/*
+		 * Mark the hash block as verified.  This must be atomic and
+		 * idempotent, as the same hash block might be verified by
+		 * multiple threads concurrently.
+		 */
+		if (vi->hash_block_verified)
+			set_bit(hblock_idx, vi->hash_block_verified);
+		else
+			SetPageChecked(hpage);
 		memcpy_from_page(_want_hash, hpage, hoffset, hsize);
 		want_hash = _want_hash;
 		put_page(hpage);
-		pr_debug("Verified hash page at level %d, now want %s:%*phN\n",
-			 level - 1, params->hash_alg->name, hsize, want_hash);
 	}
 
-	/* Finally, verify the data page */
-	err = fsverity_hash_page(params, inode, req, data_page, real_hash);
+	/* Finally, verify the data block. */
+	err = fsverity_hash_block(params, inode, req, data_page,
+				  dblock_offset_in_page, real_hash);
 	if (err)
 		goto out;
-	err = cmp_hashes(vi, want_hash, real_hash, index, -1);
+	err = cmp_hashes(vi, want_hash, real_hash, data_pos, -1);
 out:
 	for (; level > 0; level--)
-		put_page(hpages[level - 1]);
+		put_page(hblocks[level - 1].page);
 
 	return err == 0;
 }
 
+static bool
+verify_data_blocks(struct inode *inode, struct fsverity_info *vi,
+		   struct ahash_request *req, struct folio *data_folio,
+		   size_t len, size_t offset, unsigned long max_ra_pages)
+{
+	const unsigned int block_size = vi->tree_params.block_size;
+	u64 pos = (u64)data_folio->index << PAGE_SHIFT;
+
+	if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offset, block_size)))
+		return false;
+	if (WARN_ON_ONCE(!folio_test_locked(data_folio) ||
+			 folio_test_uptodate(data_folio)))
+		return false;
+	do {
+		struct page *data_page =
+			folio_page(data_folio, offset >> PAGE_SHIFT);
+
+		if (!verify_data_block(inode, vi, req, data_page, pos + offset,
+				       offset & ~PAGE_MASK, max_ra_pages))
+			return false;
+		offset += block_size;
+		len -= block_size;
+	} while (len);
+	return true;
+}
+
 /**
- * fsverity_verify_page() - verify a data page
- * @page: the page to verity
+ * fsverity_verify_blocks() - verify data in a folio
+ * @folio: the folio containing the data to verify
+ * @len: the length of the data to verify in the folio
+ * @offset: the offset of the data to verify in the folio
  *
- * Verify a page that has just been read from a verity file.  The page must be a
- * pagecache page that is still locked and not yet uptodate.
+ * Verify data that has just been read from a verity file.  The data must be
+ * located in a pagecache folio that is still locked and not yet uptodate.  The
+ * length and offset of the data must be Merkle tree block size aligned.
  *
- * Return: true if the page is valid, else false.
+ * Return: %true if the data is valid, else %false.
  */
-bool fsverity_verify_page(struct page *page)
+bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset)
 {
-	struct inode *inode = page->mapping->host;
-	const struct fsverity_info *vi = inode->i_verity_info;
+	struct inode *inode = folio->mapping->host;
+	struct fsverity_info *vi = inode->i_verity_info;
 	struct ahash_request *req;
 	bool valid;
 
 	/* This allocation never fails, since it's mempool-backed. */
 	req = fsverity_alloc_hash_request(vi->tree_params.hash_alg, GFP_NOFS);
 
-	valid = verify_page(inode, vi, req, page, 0);
+	valid = verify_data_blocks(inode, vi, req, folio, len, offset, 0);
 
 	fsverity_free_hash_request(vi->tree_params.hash_alg, req);
 
 	return valid;
 }
-EXPORT_SYMBOL_GPL(fsverity_verify_page);
+EXPORT_SYMBOL_GPL(fsverity_verify_blocks);
 
 #ifdef CONFIG_BLOCK
 /**
  * fsverity_verify_bio() - verify a 'read' bio that has just completed
  * @bio: the bio to verify
  *
- * Verify a set of pages that have just been read from a verity file.  The pages
- * must be pagecache pages that are still locked and not yet uptodate.  If a
- * page fails verification, then bio->bi_status is set to an error status.
+ * Verify the bio's data against the file's Merkle tree.  All bio data segments
+ * must be aligned to the file's Merkle tree block size.  If any data fails
+ * verification, then bio->bi_status is set to an error status.
  *
  * This is a helper function for use by the ->readahead() method of filesystems
  * that issue bios to read data directly into the page cache.  Filesystems that
@@ -212,15 +338,13 @@ EXPORT_SYMBOL_GPL(fsverity_verify_page);
 void fsverity_verify_bio(struct bio *bio)
 {
 	struct inode *inode = bio_first_page_all(bio)->mapping->host;
-	const struct fsverity_info *vi = inode->i_verity_info;
-	const struct merkle_tree_params *params = &vi->tree_params;
+	struct fsverity_info *vi = inode->i_verity_info;
 	struct ahash_request *req;
-	struct bio_vec *bv;
-	struct bvec_iter_all iter_all;
+	struct folio_iter fi;
 	unsigned long max_ra_pages = 0;
 
 	/* This allocation never fails, since it's mempool-backed. */
-	req = fsverity_alloc_hash_request(params->hash_alg, GFP_NOFS);
+	req = fsverity_alloc_hash_request(vi->tree_params.hash_alg, GFP_NOFS);
 
 	if (bio->bi_opf & REQ_RAHEAD) {
 		/*
@@ -232,24 +356,18 @@ void fsverity_verify_bio(struct bio *bio)
 		 * This improves sequential read performance, as it greatly
 		 * reduces the number of I/O requests made to the Merkle tree.
 		 */
-		bio_for_each_segment_all(bv, bio, iter_all)
-			max_ra_pages++;
-		max_ra_pages /= 4;
+		max_ra_pages = bio->bi_iter.bi_size >> (PAGE_SHIFT + 2);
 	}
 
-	bio_for_each_segment_all(bv, bio, iter_all) {
-		struct page *page = bv->bv_page;
-		unsigned long level0_index = page->index >> params->log_arity;
-		unsigned long level0_ra_pages =
-			min(max_ra_pages, params->level0_blocks - level0_index);
-
-		if (!verify_page(inode, vi, req, page, level0_ra_pages)) {
+	bio_for_each_folio_all(fi, bio) {
+		if (!verify_data_blocks(inode, vi, req, fi.folio, fi.length,
+					fi.offset, max_ra_pages)) {
 			bio->bi_status = BLK_STS_IOERR;
 			break;
 		}
 	}
 
-	fsverity_free_hash_request(params->hash_alg, req);
+	fsverity_free_hash_request(vi->tree_params.hash_alg, req);
 }
 EXPORT_SYMBOL_GPL(fsverity_verify_bio);
 #endif /* CONFIG_BLOCK */
author	Linus Torvalds <torvalds@linux-foundation.org>	2023-02-20 12:33:41 -0800
committer	Linus Torvalds <torvalds@linux-foundation.org>	2023-02-20 12:33:41 -0800
commit	6639c3ce7fd217c22b26aa9f2a3cb69dc19221f8 (patch)
tree	743eadc88bc0422c227484805f97d2b23b21fb3b /fs
parent	f18f9845f2f10d3d1fc63e4ad16ee52d2d9292fa (diff)
parent	51e4e3153ebc32d3280d5d17418ae6f1a44f1ec1 (diff)