summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig138
-rw-r--r--fs/Makefile1
-rw-r--r--fs/bio-integrity.c719
-rw-r--r--fs/bio.c88
-rw-r--r--fs/buffer.c21
-rw-r--r--fs/char_dev.c7
-rw-r--r--fs/cifs/cifsfs.c2
-rw-r--r--fs/compat_ioctl.c6
-rw-r--r--fs/dlm/user.c9
-rw-r--r--fs/ecryptfs/file.c3
-rw-r--r--fs/ext4/balloc.c209
-rw-r--r--fs/ext4/dir.c17
-rw-r--r--fs/ext4/ext4.h61
-rw-r--r--fs/ext4/ext4_extents.h1
-rw-r--r--fs/ext4/ext4_i.h10
-rw-r--r--fs/ext4/ext4_jbd2.h21
-rw-r--r--fs/ext4/ext4_sb.h5
-rw-r--r--fs/ext4/extents.c111
-rw-r--r--fs/ext4/file.c20
-rw-r--r--fs/ext4/fsync.c4
-rw-r--r--fs/ext4/group.h2
-rw-r--r--fs/ext4/ialloc.c113
-rw-r--r--fs/ext4/inode.c1591
-rw-r--r--fs/ext4/mballoc.c451
-rw-r--r--fs/ext4/namei.c45
-rw-r--r--fs/ext4/resize.c52
-rw-r--r--fs/ext4/super.c142
-rw-r--r--fs/ext4/xattr.c2
-rw-r--r--fs/ext4/xattr_trusted.c4
-rw-r--r--fs/ext4/xattr_user.c4
-rw-r--r--fs/fat/cache.c2
-rw-r--r--fs/fat/dir.c4
-rw-r--r--fs/fat/file.c6
-rw-r--r--fs/fat/inode.c26
-rw-r--r--fs/fcntl.c3
-rw-r--r--fs/gfs2/Kconfig18
-rw-r--r--fs/gfs2/Makefile1
-rw-r--r--fs/gfs2/gfs2.h5
-rw-r--r--fs/gfs2/glock.c1643
-rw-r--r--fs/gfs2/glock.h11
-rw-r--r--fs/gfs2/glops.c70
-rw-r--r--fs/gfs2/incore.h38
-rw-r--r--fs/gfs2/inode.c11
-rw-r--r--fs/gfs2/inode.h2
-rw-r--r--fs/gfs2/locking.c52
-rw-r--r--fs/gfs2/locking/dlm/lock.c368
-rw-r--r--fs/gfs2/locking/dlm/lock_dlm.h18
-rw-r--r--fs/gfs2/locking/dlm/mount.c14
-rw-r--r--fs/gfs2/locking/dlm/sysfs.c13
-rw-r--r--fs/gfs2/locking/dlm/thread.c331
-rw-r--r--fs/gfs2/locking/nolock/Makefile3
-rw-r--r--fs/gfs2/locking/nolock/main.c238
-rw-r--r--fs/gfs2/log.c2
-rw-r--r--fs/gfs2/log.h2
-rw-r--r--fs/gfs2/main.c2
-rw-r--r--fs/gfs2/meta_io.c14
-rw-r--r--fs/gfs2/meta_io.h1
-rw-r--r--fs/gfs2/ops_address.c40
-rw-r--r--fs/gfs2/ops_file.c42
-rw-r--r--fs/gfs2/ops_fstype.c8
-rw-r--r--fs/gfs2/ops_inode.c25
-rw-r--r--fs/gfs2/ops_super.c4
-rw-r--r--fs/gfs2/quota.c2
-rw-r--r--fs/gfs2/recovery.c5
-rw-r--r--fs/gfs2/rgrp.c108
-rw-r--r--fs/gfs2/super.c4
-rw-r--r--fs/gfs2/sys.c16
-rw-r--r--fs/jbd2/checkpoint.c1
-rw-r--r--fs/jbd2/commit.c294
-rw-r--r--fs/jbd2/journal.c53
-rw-r--r--fs/jbd2/transaction.c365
-rw-r--r--fs/jfs/jfs_debug.c62
-rw-r--r--fs/jfs/jfs_debug.h10
-rw-r--r--fs/jfs/jfs_dtree.h3
-rw-r--r--fs/jfs/jfs_imap.c2
-rw-r--r--fs/jfs/jfs_logmgr.c35
-rw-r--r--fs/jfs/jfs_metapage.c36
-rw-r--r--fs/jfs/jfs_txnmgr.c68
-rw-r--r--fs/jfs/jfs_xtree.c36
-rw-r--r--fs/jfs/namei.c2
-rw-r--r--fs/jfs/super.c7
-rw-r--r--fs/lockd/clntproc.c8
-rw-r--r--fs/lockd/svc4proc.c2
-rw-r--r--fs/lockd/svclock.c7
-rw-r--r--fs/lockd/svcproc.c2
-rw-r--r--fs/mpage.c14
-rw-r--r--fs/msdos/namei.c35
-rw-r--r--fs/namespace.c14
-rw-r--r--fs/ncpfs/file.c12
-rw-r--r--fs/nfs/callback.c34
-rw-r--r--fs/nfs/client.c13
-rw-r--r--fs/nfs/dir.c88
-rw-r--r--fs/nfs/direct.c4
-rw-r--r--fs/nfs/file.c161
-rw-r--r--fs/nfs/inode.c79
-rw-r--r--fs/nfs/internal.h1
-rw-r--r--fs/nfs/iostat.h119
-rw-r--r--fs/nfs/nfs3acl.c9
-rw-r--r--fs/nfs/nfs3proc.c275
-rw-r--r--fs/nfs/nfs4proc.c265
-rw-r--r--fs/nfs/nfs4state.c2
-rw-r--r--fs/nfs/nfsroot.c10
-rw-r--r--fs/nfs/proc.c28
-rw-r--r--fs/nfs/super.c882
-rw-r--r--fs/nfs/write.c322
-rw-r--r--fs/nfsd/nfs4callback.c2
-rw-r--r--fs/ocfs2/stack_user.c3
-rw-r--r--fs/proc/base.c9
-rw-r--r--fs/proc/proc_misc.c16
-rw-r--r--fs/proc/task_mmu.c6
-rw-r--r--fs/proc/task_nommu.c2
-rw-r--r--fs/ramfs/file-mmu.c1
-rw-r--r--fs/ramfs/file-nommu.c1
-rw-r--r--fs/read_write.c38
-rw-r--r--fs/smbfs/file.c11
-rw-r--r--fs/splice.c17
-rw-r--r--fs/vfat/namei.c35
117 files changed, 6266 insertions, 4246 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index a52cf6280b4b..17216ba99c85 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -930,7 +930,7 @@ config PROC_KCORE
config PROC_VMCORE
bool "/proc/vmcore support (EXPERIMENTAL)"
- depends on PROC_FS && EXPERIMENTAL && CRASH_DUMP
+ depends on PROC_FS && CRASH_DUMP
default y
help
Exports the dump image of crashed kernel in ELF format.
@@ -1547,10 +1547,6 @@ config UFS_FS
The recently released UFS2 variant (used in FreeBSD 5.x) is
READ-ONLY supported.
- If you only intend to mount files from some other Unix over the
- network using NFS, you don't need the UFS file system support (but
- you need NFS file system support obviously).
-
Note that this option is generally not needed for floppies, since a
good portable way to transport files and directories between unixes
(and even other operating systems) is given by the tar program ("man
@@ -1590,6 +1586,7 @@ menuconfig NETWORK_FILESYSTEMS
Say Y here to get to see options for network filesystems and
filesystem-related networking code, such as NFS daemon and
RPCSEC security modules.
+
This option alone does not add any kernel code.
If you say N, all options in this submenu will be skipped and
@@ -1598,76 +1595,92 @@ menuconfig NETWORK_FILESYSTEMS
if NETWORK_FILESYSTEMS
config NFS_FS
- tristate "NFS file system support"
+ tristate "NFS client support"
depends on INET
select LOCKD
select SUNRPC
select NFS_ACL_SUPPORT if NFS_V3_ACL
help
- If you are connected to some other (usually local) Unix computer
- (using SLIP, PLIP, PPP or Ethernet) and want to mount files residing
- on that computer (the NFS server) using the Network File Sharing
- protocol, say Y. "Mounting files" means that the client can access
- the files with usual UNIX commands as if they were sitting on the
- client's hard disk. For this to work, the server must run the
- programs nfsd and mountd (but does not need to have NFS file system
- support enabled in its kernel). NFS is explained in the Network
- Administrator's Guide, available from
- <http://www.tldp.org/docs.html#guide>, on its man page: "man
- nfs", and in the NFS-HOWTO.
-
- A superior but less widely used alternative to NFS is provided by
- the Coda file system; see "Coda file system support" below.
+ Choose Y here if you want to access files residing on other
+ computers using Sun's Network File System protocol. To compile
+ this file system support as a module, choose M here: the module
+ will be called nfs.
- If you say Y here, you should have said Y to TCP/IP networking also.
- This option would enlarge your kernel by about 27 KB.
+ To mount file systems exported by NFS servers, you also need to
+ install the user space mount.nfs command which can be found in
+ the Linux nfs-utils package, available from http://linux-nfs.org/.
+ Information about using the mount command is available in the
+ mount(8) man page. More detail about the Linux NFS client
+ implementation is available via the nfs(5) man page.
- To compile this file system support as a module, choose M here: the
- module will be called nfs.
+ Below you can choose which versions of the NFS protocol are
+ available in the kernel to mount NFS servers. Support for NFS
+ version 2 (RFC 1094) is always available when NFS_FS is selected.
- If you are configuring a diskless machine which will mount its root
- file system over NFS at boot time, say Y here and to "Kernel
- level IP autoconfiguration" above and to "Root file system on NFS"
- below. You cannot compile this driver as a module in this case.
- There are two packages designed for booting diskless machines over
- the net: netboot, available from
- <http://ftp1.sourceforge.net/netboot/>, and Etherboot,
- available from <http://ftp1.sourceforge.net/etherboot/>.
+ To configure a system which mounts its root file system via NFS
+ at boot time, say Y here, select "Kernel level IP
+ autoconfiguration" in the NETWORK menu, and select "Root file
+ system on NFS" below. You cannot compile this file system as a
+ module in this case.
- If you don't know what all this is about, say N.
+ If unsure, say N.
config NFS_V3
- bool "Provide NFSv3 client support"
+ bool "NFS client support for NFS version 3"
depends on NFS_FS
help
- Say Y here if you want your NFS client to be able to speak version
- 3 of the NFS protocol.
+ This option enables support for version 3 of the NFS protocol
+ (RFC 1813) in the kernel's NFS client.
If unsure, say Y.
config NFS_V3_ACL
- bool "Provide client support for the NFSv3 ACL protocol extension"
+ bool "NFS client support for the NFSv3 ACL protocol extension"
depends on NFS_V3
help
- Implement the NFSv3 ACL protocol extension for manipulating POSIX
- Access Control Lists. The server should also be compiled with
- the NFSv3 ACL protocol extension; see the CONFIG_NFSD_V3_ACL option.
+ Some NFS servers support an auxiliary NFSv3 ACL protocol that
+ Sun added to Solaris but that never became an official part of
+ the NFS version 3 protocol. This protocol extension allows
+ applications on NFS clients to manipulate POSIX Access Control
+ Lists on files residing on NFS servers. NFS servers enforce
+ ACLs on local files whether this protocol is available or not.
+
+ Choose Y here if your NFS server supports the Solaris NFSv3 ACL
+ protocol extension and you want your NFS client to allow
+ applications to access and modify ACLs on files on the server.
+
+ Most NFS servers don't support the Solaris NFSv3 ACL protocol
+ extension. You can choose N here or specify the "noacl" mount
+ option to prevent your NFS client from trying to use the NFSv3
+ ACL protocol.
If unsure, say N.
config NFS_V4
- bool "Provide NFSv4 client support (EXPERIMENTAL)"
+ bool "NFS client support for NFS version 4 (EXPERIMENTAL)"
depends on NFS_FS && EXPERIMENTAL
select RPCSEC_GSS_KRB5
help
- Say Y here if you want your NFS client to be able to speak the newer
- version 4 of the NFS protocol.
+ This option enables support for version 4 of the NFS protocol
+ (RFC 3530) in the kernel's NFS client.
- Note: Requires auxiliary userspace daemons which may be found on
- http://www.citi.umich.edu/projects/nfsv4/
+ To mount NFS servers using NFSv4, you also need to install user
+ space programs which can be found in the Linux nfs-utils package,
+ available from http://linux-nfs.org/.
If unsure, say N.
+config ROOT_NFS
+ bool "Root file system on NFS"
+ depends on NFS_FS=y && IP_PNP
+ help
+ If you want your system to mount its root file system via NFS,
+ choose Y here. This is common practice for managing systems
+ without local permanent storage. For details, read
+ <file:Documentation/filesystems/nfsroot.txt>.
+
+ Most people say N here.
+
config NFSD
tristate "NFS server support"
depends on INET
@@ -1749,20 +1762,6 @@ config NFSD_V4
If unsure, say N.
-config ROOT_NFS
- bool "Root file system on NFS"
- depends on NFS_FS=y && IP_PNP
- help
- If you want your Linux box to mount its whole root file system (the
- one containing the directory /) from some other computer over the
- net via NFS (presumably because your box doesn't have a hard disk),
- say Y. Read <file:Documentation/filesystems/nfsroot.txt> for
- details. It is likely that in this case, you also want to say Y to
- "Kernel level IP autoconfiguration" so that your box can discover
- its network address at boot time.
-
- Most people say N here.
-
config LOCKD
tristate
@@ -1803,27 +1802,6 @@ config SUNRPC_XPRT_RDMA
If unsure, say N.
-config SUNRPC_BIND34
- bool "Support for rpcbind versions 3 & 4 (EXPERIMENTAL)"
- depends on SUNRPC && EXPERIMENTAL
- default n
- help
- RPC requests over IPv6 networks require support for larger
- addresses when performing an RPC bind. Sun added support for
- IPv6 addressing by creating two new versions of the rpcbind
- protocol (RFC 1833).
-
- This option enables support in the kernel RPC client for
- querying rpcbind servers via versions 3 and 4 of the rpcbind
- protocol. The kernel automatically falls back to version 2
- if a remote rpcbind service does not support versions 3 or 4.
- By themselves, these new versions do not provide support for
- RPC over IPv6, but the new protocol versions are necessary to
- support it.
-
- If unsure, say N to get traditional behavior (version 2 rpcbind
- requests only).
-
config RPCSEC_GSS_KRB5
tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)"
depends on SUNRPC && EXPERIMENTAL
diff --git a/fs/Makefile b/fs/Makefile
index fcae06aaadc5..3b2178b4bb66 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -19,6 +19,7 @@ else
obj-y += no-block.o
endif
+obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
obj-$(CONFIG_INOTIFY) += inotify.o
obj-$(CONFIG_INOTIFY_USER) += inotify_user.o
obj-$(CONFIG_EPOLL) += eventpoll.o
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
new file mode 100644
index 000000000000..63e2ee63058d
--- /dev/null
+++ b/fs/bio-integrity.c
@@ -0,0 +1,719 @@
+/*
+ * bio-integrity.c - bio data integrity extensions
+ *
+ * Copyright (C) 2007, 2008 Oracle Corporation
+ * Written by: Martin K. Petersen <martin.petersen@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; see the file COPYING. If not, write to
+ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
+ * USA.
+ *
+ */
+
+#include <linux/blkdev.h>
+#include <linux/mempool.h>
+#include <linux/bio.h>
+#include <linux/workqueue.h>
+
+static struct kmem_cache *bio_integrity_slab __read_mostly;
+static struct workqueue_struct *kintegrityd_wq;
+
+/**
+ * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio
+ * @bio:	bio to attach integrity metadata to
+ * @gfp_mask:	Memory allocation mask
+ * @nr_vecs:	Number of integrity metadata scatter-gather elements
+ * @bs:		bio_set to allocate from
+ *
+ * Description: Takes a payload from the integrity mempool of @bs,
+ * zeroes it, gives it an integrity vector allocated for @nr_vecs
+ * elements and links the payload and @bio to each other.  Returns the
+ * new payload, or NULL if either allocation failed; in the failure
+ * case @bio is not modified.
+ */
+struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
+							 gfp_t gfp_mask,
+							 unsigned int nr_vecs,
+							 struct bio_set *bs)
+{
+	struct bio_integrity_payload *payload;
+	unsigned long pool_idx;
+
+	BUG_ON(bio == NULL);
+
+	payload = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
+	if (unlikely(!payload)) {
+		printk(KERN_ERR "%s: could not alloc bip\n", __func__);
+		return NULL;
+	}
+
+	memset(payload, 0, sizeof(*payload));
+
+	payload->bip_vec = bvec_alloc_bs(gfp_mask, nr_vecs, &pool_idx, bs);
+	if (unlikely(!payload->bip_vec)) {
+		printk(KERN_ERR "%s: could not alloc bip_vec\n", __func__);
+		mempool_free(payload, bs->bio_integrity_pool);
+		return NULL;
+	}
+
+	/* Remember which bvec pool the vector came from; needed on free */
+	payload->bip_pool = pool_idx;
+	payload->bip_bio = bio;
+	bio->bi_integrity = payload;
+
+	return payload;
+}
+EXPORT_SYMBOL(bio_integrity_alloc_bioset);
+
+/**
+ * bio_integrity_alloc - Allocate integrity payload and attach it to bio
+ * @bio:	bio to attach integrity metadata to
+ * @gfp_mask:	Memory allocation mask
+ * @nr_vecs:	Number of integrity metadata scatter-gather elements
+ *
+ * Description: Same as bio_integrity_alloc_bioset(), with the
+ * allocation always served from fs_bio_set.  Returns whatever
+ * bio_integrity_alloc_bioset() returns.
+ */
+struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
+						  gfp_t gfp_mask,
+						  unsigned int nr_vecs)
+{
+	struct bio_set *default_set = fs_bio_set;
+
+	return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, default_set);
+}
+EXPORT_SYMBOL(bio_integrity_alloc);
+
+/**
+ * bio_integrity_free - Free bio integrity payload
+ * @bio:	bio containing bip to be freed
+ * @bs:		bio_set this bio was allocated from
+ *
+ * Description: Used to free the integrity portion of a bio. Usually
+ * called from bio_free().  The protection buffer is released only for
+ * bios that are not flagged BIO_CLONED; the integrity vector and the
+ * payload itself go back to the mempools of @bs.  On return
+ * @bio->bi_integrity is NULL.
+ */
+void bio_integrity_free(struct bio *bio, struct bio_set *bs)
+{
+	struct bio_integrity_payload *bip = bio->bi_integrity;
+
+	BUG_ON(bip == NULL);
+
+	/*
+	 * A cloned bio doesn't own the integrity metadata.  kfree()
+	 * accepts NULL, so bip_buf needs no separate check.
+	 */
+	if (!bio_flagged(bio, BIO_CLONED))
+		kfree(bip->bip_buf);
+
+	mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]);
+	mempool_free(bip, bs->bio_integrity_pool);
+
+	bio->bi_integrity = NULL;
+}
+EXPORT_SYMBOL(bio_integrity_free);
+
+/**
+ * bio_integrity_add_page - Attach integrity metadata
+ * @bio:	bio to update
+ * @page:	page containing integrity metadata
+ * @len:	number of bytes of integrity metadata in page
+ * @offset:	start offset within page
+ *
+ * Description: Attach a page containing integrity metadata to bio.
+ * Returns @len when the page was recorded in the next free integrity
+ * vector slot, or 0 when the integrity vector is already full.
+ */
+int bio_integrity_add_page(struct bio *bio, struct page *page,
+			   unsigned int len, unsigned int offset)
+{
+	struct bio_integrity_payload *bip = bio->bi_integrity;
+	struct bio_vec *slot;
+
+	if (bip->bip_vcnt < bvec_nr_vecs(bip->bip_pool)) {
+		slot = bip_vec_idx(bip, bip->bip_vcnt);
+		BUG_ON(slot == NULL);
+		/* The slot being claimed must not already hold a page */
+		BUG_ON(slot->bv_page != NULL);
+
+		slot->bv_page = page;
+		slot->bv_len = len;
+		slot->bv_offset = offset;
+		bip->bip_vcnt++;
+
+		return len;
+	}
+
+	printk(KERN_ERR "%s: bip_vec full\n", __func__);
+	return 0;
+}
+EXPORT_SYMBOL(bio_integrity_add_page);
+
+/**
+ * bio_integrity_enabled - Check whether integrity can be passed
+ * @bio:	bio to check
+ *
+ * Description: Determines whether bio_integrity_prep() can be called
+ * on this bio or not.	bio data direction and target device must be
+ * set prior to calling.  The function honors the write_generate and
+ * read_verify flags in sysfs.  Returns 0 for a bio that already has
+ * integrity metadata, otherwise the result of bdev_integrity_enabled().
+ */
+int bio_integrity_enabled(struct bio *bio)
+{
+	/* Only bios that are not protected yet are candidates */
+	if (!bio_integrity(bio))
+		return bdev_integrity_enabled(bio->bi_bdev, bio_data_dir(bio));
+
+	return 0;
+}
+EXPORT_SYMBOL(bio_integrity_enabled);
+
+/**
+ * bio_integrity_hw_sectors - Convert 512b sectors to hardware ditto
+ * @bi:		blk_integrity profile for device
+ * @sectors:	Number of 512 sectors to convert
+ *
+ * Description: The block layer calculates everything in 512 byte
+ * sectors but integrity metadata is done in terms of the hardware
+ * sector size of the storage device.  Convert the block layer sectors
+ * to physical sectors.  Returns @sectors divided by eight (rounded
+ * down) for a 4096 byte profile and @sectors unchanged otherwise.
+ */
+static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi,
+						    unsigned int sectors)
+{
+	/* At this point there are only 512b or 4096b DIF/EPP devices */
+	if (bi->sector_size == 4096)
+		return sectors >> 3;	/* eight 512b sectors per 4096b sector */
+
+	return sectors;
+}
+
+/**
+ * bio_integrity_tag_size - Retrieve integrity tag space
+ * @bio:	bio to inspect
+ *
+ * Description: Returns the maximum number of tag bytes that can be
+ * attached to this bio. Filesystems can use this to determine how
+ * much metadata to attach to an I/O.  The value is the profile's
+ * tag_size multiplied by the number of profile-sized sectors covered
+ * by @bio->bi_size.
+ */
+unsigned int bio_integrity_tag_size(struct bio *bio)
+{
+	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+	unsigned int hw_sectors;
+
+	BUG_ON(bio->bi_size == 0);
+
+	hw_sectors = bio->bi_size / bi->sector_size;
+
+	return bi->tag_size * hw_sectors;
+}
+EXPORT_SYMBOL(bio_integrity_tag_size);
+
+/**
+ * bio_integrity_tag - Move tag data between a buffer and a bio
+ * @bio:	bio whose integrity buffer is accessed
+ * @tag_buf:	Caller's tag buffer
+ * @len:	Length of @tag_buf
+ * @set:	Non-zero to call the profile's set_tag_fn, zero for get_tag_fn
+ *
+ * Description: Common helper behind bio_integrity_set_tag() and
+ * bio_integrity_get_tag().  Returns 0 after calling the tag callback,
+ * or -1 if the integrity profile has a tag_size of zero or the tag
+ * data would need more room than the bio's integrity buffer provides.
+ */
+int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set)
+{
+	struct bio_integrity_payload *bip = bio->bi_integrity;
+	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+	unsigned int nr_sectors;
+
+	BUG_ON(bip->bip_buf == NULL);
+
+	if (!bi->tag_size)
+		return -1;
+
+	/* Sectors needed to carry @len tag bytes, then in hardware units */
+	nr_sectors = DIV_ROUND_UP(len, bi->tag_size);
+	nr_sectors = bio_integrity_hw_sectors(bi, nr_sectors);
+
+	if (nr_sectors * bi->tuple_size <= bip->bip_size) {
+		if (set)
+			bi->set_tag_fn(bip->bip_buf, tag_buf, nr_sectors);
+		else
+			bi->get_tag_fn(bip->bip_buf, tag_buf, nr_sectors);
+
+		return 0;
+	}
+
+	printk(KERN_ERR "%s: tag too big for bio: %u > %u\n",
+	       __func__, nr_sectors * bi->tuple_size, bip->bip_size);
+	return -1;
+}
+
+/**
+ * bio_integrity_set_tag - Attach a tag buffer to a bio
+ * @bio:	bio to attach buffer to
+ * @tag_buf:	Pointer to a buffer containing tag data
+ * @len:	Length of the included buffer
+ *
+ * Description: Use this function to tag a bio by leveraging the extra
+ * space provided by devices formatted with integrity protection.  The
+ * size of the integrity buffer must be <= to the size reported by
+ * bio_integrity_tag_size().  Only valid for WRITE bios.  Returns the
+ * result of bio_integrity_tag().
+ */
+int bio_integrity_set_tag(struct bio *bio, void *tag_buf, unsigned int len)
+{
+	const int set = 1;
+
+	BUG_ON(bio_data_dir(bio) != WRITE);
+
+	return bio_integrity_tag(bio, tag_buf, len, set);
+}
+EXPORT_SYMBOL(bio_integrity_set_tag);
+
+/**
+ * bio_integrity_get_tag - Retrieve a tag buffer from a bio
+ * @bio:	bio to retrieve buffer from
+ * @tag_buf:	Pointer to a buffer for the tag data
+ * @len:	Length of the target buffer
+ *
+ * Description: Use this function to retrieve the tag buffer from a
+ * completed I/O. The size of the integrity buffer must be <= to the
+ * size reported by bio_integrity_tag_size().  Only valid for READ
+ * bios.  Returns the result of bio_integrity_tag().
+ */
+int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len)
+{
+	const int set = 0;
+
+	BUG_ON(bio_data_dir(bio) != READ);
+
+	return bio_integrity_tag(bio, tag_buf, len, set);
+}
+EXPORT_SYMBOL(bio_integrity_get_tag);
+
+/**
+ * bio_integrity_generate - Generate integrity metadata for a bio
+ * @bio: bio to generate integrity metadata for
+ *
+ * Description: Generates integrity metadata for a bio by calling the
+ * block device's generation callback function. The bio must have a
+ * bip attached with enough room to accommodate the generated
+ * integrity metadata. The callback is invoked once per data segment
+ * and writes into consecutive parts of the bip's protection buffer.
+ */
+static void bio_integrity_generate(struct bio *bio)
+{
+ struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+ struct blk_integrity_exchg bix;
+ struct bio_vec *bv;
+ sector_t sector = bio->bi_sector;
+ unsigned int i, sectors, total;
+ void *prot_buf = bio->bi_integrity->bip_buf;
+
+ total = 0;
+ /* These two fields are the same for every segment */
+ bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
+ bix.sector_size = bi->sector_size;
+
+ bio_for_each_segment(bv, bio, i) {
+ /* Map the data page for the duration of the callback */
+ void *kaddr = kmap_atomic(bv->bv_page, KM_USER0);
+ bix.data_buf = kaddr + bv->bv_offset;
+ bix.data_size = bv->bv_len;
+ bix.prot_buf = prot_buf;
+ bix.sector = sector;
+
+ bi->generate_fn(&bix);
+
+ /*
+ * Step past the sectors of this segment.
+ * NOTE(review): integer division - presumably bv_len is always a
+ * multiple of the profile's sector_size; confirm.
+ */
+ sectors = bv->bv_len / bi->sector_size;
+ sector += sectors;
+ prot_buf += sectors * bi->tuple_size;
+ total += sectors * bi->tuple_size;
+ /* Running total must stay within the protection buffer size */
+ BUG_ON(total > bio->bi_integrity->bip_size);
+
+ kunmap_atomic(kaddr, KM_USER0);
+ }
+}
+
+/**
+ * bio_integrity_prep - Prepare bio for integrity I/O
+ * @bio:	bio to prepare
+ *
+ * Description: Allocates a buffer for integrity metadata, maps the
+ * pages and attaches them to a bio.  The bio must have data
+ * direction, target device and start sector set prior to calling.  In
+ * the WRITE case, integrity metadata will be generated using the
+ * block device's integrity function.  In the READ case, the buffer
+ * will be prepared for DMA and a suitable end_io handler set up.
+ *
+ * Returns 0 on success and -EIO if the protection buffer or the
+ * integrity payload could not be allocated, or if the buffer could
+ * not be attached to the integrity vector.
+ */
+int bio_integrity_prep(struct bio *bio)
+{
+	struct bio_integrity_payload *bip;
+	struct blk_integrity *bi;
+	struct request_queue *q;
+	void *buf;
+	unsigned long start, end;
+	unsigned int len, nr_pages;
+	unsigned int bytes, offset, i;
+	unsigned int sectors;
+
+	bi = bdev_get_integrity(bio->bi_bdev);
+	q = bdev_get_queue(bio->bi_bdev);
+	BUG_ON(bi == NULL);
+	BUG_ON(bio_integrity(bio));
+
+	sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio));
+
+	/* Allocate kernel buffer for protection data */
+	len = sectors * blk_integrity_tuple_size(bi);
+	buf = kmalloc(len, GFP_NOIO | __GFP_NOFAIL | q->bounce_gfp);
+	if (unlikely(buf == NULL)) {
+		printk(KERN_ERR "could not allocate integrity buffer\n");
+		return -EIO;
+	}
+
+	/* Number of pages the buffer touches, counting partial ones */
+	end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	start = ((unsigned long) buf) >> PAGE_SHIFT;
+	nr_pages = end - start;
+
+	/* Allocate bio integrity payload and integrity vectors */
+	bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages);
+	if (unlikely(bip == NULL)) {
+		printk(KERN_ERR "could not allocate data integrity bioset\n");
+		kfree(buf);
+		return -EIO;
+	}
+
+	/* From here on the payload owns buf (see bio_integrity_free()) */
+	bip->bip_buf = buf;
+	bip->bip_size = len;
+	bip->bip_sector = bio->bi_sector;
+
+	/* Map it */
+	offset = offset_in_page(buf);
+	for (i = 0 ; i < nr_pages ; i++) {
+		int ret;
+		bytes = PAGE_SIZE - offset;
+
+		if (len == 0)
+			break;
+
+		if (bytes > len)
+			bytes = len;
+
+		ret = bio_integrity_add_page(bio, virt_to_page(buf),
+					     bytes, offset);
+
+		/*
+		 * The integrity vector is full, so the buffer is only
+		 * partly mapped.  Report the failure instead of claiming
+		 * success; buf must not be freed here because the payload
+		 * attached above still references it and releases it in
+		 * bio_integrity_free().
+		 */
+		if (ret == 0)
+			return -EIO;
+
+		if (ret < bytes)
+			break;
+
+		buf += bytes;
+		len -= bytes;
+		offset = 0;
+	}
+
+	/* Install custom I/O completion handler if read verify is enabled */
+	if (bio_data_dir(bio) == READ) {
+		bip->bip_end_io = bio->bi_end_io;
+		bio->bi_end_io = bio_integrity_endio;
+	}
+
+	/* Auto-generate integrity metadata if this is a write */
+	if (bio_data_dir(bio) == WRITE)
+		bio_integrity_generate(bio);
+
+	return 0;
+}
+EXPORT_SYMBOL(bio_integrity_prep);
+
+/**
+ * bio_integrity_verify - Verify integrity metadata for a bio
+ * @bio:	bio to verify
+ *
+ * Description: This function is called to verify the integrity of a
+ * bio.	 The data in the bio io_vec is compared to the integrity
+ * metadata returned by the HBA.  The profile's verify callback is run
+ * once per data segment; the first non-zero result stops the walk and
+ * is returned.  Returns 0 if every segment verified.
+ */
+static int bio_integrity_verify(struct bio *bio)
+{
+	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+	struct blk_integrity_exchg bix;
+	struct bio_vec *bv;
+	sector_t sector = bio->bi_integrity->bip_sector;
+	unsigned int i, sectors, total = 0;
+	/* Signed: holds the callback status that this function returns */
+	int ret = 0;
+	void *prot_buf = bio->bi_integrity->bip_buf;
+
+	bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
+	bix.sector_size = bi->sector_size;
+
+	bio_for_each_segment(bv, bio, i) {
+		void *kaddr = kmap_atomic(bv->bv_page, KM_USER0);
+		bix.data_buf = kaddr + bv->bv_offset;
+		bix.data_size = bv->bv_len;
+		bix.prot_buf = prot_buf;
+		bix.sector = sector;
+
+		ret = bi->verify_fn(&bix);
+
+		if (ret) {
+			/* Drop the mapping before bailing out */
+			kunmap_atomic(kaddr, KM_USER0);
+			break;
+		}
+
+		sectors = bv->bv_len / bi->sector_size;
+		sector += sectors;
+		prot_buf += sectors * bi->tuple_size;
+		total += sectors * bi->tuple_size;
+		BUG_ON(total > bio->bi_integrity->bip_size);
+
+		kunmap_atomic(kaddr, KM_USER0);
+	}
+
+	return ret;
+}
+
+/**
+ * bio_integrity_verify_fn - Integrity I/O completion worker
+ * @work:	Work struct stored in bio to be verified
+ *
+ * Description: This workqueue function is called to complete a READ
+ * request.  The function verifies the transferred integrity metadata
+ * and then calls the original bio end_io function.  If verification
+ * fails, BIO_UPTODATE is cleared and -EIO is passed on instead of the
+ * error recorded by bio_integrity_endio().
+ */
+static void bio_integrity_verify_fn(struct work_struct *work)
+{
+	struct bio_integrity_payload *bip;
+	struct bio *bio;
+	int status;
+
+	bip = container_of(work, struct bio_integrity_payload, bip_work);
+	bio = bip->bip_bio;
+	status = bip->bip_error;
+
+	if (bio_integrity_verify(bio) != 0) {
+		clear_bit(BIO_UPTODATE, &bio->bi_flags);
+		status = -EIO;
+	}
+
+	/* Restore original bio completion handler */
+	bio->bi_end_io = bip->bip_end_io;
+	if (!bio->bi_end_io)
+		return;
+
+	bio->bi_end_io(bio, status);
+}
+
+/**
+ * bio_integrity_endio - Integrity I/O completion function
+ * @bio: Protected bio
+ * @error: Completion status of the I/O, handed on to the deferred worker
+ *
+ * Description: Completion for integrity I/O
+ *
+ * Normally I/O completion is done in interrupt context. However,
+ * verifying I/O integrity is a time-consuming task which must be run
+ * in process context. This function postpones completion
+ * accordingly: it stores @error in the bip and queues
+ * bio_integrity_verify_fn() on the kintegrityd workqueue.
+ */
+void bio_integrity_endio(struct bio *bio, int error)
+{
+ struct bio_integrity_payload *bip = bio->bi_integrity;
+
+ BUG_ON(bip->bip_bio != bio);
+
+ /* Saved so the worker can pass it to the original end_io handler */
+ bip->bip_error = error;
+ INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
+ queue_work(kintegrityd_wq, &bip->bip_work);
+}
+EXPORT_SYMBOL(bio_integrity_endio);
+
+/**
+ * bio_integrity_mark_head - Advance bip_vec skip bytes
+ * @bip:	Integrity vector to advance
+ * @skip:	Number of bytes to advance it
+ *
+ * Description: Vectors that are skipped completely are stepped over;
+ * the vector in which @skip runs out has its offset and length
+ * adjusted and becomes the new bip_idx.  If @skip covers every
+ * remaining vector, bip_idx is not changed.
+ */
+void bio_integrity_mark_head(struct bio_integrity_payload *bip,
+			     unsigned int skip)
+{
+	struct bio_vec *vec;
+	unsigned int n;
+
+	bip_for_each_vec(vec, bip, n) {
+		if (skip != 0 && skip >= vec->bv_len) {
+			/* This vector is consumed entirely */
+			skip -= vec->bv_len;
+		} else {
+			/* New head: trim whatever is left of skip off it */
+			if (skip != 0) {
+				vec->bv_offset += skip;
+				vec->bv_len -= skip;
+			}
+			bip->bip_idx = n;
+			return;
+		}
+	}
+}
+
+/**
+ * bio_integrity_mark_tail - Truncate bip_vec to be len bytes long
+ * @bip:	Integrity vector to truncate
+ * @len:	New length of integrity vector
+ *
+ * Description: Walks the vectors while consuming @len.  The vector in
+ * which @len runs out is shortened to the remaining bytes, and
+ * bip_vcnt is set to the index of the first vector reached with
+ * nothing left.  If the walk ends before such a vector is reached,
+ * bip_vcnt is not changed.
+ */
+void bio_integrity_mark_tail(struct bio_integrity_payload *bip,
+			     unsigned int len)
+{
+	struct bio_vec *vec;
+	unsigned int n;
+
+	bip_for_each_vec(vec, bip, n) {
+		if (len == 0) {
+			bip->bip_vcnt = n;
+			return;
+		}
+
+		/* Last vector that is kept: cut it down to what remains */
+		if (len < vec->bv_len)
+			vec->bv_len = len;
+
+		len -= vec->bv_len;
+	}
+}
+
+/**
+ * bio_integrity_advance - Advance integrity vector
+ * @bio:	bio whose integrity vector to update
+ * @bytes_done:	number of data bytes that have been completed
+ *
+ * Description: This function calculates how many integrity bytes the
+ * number of completed data bytes correspond to and advances the
+ * integrity vector accordingly.
+ */
+void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
+{
+	struct bio_integrity_payload *bip = bio->bi_integrity;
+	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+	unsigned int done_sectors, prot_bytes;
+
+	BUG_ON(bip == NULL);
+	BUG_ON(bi == NULL);
+
+	/* bytes -> 512b sectors -> hardware sectors -> integrity bytes */
+	done_sectors = bytes_done >> 9;
+	prot_bytes = bio_integrity_hw_sectors(bi, done_sectors) * bi->tuple_size;
+
+	bio_integrity_mark_head(bip, prot_bytes);
+}
+EXPORT_SYMBOL(bio_integrity_advance);
+
+/**
+ * bio_integrity_trim - Trim integrity vector
+ * @bio:	bio whose integrity vector to update
+ * @offset:	offset to first data sector
+ * @sectors:	number of data sectors
+ *
+ * Description: Used to trim the integrity vector in a cloned bio.
+ * The ivec will be advanced corresponding to 'offset' data sectors
+ * and the length will be truncated corresponding to 'sectors' data
+ * sectors.  Both counts are given in 512 byte sectors and are
+ * converted to hardware sectors before being scaled by the tuple
+ * size, in the same way bio_integrity_advance() does.
+ */
+void bio_integrity_trim(struct bio *bio, unsigned int offset,
+			unsigned int sectors)
+{
+	struct bio_integrity_payload *bip = bio->bi_integrity;
+	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+	unsigned int head_sectors, tail_sectors;
+
+	BUG_ON(bip == NULL);
+	BUG_ON(bi == NULL);
+	BUG_ON(!bio_flagged(bio, BIO_CLONED));
+
+	/* One tuple per hardware sector, not per 512 byte sector */
+	head_sectors = bio_integrity_hw_sectors(bi, offset);
+	tail_sectors = bio_integrity_hw_sectors(bi, sectors);
+
+	bip->bip_sector = bip->bip_sector + offset;
+	bio_integrity_mark_head(bip, head_sectors * bi->tuple_size);
+	bio_integrity_mark_tail(bip, tail_sectors * bi->tuple_size);
+}
+EXPORT_SYMBOL(bio_integrity_trim);
+
+/**
+ * bio_integrity_split - Split integrity metadata
+ * @bio: Protected bio
+ * @bp: Resulting bio_pair
+ * @sectors: Offset
+ *
+ * Description: Splits an integrity page into a bio_pair. Does nothing
+ * if @bio carries no integrity metadata. Only a bip with exactly one
+ * integrity vector is supported. Both halves use the payloads and
+ * vectors embedded in @bp; nothing is allocated here.
+ */
+void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors)
+{
+ struct blk_integrity *bi;
+ struct bio_integrity_payload *bip = bio->bi_integrity;
+ unsigned int nr_sectors;
+
+ if (bio_integrity(bio) == 0)
+ return;
+
+ bi = bdev_get_integrity(bio->bi_bdev);
+ BUG_ON(bi == NULL);
+ BUG_ON(bip->bip_vcnt != 1);
+
+ nr_sectors = bio_integrity_hw_sectors(bi, sectors);
+
+ bp->bio1.bi_integrity = &bp->bip1;
+ bp->bio2.bi_integrity = &bp->bip2;
+
+ /* Both halves start as copies of the single source vector */
+ bp->iv1 = bip->bip_vec[0];
+ bp->iv2 = bip->bip_vec[0];
+
+ bp->bip1.bip_vec = &bp->iv1;
+ bp->bip2.bip_vec = &bp->iv2;
+
+ /*
+ * First half keeps the leading bytes, second half the remainder.
+ * NOTE(review): the lengths are scaled by 'sectors' (512b units)
+ * while bip_sector below is advanced by 'nr_sectors' (hardware
+ * units); these differ on 4096b devices - confirm which unit each
+ * is meant to use.
+ */
+ bp->iv1.bv_len = sectors * bi->tuple_size;
+ bp->iv2.bv_offset += sectors * bi->tuple_size;
+ bp->iv2.bv_len -= sectors * bi->tuple_size;
+
+ bp->bip1.bip_sector = bio->bi_integrity->bip_sector;
+ bp->bip2.bip_sector = bio->bi_integrity->bip_sector + nr_sectors;
+
+ bp->bip1.bip_vcnt = bp->bip2.bip_vcnt = 1;
+ bp->bip1.bip_idx = bp->bip2.bip_idx = 0;
+}
+EXPORT_SYMBOL(bio_integrity_split);
+
+/**
+ * bio_integrity_clone - Callback for cloning bios with integrity metadata
+ * @bio:	New bio
+ * @bio_src:	Original bio
+ * @bs:		bio_set to allocate bip from
+ *
+ * Description: Called to allocate a bip when cloning a bio.  The
+ * integrity vector entries, start sector, vector count and current
+ * index of the source are copied into the new bip.  Returns 0 on
+ * success and -EIO if the bip could not be allocated.
+ */
+int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
+			struct bio_set *bs)
+{
+	struct bio_integrity_payload *src = bio_src->bi_integrity;
+	struct bio_integrity_payload *dst;
+
+	BUG_ON(src == NULL);
+
+	dst = bio_integrity_alloc_bioset(bio, GFP_NOIO, src->bip_vcnt, bs);
+	if (!dst)
+		return -EIO;
+
+	memcpy(dst->bip_vec, src->bip_vec,
+	       src->bip_vcnt * sizeof(struct bio_vec));
+
+	dst->bip_idx = src->bip_idx;
+	dst->bip_vcnt = src->bip_vcnt;
+	dst->bip_sector = src->bip_sector;
+
+	return 0;
+}
+EXPORT_SYMBOL(bio_integrity_clone);
+
+/*
+ * Create the integrity payload mempool of @bs on top of
+ * bio_integrity_slab, sized by @pool_size.  Returns 0 on success and
+ * -1 if the mempool could not be created.
+ */
+int bioset_integrity_create(struct bio_set *bs, int pool_size)
+{
+	mempool_t *pool;
+
+	pool = mempool_create_slab_pool(pool_size, bio_integrity_slab);
+	bs->bio_integrity_pool = pool;
+
+	return pool ? 0 : -1;
+}
+EXPORT_SYMBOL(bioset_integrity_create);
+
+/* Destroy the integrity payload mempool of @bs, if it has one. */
+void bioset_integrity_free(struct bio_set *bs)
+{
+	if (!bs->bio_integrity_pool)
+		return;
+
+	mempool_destroy(bs->bio_integrity_pool);
+}
+EXPORT_SYMBOL(bioset_integrity_free);
+
+/*
+ * Create the slab cache that backs the integrity payload mempools.
+ * SLAB_PANIC is passed, so no return value is needed.
+ *
+ * Deliberately not exported: the function is marked __init, and
+ * exporting an __init symbol to modules is a section mismatch.
+ */
+void __init bio_integrity_init_slab(void)
+{
+	bio_integrity_slab = KMEM_CACHE(bio_integrity_payload,
+					SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+}
+
+/*
+ * Create the kintegrityd workqueue on which read verification is
+ * run.  Panics if the workqueue cannot be created.
+ */
+static int __init integrity_init(void)
+{
+	struct workqueue_struct *wq = create_workqueue("kintegrityd");
+
+	if (wq == NULL)
+		panic("Failed to create kintegrityd\n");
+
+	kintegrityd_wq = wq;
+
+	return 0;
+}
+subsys_initcall(integrity_init);
diff --git a/fs/bio.c b/fs/bio.c
index 78562574cb52..88322b066acb 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -28,25 +28,10 @@
#include <linux/blktrace_api.h>
#include <scsi/sg.h> /* for struct sg_iovec */
-#define BIO_POOL_SIZE 2
-
static struct kmem_cache *bio_slab __read_mostly;
-#define BIOVEC_NR_POOLS 6
-
-/*
- * a small number of entries is fine, not going to be performance critical.
- * basically we just need to survive
- */
-#define BIO_SPLIT_ENTRIES 2
mempool_t *bio_split_pool __read_mostly;
-struct biovec_slab {
- int nr_vecs;
- char *name;
- struct kmem_cache *slab;
-};
-
/*
* if you change this list, also change bvec_alloc or things will
* break badly! cannot be bigger than what you can fit into an
@@ -60,23 +45,17 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
#undef BV
/*
- * bio_set is used to allow other portions of the IO system to
- * allocate their own private memory pools for bio and iovec structures.
- * These memory pools in turn all allocate from the bio_slab
- * and the bvec_slabs[].
- */
-struct bio_set {
- mempool_t *bio_pool;
- mempool_t *bvec_pools[BIOVEC_NR_POOLS];
-};
-
-/*
* fs_bio_set is the bio_set containing bio and iovec memory pools used by
* IO code that does not need private memory pools.
*/
-static struct bio_set *fs_bio_set;
+struct bio_set *fs_bio_set;
+
+/*
+ * Return the nr_vecs value of the biovec slab at index @idx.
+ * @idx is not range-checked here.
+ */
+unsigned int bvec_nr_vecs(unsigned short idx)
+{
+	const struct biovec_slab *slab = &bvec_slabs[idx];
+
+	return slab->nr_vecs;
+}
-static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
+struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
{
struct bio_vec *bvl;
@@ -117,6 +96,9 @@ void bio_free(struct bio *bio, struct bio_set *bio_set)
mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]);
}
+ if (bio_integrity(bio))
+ bio_integrity_free(bio, bio_set);
+
mempool_free(bio, bio_set->bio_pool);
}
@@ -275,9 +257,19 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
{
struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set);
- if (b) {
- b->bi_destructor = bio_fs_destructor;
- __bio_clone(b, bio);
+ if (!b)
+ return NULL;
+
+ b->bi_destructor = bio_fs_destructor;
+ __bio_clone(b, bio);
+
+	/* The source bio carries integrity metadata: clone that as well. */
+	if (bio_integrity(bio)) {
+		int ret;
+
+		ret = bio_integrity_clone(b, bio, fs_bio_set);
+
+		if (ret < 0) {
+			/*
+			 * The clone is never handed to the caller on this
+			 * path, so drop our reference here; returning NULL
+			 * alone would leak the freshly allocated bio.
+			 */
+			bio_put(b);
+			return NULL;
+		}
}
return b;
@@ -333,10 +325,19 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
if (page == prev->bv_page &&
offset == prev->bv_offset + prev->bv_len) {
prev->bv_len += len;
- if (q->merge_bvec_fn &&
- q->merge_bvec_fn(q, bio, prev) < len) {
- prev->bv_len -= len;
- return 0;
+
+ if (q->merge_bvec_fn) {
+ struct bvec_merge_data bvm = {
+ .bi_bdev = bio->bi_bdev,
+ .bi_sector = bio->bi_sector,
+ .bi_size = bio->bi_size,
+ .bi_rw = bio->bi_rw,
+ };
+
+ if (q->merge_bvec_fn(q, &bvm, prev) < len) {
+ prev->bv_len -= len;
+ return 0;
+ }
}
goto done;
@@ -377,11 +378,18 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
* queue to get further control
*/
if (q->merge_bvec_fn) {
+ struct bvec_merge_data bvm = {
+ .bi_bdev = bio->bi_bdev,
+ .bi_sector = bio->bi_sector,
+ .bi_size = bio->bi_size,
+ .bi_rw = bio->bi_rw,
+ };
+
/*
* merge_bvec_fn() returns number of bytes it can accept
* at this offset
*/
- if (q->merge_bvec_fn(q, bio, bvec) < len) {
+ if (q->merge_bvec_fn(q, &bvm, bvec) < len) {
bvec->bv_page = NULL;
bvec->bv_len = 0;
bvec->bv_offset = 0;
@@ -1249,6 +1257,9 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
bp->bio1.bi_private = bi;
bp->bio2.bi_private = pool;
+ if (bio_integrity(bi))
+ bio_integrity_split(bi, bp, first_sectors);
+
return bp;
}
@@ -1290,6 +1301,7 @@ void bioset_free(struct bio_set *bs)
if (bs->bio_pool)
mempool_destroy(bs->bio_pool);
+ bioset_integrity_free(bs);
biovec_free_pools(bs);
kfree(bs);
@@ -1306,6 +1318,9 @@ struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size)
if (!bs->bio_pool)
goto bad;
+ if (bioset_integrity_create(bs, bio_pool_size))
+ goto bad;
+
if (!biovec_create_pools(bs, bvec_pool_size))
return bs;
@@ -1332,6 +1347,7 @@ static int __init init_bio(void)
{
bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+ bio_integrity_init_slab();
biovec_init_slabs();
fs_bio_set = bioset_create(BIO_POOL_SIZE, 2);
diff --git a/fs/buffer.c b/fs/buffer.c
index 0f51c0f7c266..d48caee12e2a 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1464,7 +1464,7 @@ static void invalidate_bh_lru(void *arg)
void invalidate_bh_lrus(void)
{
- on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
+ on_each_cpu(invalidate_bh_lru, NULL, 1);
}
EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
@@ -1691,11 +1691,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
*/
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
- } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
+ } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
+ buffer_dirty(bh)) {
WARN_ON(bh->b_size != blocksize);
err = get_block(inode, block, bh, 1);
if (err)
goto recover;
+ clear_buffer_delay(bh);
if (buffer_new(bh)) {
/* blockdev mappings never come here */
clear_buffer_new(bh);
@@ -1774,7 +1776,8 @@ recover:
bh = head;
/* Recovery: lock and submit the mapped buffers */
do {
- if (buffer_mapped(bh) && buffer_dirty(bh)) {
+ if (buffer_mapped(bh) && buffer_dirty(bh) &&
+ !buffer_delay(bh)) {
lock_buffer(bh);
mark_buffer_async_write(bh);
} else {
@@ -2061,6 +2064,7 @@ int generic_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;
+ int i_size_changed = 0;
copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
@@ -2073,12 +2077,21 @@ int generic_write_end(struct file *file, struct address_space *mapping,
*/
if (pos+copied > inode->i_size) {
i_size_write(inode, pos+copied);
- mark_inode_dirty(inode);
+ i_size_changed = 1;
}
unlock_page(page);
page_cache_release(page);
+ /*
+ * Don't mark the inode dirty under page lock. First, it unnecessarily
+ * makes the holding time of page lock longer. Second, it forces lock
+ * ordering of page lock and transaction start for journaling
+ * filesystems.
+ */
+ if (i_size_changed)
+ mark_inode_dirty(inode);
+
return copied;
}
EXPORT_SYMBOL(generic_write_end);
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 68e510b88457..3cb7cda3d780 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -373,6 +373,8 @@ static int chrdev_open(struct inode *inode, struct file *filp)
return -ENXIO;
new = container_of(kobj, struct cdev, kobj);
spin_lock(&cdev_lock);
+ /* Check i_cdev again in case somebody beat us to it while
+ we dropped the lock. */
p = inode->i_cdev;
if (!p) {
inode->i_cdev = p = new;
@@ -392,11 +394,8 @@ static int chrdev_open(struct inode *inode, struct file *filp)
cdev_put(p);
return -ENXIO;
}
- if (filp->f_op->open) {
- lock_kernel();
+ if (filp->f_op->open)
ret = filp->f_op->open(inode,filp);
- unlock_kernel();
- }
if (ret)
cdev_put(p);
return ret;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 86b4d5f405ae..22857c639df5 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -612,7 +612,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
if (retval < 0)
return (loff_t)retval;
}
- return remote_llseek(file, offset, origin);
+ return generic_file_llseek_unlocked(file, offset, origin);
}
struct file_system_type cifs_fs_type = {
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 97dba0d92348..c54eaab71a19 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -69,9 +69,11 @@
#include <linux/capi.h>
#include <linux/gigaset_dev.h>
+#ifdef CONFIG_BLOCK
#include <scsi/scsi.h>
#include <scsi/scsi_ioctl.h>
#include <scsi/sg.h>
+#endif
#include <asm/uaccess.h>
#include <linux/ethtool.h>
@@ -2024,6 +2026,7 @@ COMPATIBLE_IOCTL(GIO_UNISCRNMAP)
COMPATIBLE_IOCTL(PIO_UNISCRNMAP)
COMPATIBLE_IOCTL(PIO_FONTRESET)
COMPATIBLE_IOCTL(PIO_UNIMAPCLR)
+#ifdef CONFIG_BLOCK
/* Big S */
COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN)
COMPATIBLE_IOCTL(SCSI_IOCTL_DOORLOCK)
@@ -2033,6 +2036,7 @@ COMPATIBLE_IOCTL(SCSI_IOCTL_GET_BUS_NUMBER)
COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND)
COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST)
COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI)
+#endif
/* Big T */
COMPATIBLE_IOCTL(TUNSETNOCSUM)
COMPATIBLE_IOCTL(TUNSETDEBUG)
@@ -2103,6 +2107,7 @@ COMPATIBLE_IOCTL(SIOCGIFVLAN)
COMPATIBLE_IOCTL(SIOCSIFVLAN)
COMPATIBLE_IOCTL(SIOCBRADDBR)
COMPATIBLE_IOCTL(SIOCBRDELBR)
+#ifdef CONFIG_BLOCK
/* SG stuff */
COMPATIBLE_IOCTL(SG_SET_TIMEOUT)
COMPATIBLE_IOCTL(SG_GET_TIMEOUT)
@@ -2127,6 +2132,7 @@ COMPATIBLE_IOCTL(SG_SCSI_RESET)
COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE)
COMPATIBLE_IOCTL(SG_SET_KEEP_ORPHAN)
COMPATIBLE_IOCTL(SG_GET_KEEP_ORPHAN)
+#endif
/* PPP stuff */
COMPATIBLE_IOCTL(PPPIOCGFLAGS)
COMPATIBLE_IOCTL(PPPIOCSFLAGS)
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index ebbcf38fd33b..f976f303c196 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -15,6 +15,7 @@
#include <linux/poll.h>
#include <linux/signal.h>
#include <linux/spinlock.h>
+#include <linux/smp_lock.h>
#include <linux/dlm.h>
#include <linux/dlm_device.h>
@@ -618,13 +619,17 @@ static int device_open(struct inode *inode, struct file *file)
struct dlm_user_proc *proc;
struct dlm_ls *ls;
+ lock_kernel();
ls = dlm_find_lockspace_device(iminor(inode));
- if (!ls)
+ if (!ls) {
+ unlock_kernel();
return -ENOENT;
+ }
proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL);
if (!proc) {
dlm_put_lockspace(ls);
+ unlock_kernel();
return -ENOMEM;
}
@@ -636,6 +641,7 @@ static int device_open(struct inode *inode, struct file *file)
spin_lock_init(&proc->locks_spin);
init_waitqueue_head(&proc->wait);
file->private_data = proc;
+ unlock_kernel();
return 0;
}
@@ -870,6 +876,7 @@ static unsigned int device_poll(struct file *file, poll_table *wait)
static int ctl_device_open(struct inode *inode, struct file *file)
{
+ cycle_kernel_lock();
file->private_data = NULL;
return 0;
}
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 2258b8f654a6..24749bf0668f 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -30,6 +30,7 @@
#include <linux/security.h>
#include <linux/compat.h>
#include <linux/fs_stack.h>
+#include <linux/smp_lock.h>
#include "ecryptfs_kernel.h"
/**
@@ -277,9 +278,11 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag)
int rc = 0;
struct file *lower_file = NULL;
+ lock_kernel();
lower_file = ecryptfs_file_to_lower(file);
if (lower_file->f_op && lower_file->f_op->fasync)
rc = lower_file->f_op->fasync(fd, lower_file, flag);
+ unlock_kernel();
return rc;
}
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 9cc80b9cc8d8..495ab21b9832 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -47,7 +47,7 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
ext4_group_t block_group)
{
ext4_group_t actual_group;
- ext4_get_group_no_and_offset(sb, block, &actual_group, 0);
+ ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
if (actual_group == block_group)
return 1;
return 0;
@@ -121,12 +121,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
}
} else { /* For META_BG_BLOCK_GROUPS */
- int group_rel = (block_group -
- le32_to_cpu(sbi->s_es->s_first_meta_bg)) %
- EXT4_DESC_PER_BLOCK(sb);
- if (group_rel == 0 || group_rel == 1 ||
- (group_rel == EXT4_DESC_PER_BLOCK(sb) - 1))
- bit_max += 1;
+ bit_max += ext4_bg_num_gdb(sb, block_group);
}
if (block_group == sbi->s_groups_count - 1) {
@@ -295,7 +290,7 @@ err_out:
return 0;
}
/**
- * read_block_bitmap()
+ * ext4_read_block_bitmap()
* @sb: super block
* @block_group: given block group
*
@@ -305,7 +300,7 @@ err_out:
* Return buffer_head on success or NULL in case of failure.
*/
struct buffer_head *
-read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
{
struct ext4_group_desc * desc;
struct buffer_head * bh = NULL;
@@ -409,8 +404,7 @@ restart:
prev = rsv;
}
printk("Window map complete.\n");
- if (bad)
- BUG();
+ BUG_ON(bad);
}
#define rsv_window_dump(root, verbose) \
__rsv_window_dump((root), (verbose), __func__)
@@ -694,7 +688,7 @@ do_more:
count -= overflow;
}
brelse(bitmap_bh);
- bitmap_bh = read_block_bitmap(sb, block_group);
+ bitmap_bh = ext4_read_block_bitmap(sb, block_group);
if (!bitmap_bh)
goto error_return;
desc = ext4_get_group_desc (sb, block_group, &gd_bh);
@@ -810,6 +804,13 @@ do_more:
spin_unlock(sb_bgl_lock(sbi, block_group));
percpu_counter_add(&sbi->s_freeblocks_counter, count);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_blocks += count;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
+
/* We dirtied the bitmap block */
BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
err = ext4_journal_dirty_metadata(handle, bitmap_bh);
@@ -1598,23 +1599,35 @@ out:
/**
* ext4_has_free_blocks()
- * @sbi: in-core super block structure.
+ * @sbi: in-core super block structure.
+ * @nblocks: number of blocks needed
*
- * Check if filesystem has at least 1 free block available for allocation.
+ * Check if filesystem has free blocks available for allocation.
+ * Return the number of blocks available for allocation for this request;
+ * if the whole request can be satisfied, that is nblocks.
*/
-static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
+ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+ ext4_fsblk_t nblocks)
{
- ext4_fsblk_t free_blocks, root_blocks;
+ ext4_fsblk_t free_blocks;
+ ext4_fsblk_t root_blocks = 0;
free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
- root_blocks = ext4_r_blocks_count(sbi->s_es);
- if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
+
+ if (!capable(CAP_SYS_RESOURCE) &&
sbi->s_resuid != current->fsuid &&
- (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
- return 0;
- }
- return 1;
-}
+ (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
+ root_blocks = ext4_r_blocks_count(sbi->s_es);
+#ifdef CONFIG_SMP
+	/*
+	 * The per-cpu counter read above is only approximate.  When it is
+	 * within FBC_BATCH of the reserved amount (or already below it),
+	 * fall back to the precise sum.  The comparison is written without
+	 * a subtraction because ext4_fsblk_t is unsigned and
+	 * free_blocks - root_blocks would wrap when free < reserved.
+	 */
+	if (free_blocks < root_blocks + FBC_BATCH)
+		free_blocks =
+			percpu_counter_sum_and_set(&sbi->s_freeblocks_counter);
+#endif
+	/*
+	 * Nothing is left once the reserved blocks are set aside.  Check
+	 * this before subtracting: an unsigned underflow would otherwise
+	 * look like a huge amount of free space and grant the request.
+	 */
+	if (free_blocks <= root_blocks)
+		return 0;
+	/* Partial satisfaction: hand back what is actually available. */
+	if (free_blocks - root_blocks < nblocks)
+		return free_blocks - root_blocks;
+	return nblocks;
+}
+
/**
* ext4_should_retry_alloc()
@@ -1630,7 +1643,7 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
*/
int ext4_should_retry_alloc(struct super_block *sb, int *retries)
{
- if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3)
+ if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3)
return 0;
jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
@@ -1639,20 +1652,24 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
}
/**
- * ext4_new_blocks_old() -- core block(s) allocation function
+ * ext4_old_new_blocks() -- core block bitmap based block allocation function
+ *
* @handle: handle to this transaction
* @inode: file inode
* @goal: given target block(filesystem wide)
* @count: target number of blocks to allocate
* @errp: error code
*
- * ext4_new_blocks uses a goal block to assist allocation. It tries to
- * allocate block(s) from the block group contains the goal block first. If that
- * fails, it will try to allocate block(s) from other block groups without
- * any specific goal block.
+ * ext4_old_new_blocks uses a goal block to assist allocation and look up
+ * the block bitmap directly to do block allocation. It tries to
+ * allocate block(s) from the block group contains the goal block first. If
+ * that fails, it will try to allocate block(s) from other block groups
+ * without any specific goal block.
+ *
+ * This function is called when -o nomballoc mount option is enabled
*
*/
-ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
+ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp)
{
struct buffer_head *bitmap_bh = NULL;
@@ -1676,13 +1693,26 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
ext4_group_t ngroups;
unsigned long num = *count;
- *errp = -ENOSPC;
sb = inode->i_sb;
if (!sb) {
+ *errp = -ENODEV;
printk("ext4_new_block: nonexistent device");
return 0;
}
+ sbi = EXT4_SB(sb);
+ if (!EXT4_I(inode)->i_delalloc_reserved_flag) {
+ /*
+ * With delalloc we already reserved the blocks
+ */
+ *count = ext4_has_free_blocks(sbi, *count);
+ }
+ if (*count == 0) {
+ *errp = -ENOSPC;
+ return 0; /*return with ENOSPC error */
+ }
+ num = *count;
+
/*
* Check quota for allocation of this block.
*/
@@ -1706,11 +1736,6 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
my_rsv = &block_i->rsv_window_node;
- if (!ext4_has_free_blocks(sbi)) {
- *errp = -ENOSPC;
- goto out;
- }
-
/*
* First, test whether the goal block is free.
*/
@@ -1734,7 +1759,7 @@ retry_alloc:
my_rsv = NULL;
if (free_blocks > 0) {
- bitmap_bh = read_block_bitmap(sb, group_no);
+ bitmap_bh = ext4_read_block_bitmap(sb, group_no);
if (!bitmap_bh)
goto io_error;
grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
@@ -1770,7 +1795,7 @@ retry_alloc:
continue;
brelse(bitmap_bh);
- bitmap_bh = read_block_bitmap(sb, group_no);
+ bitmap_bh = ext4_read_block_bitmap(sb, group_no);
if (!bitmap_bh)
goto io_error;
/*
@@ -1882,7 +1907,15 @@ allocated:
le16_add_cpu(&gdp->bg_free_blocks_count, -num);
gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
spin_unlock(sb_bgl_lock(sbi, group_no));
- percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+ if (!EXT4_I(inode)->i_delalloc_reserved_flag)
+ percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_blocks -= num;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
err = ext4_journal_dirty_metadata(handle, gdp_bh);
@@ -1915,46 +1948,104 @@ out:
return 0;
}
-ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, int *errp)
+#define EXT4_META_BLOCK 0x1
+
+static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ unsigned long *count, int *errp, int flags)
{
struct ext4_allocation_request ar;
ext4_fsblk_t ret;
if (!test_opt(inode->i_sb, MBALLOC)) {
- unsigned long count = 1;
- ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
- return ret;
+ return ext4_old_new_blocks(handle, inode, goal, count, errp);
}
memset(&ar, 0, sizeof(ar));
+ /* Fill with neighbour allocated blocks */
+
ar.inode = inode;
ar.goal = goal;
- ar.len = 1;
+ ar.len = *count;
+ ar.logical = iblock;
+
+ if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
+ /* enable in-core preallocation for data block allocation */
+ ar.flags = EXT4_MB_HINT_DATA;
+ else
+ /* disable in-core preallocation for non-regular files */
+ ar.flags = 0;
+
ret = ext4_mb_new_blocks(handle, &ar, errp);
+ *count = ar.len;
return ret;
}
-ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+/*
+ * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
+ *
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @goal: given target block(filesystem wide)
+ * @count: total number of blocks needed
+ * @errp: error code
+ *
+ * Returns the first allocated block number on success; *count is updated
+ * to the number of blocks allocated and any error is stored in *errp.
+ */
+ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp)
{
- struct ext4_allocation_request ar;
ext4_fsblk_t ret;
-
- if (!test_opt(inode->i_sb, MBALLOC)) {
- ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
- return ret;
+ ret = do_blk_alloc(handle, inode, 0, goal,
+ count, errp, EXT4_META_BLOCK);
+ /*
+ * Account for the allocated meta blocks
+ */
+ if (!(*errp)) {
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ EXT4_I(inode)->i_allocated_meta_blocks += *count;
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
}
-
- memset(&ar, 0, sizeof(ar));
- ar.inode = inode;
- ar.goal = goal;
- ar.len = *count;
- ret = ext4_mb_new_blocks(handle, &ar, errp);
- *count = ar.len;
return ret;
}
+/*
+ * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks
+ *
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @goal: given target block(filesystem wide)
+ * @errp: error code
+ *
+ * Single-block convenience wrapper: calls ext4_new_meta_blocks() with a
+ * local count of 1 and returns its result unchanged. The count written
+ * back by ext4_new_meta_blocks() is discarded.
+ *
+ * Return allocated block number on success
+ */
+ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t goal, int *errp)
+{
+ unsigned long count = 1;
+ return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
+}
+
+/*
+ * ext4_new_blocks() -- allocate data blocks
+ *
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @iblock: logical block number, passed through to do_blk_alloc()
+ * @goal: given target block(filesystem wide)
+ * @count: total number of blocks needed
+ * @errp: error code
+ *
+ * Thin wrapper around do_blk_alloc() with flags == 0, i.e. without
+ * EXT4_META_BLOCK set.
+ *
+ * Returns the value of do_blk_alloc(): the first allocated block number
+ * on success. *count and *errp are passed straight through and updated
+ * by the allocator.
+ */
+
+ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ unsigned long *count, int *errp)
+{
+ return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
+}
/**
* ext4_count_free_blocks() -- count filesystem free blocks
@@ -1986,7 +2077,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
continue;
desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
brelse(bitmap_bh);
- bitmap_bh = read_block_bitmap(sb, i);
+ bitmap_bh = ext4_read_block_bitmap(sb, i);
if (bitmap_bh == NULL)
continue;
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 2bf0331ea194..d3d23d73c08b 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -129,7 +129,8 @@ static int ext4_readdir(struct file * filp,
struct buffer_head *bh = NULL;
map_bh.b_state = 0;
- err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0);
+ err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh,
+ 0, 0, 0);
if (err > 0) {
pgoff_t index = map_bh.b_blocknr >>
(PAGE_CACHE_SHIFT - inode->i_blkbits);
@@ -272,7 +273,7 @@ static void free_rb_tree_fname(struct rb_root *root)
while (n) {
/* Do the node's children first */
- if ((n)->rb_left) {
+ if (n->rb_left) {
n = n->rb_left;
continue;
}
@@ -301,24 +302,18 @@ static void free_rb_tree_fname(struct rb_root *root)
parent->rb_right = NULL;
n = parent;
}
- root->rb_node = NULL;
}
-static struct dir_private_info *create_dir_info(loff_t pos)
+static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos)
{
struct dir_private_info *p;
- p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
+ p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
if (!p)
return NULL;
- p->root.rb_node = NULL;
- p->curr_node = NULL;
- p->extra_fname = NULL;
- p->last_pos = 0;
p->curr_hash = pos2maj_hash(pos);
p->curr_minor_hash = pos2min_hash(pos);
- p->next_hash = 0;
return p;
}
@@ -433,7 +428,7 @@ static int ext4_dx_readdir(struct file * filp,
int ret;
if (!info) {
- info = create_dir_info(filp->f_pos);
+ info = ext4_htree_create_dir_info(filp->f_pos);
if (!info)
return -ENOMEM;
filp->private_data = info;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8158083f7ac0..303e41cf7b14 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -22,7 +22,7 @@
#include "ext4_i.h"
/*
- * The second extended filesystem constants/structures
+ * The fourth extended filesystem constants/structures
*/
/*
@@ -45,7 +45,7 @@
#define ext4_debug(f, a...) \
do { \
printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \
- __FILE__, __LINE__, __FUNCTION__); \
+ __FILE__, __LINE__, __func__); \
printk (KERN_DEBUG f, ## a); \
} while (0)
#else
@@ -74,6 +74,9 @@
#define EXT4_MB_HINT_GOAL_ONLY 256
/* goal is meaningful */
#define EXT4_MB_HINT_TRY_GOAL 512
+/* blocks already pre-reserved by delayed allocation */
+#define EXT4_MB_DELALLOC_RESERVED 1024
+
struct ext4_allocation_request {
/* target inode for block we're allocating */
@@ -170,6 +173,15 @@ struct ext4_group_desc
__u32 bg_reserved2[3];
};
+/*
+ * Structure of a flex block group info: running free-inode and
+ * free-block totals for one flex group. The block allocator adjusts
+ * free_blocks while holding sb_bgl_lock() for the flex group index.
+ */
+
+struct flex_groups {
+ __u32 free_inodes; /* free inode total for this flex group */
+ __u32 free_blocks; /* free block total for this flex group */
+};
+
#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */
#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */
#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */
@@ -527,6 +539,7 @@ do { \
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
#define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */
+#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
#ifndef _LINUX_EXT2_FS_H
#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
@@ -647,7 +660,10 @@ struct ext4_super_block {
__le16 s_mmp_interval; /* # seconds to wait in MMP checking */
__le64 s_mmp_block; /* Block for multi-mount protection */
__le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
- __u32 s_reserved[163]; /* Padding to the end of the block */
+ __u8 s_log_groups_per_flex; /* FLEX_BG group size */
+ __u8 s_reserved_char_pad2;
+ __le16 s_reserved_pad;
+ __u32 s_reserved[162]; /* Padding to the end of the block */
};
#ifdef __KERNEL__
@@ -958,12 +974,17 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
ext4_group_t group);
-extern ext4_fsblk_t ext4_new_block (handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, int *errp);
-extern ext4_fsblk_t ext4_new_blocks (handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp);
-extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ unsigned long *count, int *errp);
+extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp);
+extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+ ext4_fsblk_t nblocks);
extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
ext4_fsblk_t block, unsigned long count, int metadata);
extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,
@@ -1016,9 +1037,14 @@ extern int __init init_ext4_mballoc(void);
extern void exit_ext4_mballoc(void);
extern void ext4_mb_free_blocks(handle_t *, struct inode *,
unsigned long, unsigned long, int, unsigned long *);
+extern int ext4_mb_add_more_groupinfo(struct super_block *sb,
+ ext4_group_t i, struct ext4_group_desc *desc);
+extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
+ ext4_grpblk_t add);
/* inode.c */
+void ext4_da_release_space(struct inode *inode, int used, int to_free);
int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
struct buffer_head *bh, ext4_fsblk_t blocknr);
struct buffer_head *ext4_getblk(handle_t *, struct inode *,
@@ -1033,19 +1059,23 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
extern struct inode *ext4_iget(struct super_block *, unsigned long);
extern int ext4_write_inode (struct inode *, int);
extern int ext4_setattr (struct dentry *, struct iattr *);
+extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat);
extern void ext4_delete_inode (struct inode *);
extern int ext4_sync_inode (handle_t *, struct inode *);
extern void ext4_discard_reservation (struct inode *);
extern void ext4_dirty_inode(struct inode *);
extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
+extern int ext4_can_truncate(struct inode *inode);
extern void ext4_truncate (struct inode *);
extern void ext4_set_inode_flags(struct inode *);
extern void ext4_get_inode_flags(struct ext4_inode_info *);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
-extern int ext4_block_truncate_page(handle_t *handle, struct page *page,
+extern int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from);
+extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
@@ -1159,10 +1189,21 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
}
+/*
+ * Map a block group number to its flex group index by shifting it right
+ * by s_log_groups_per_flex.
+ */
+static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
+					     ext4_group_t block_group)
+{
+	ext4_group_t flex = block_group >> sbi->s_log_groups_per_flex;
+
+	return flex;
+}
+
+/*
+ * Number of block groups per flex group: 1 shifted left by
+ * s_log_groups_per_flex.
+ */
+static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
+{
+	unsigned int shift = sbi->s_log_groups_per_flex;
+
+	return 1 << shift;
+}
+
#define ext4_std_error(sb, errno) \
do { \
if ((errno)) \
- __ext4_std_error((sb), __FUNCTION__, (errno)); \
+ __ext4_std_error((sb), __func__, (errno)); \
} while (0)
/*
@@ -1191,7 +1232,7 @@ extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
ext4_lblk_t iblock,
unsigned long max_blocks, struct buffer_head *bh_result,
int create, int extend_disksize);
-extern void ext4_ext_truncate(struct inode *, struct page *);
+extern void ext4_ext_truncate(struct inode *);
extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
@@ -1199,7 +1240,7 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
sector_t block, unsigned long max_blocks,
struct buffer_head *bh, int create,
- int extend_disksize);
+ int extend_disksize, int flag);
#endif /* __KERNEL__ */
#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 75333b595fab..6c166c0a54b7 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -212,6 +212,7 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
(le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
}
+extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
extern int ext4_extent_tree_init(handle_t *, struct inode *);
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index 26a4ae255d79..ef7409f0e7e4 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -79,7 +79,7 @@ struct ext4_ext_cache {
};
/*
- * third extended file system inode data in memory
+ * fourth extended file system inode data in memory
*/
struct ext4_inode_info {
__le32 i_data[15]; /* unconverted */
@@ -150,6 +150,7 @@ struct ext4_inode_info {
*/
struct rw_semaphore i_data_sem;
struct inode vfs_inode;
+ struct jbd2_inode jinode;
unsigned long i_ext_generation;
struct ext4_ext_cache i_cached_extent;
@@ -162,6 +163,13 @@ struct ext4_inode_info {
/* mballoc */
struct list_head i_prealloc_list;
spinlock_t i_prealloc_lock;
+
+ /* allocation reservation info for delalloc */
+ unsigned long i_reserved_data_blocks;
+ unsigned long i_reserved_meta_blocks;
+ unsigned long i_allocated_meta_blocks;
+ unsigned short i_delalloc_reserved_flag;
+ spinlock_t i_block_reservation_lock;
};
#endif /* _EXT4_I */
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 9255a7d28b24..eb8bc3afe6e9 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -142,19 +142,17 @@ int __ext4_journal_dirty_metadata(const char *where,
handle_t *handle, struct buffer_head *bh);
#define ext4_journal_get_undo_access(handle, bh) \
- __ext4_journal_get_undo_access(__FUNCTION__, (handle), (bh))
+ __ext4_journal_get_undo_access(__func__, (handle), (bh))
#define ext4_journal_get_write_access(handle, bh) \
- __ext4_journal_get_write_access(__FUNCTION__, (handle), (bh))
+ __ext4_journal_get_write_access(__func__, (handle), (bh))
#define ext4_journal_revoke(handle, blocknr, bh) \
- __ext4_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh))
+ __ext4_journal_revoke(__func__, (handle), (blocknr), (bh))
#define ext4_journal_get_create_access(handle, bh) \
- __ext4_journal_get_create_access(__FUNCTION__, (handle), (bh))
+ __ext4_journal_get_create_access(__func__, (handle), (bh))
#define ext4_journal_dirty_metadata(handle, bh) \
- __ext4_journal_dirty_metadata(__FUNCTION__, (handle), (bh))
+ __ext4_journal_dirty_metadata(__func__, (handle), (bh))
#define ext4_journal_forget(handle, bh) \
- __ext4_journal_forget(__FUNCTION__, (handle), (bh))
-
-int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh);
+ __ext4_journal_forget(__func__, (handle), (bh))
handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
int __ext4_journal_stop(const char *where, handle_t *handle);
@@ -165,7 +163,7 @@ static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
}
#define ext4_journal_stop(handle) \
- __ext4_journal_stop(__FUNCTION__, (handle))
+ __ext4_journal_stop(__func__, (handle))
static inline handle_t *ext4_journal_current_handle(void)
{
@@ -192,6 +190,11 @@ static inline int ext4_journal_force_commit(journal_t *journal)
return jbd2_journal_force_commit(journal);
}
+static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
+{
+ return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
+}
+
/* super.c */
int ext4_force_commit(struct super_block *sb);
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 5802e69f2191..6300226d5531 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -25,7 +25,7 @@
#include <linux/rbtree.h>
/*
- * third extended-fs super-block data in memory
+ * fourth extended-fs super-block data in memory
*/
struct ext4_sb_info {
unsigned long s_desc_size; /* Size of a group descriptor in bytes */
@@ -143,6 +143,9 @@ struct ext4_sb_info {
/* locality groups */
struct ext4_locality_group *s_locality_groups;
+
+ unsigned int s_log_groups_per_flex;
+ struct flex_groups *s_flex_groups;
};
#endif /* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 47929c4e3dae..42c4c0c892ed 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -92,17 +92,16 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
}
-static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed)
+static int ext4_ext_journal_restart(handle_t *handle, int needed)
{
int err;
if (handle->h_buffer_credits > needed)
- return handle;
- if (!ext4_journal_extend(handle, needed))
- return handle;
- err = ext4_journal_restart(handle, needed);
-
- return handle;
+ return 0;
+ err = ext4_journal_extend(handle, needed);
+ if (err)
+ return err;
+ return ext4_journal_restart(handle, needed);
}
/*
@@ -180,15 +179,18 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
return bg_start + colour + block;
}
+/*
+ * Allocation for a metadata block
+ */
static ext4_fsblk_t
-ext4_ext_new_block(handle_t *handle, struct inode *inode,
+ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
struct ext4_extent *ex, int *err)
{
ext4_fsblk_t goal, newblock;
goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
- newblock = ext4_new_block(handle, inode, goal, err);
+ newblock = ext4_new_meta_block(handle, inode, goal, err);
return newblock;
}
@@ -246,6 +248,36 @@ static int ext4_ext_space_root_idx(struct inode *inode)
return size;
}
+/*
+ * Calculate the number of metadata blocks needed
+ * to allocate @blocks.
+ * Worst case is one block per extent.
+ */
+int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
+{
+ int lcap, icap, rcap, leafs, idxs, num;
+ int newextents = blocks;
+
+ rcap = ext4_ext_space_root_idx(inode);
+ lcap = ext4_ext_space_block(inode);
+ icap = ext4_ext_space_block_idx(inode);
+
+ /* number of new leaf blocks needed */
+ num = leafs = (newextents + lcap - 1) / lcap;
+
+ /*
+ * Worst case, we need separate index block(s)
+ * to link all new leaf blocks
+ */
+ idxs = (leafs + icap - 1) / icap;
+ do {
+ num += idxs;
+ idxs = (idxs + icap - 1) / icap;
+ } while (idxs > rcap);
+
+ return num;
+}
+
static int
ext4_ext_max_entries(struct inode *inode, int depth)
{
@@ -524,6 +556,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
alloc = 1;
}
path[0].p_hdr = eh;
+ path[0].p_bh = NULL;
i = depth;
/* walk through the tree */
@@ -552,12 +585,14 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
}
path[ppos].p_depth = i;
- path[ppos].p_hdr = eh;
path[ppos].p_ext = NULL;
path[ppos].p_idx = NULL;
/* find extent */
ext4_ext_binsearch(inode, path + ppos, block);
+ /* if not an empty leaf */
+ if (path[ppos].p_ext)
+ path[ppos].p_block = ext_pblock(path[ppos].p_ext);
ext4_ext_show_path(inode, path);
@@ -688,7 +723,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
/* allocate all needed blocks */
ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
for (a = 0; a < depth - at; a++) {
- newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+ newblock = ext4_ext_new_meta_block(handle, inode, path,
+ newext, &err);
if (newblock == 0)
goto cleanup;
ablocks[a] = newblock;
@@ -884,7 +920,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
ext4_fsblk_t newblock;
int err = 0;
- newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+ newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
if (newblock == 0)
return err;
@@ -981,6 +1017,8 @@ repeat:
/* if we found index with free entry, then use that
* entry: create all needed subtree and add new leaf */
err = ext4_ext_split(handle, inode, path, newext, i);
+ if (err)
+ goto out;
/* refill path */
ext4_ext_drop_refs(path);
@@ -1883,11 +1921,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
#endif
- handle = ext4_ext_journal_restart(handle, credits);
- if (IS_ERR(handle)) {
- err = PTR_ERR(handle);
+ err = ext4_ext_journal_restart(handle, credits);
+ if (err)
goto out;
- }
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
@@ -2529,6 +2565,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
int err = 0, depth, ret;
unsigned long allocated = 0;
struct ext4_allocation_request ar;
+ loff_t disksize;
__clear_bit(BH_New, &bh_result->b_state);
ext_debug("blocks %u/%lu requested for inode %u\n",
@@ -2616,8 +2653,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
*/
if (allocated > max_blocks)
allocated = max_blocks;
- /* mark the buffer unwritten */
- __set_bit(BH_Unwritten, &bh_result->b_state);
+ set_buffer_unwritten(bh_result);
goto out2;
}
@@ -2716,14 +2752,19 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
goto out2;
}
- if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize)
- EXT4_I(inode)->i_disksize = inode->i_size;
-
/* previous routine could use block we allocated */
newblock = ext_pblock(&newex);
allocated = ext4_ext_get_actual_len(&newex);
outnew:
- __set_bit(BH_New, &bh_result->b_state);
+ if (extend_disksize) {
+ disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits;
+ if (disksize > i_size_read(inode))
+ disksize = i_size_read(inode);
+ if (disksize > EXT4_I(inode)->i_disksize)
+ EXT4_I(inode)->i_disksize = disksize;
+ }
+
+ set_buffer_new(bh_result);
/* Cache only when it is _not_ an uninitialized extent */
if (create != EXT4_CREATE_UNINITIALIZED_EXT)
@@ -2733,7 +2774,7 @@ out:
if (allocated > max_blocks)
allocated = max_blocks;
ext4_ext_show_leaf(inode, path);
- __set_bit(BH_Mapped, &bh_result->b_state);
+ set_buffer_mapped(bh_result);
bh_result->b_bdev = inode->i_sb->s_bdev;
bh_result->b_blocknr = newblock;
out2:
@@ -2744,7 +2785,7 @@ out2:
return err ? err : allocated;
}
-void ext4_ext_truncate(struct inode * inode, struct page *page)
+void ext4_ext_truncate(struct inode *inode)
{
struct address_space *mapping = inode->i_mapping;
struct super_block *sb = inode->i_sb;
@@ -2757,18 +2798,14 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
*/
err = ext4_writepage_trans_blocks(inode) + 3;
handle = ext4_journal_start(inode, err);
- if (IS_ERR(handle)) {
- if (page) {
- clear_highpage(page);
- flush_dcache_page(page);
- unlock_page(page);
- page_cache_release(page);
- }
+ if (IS_ERR(handle))
return;
- }
- if (page)
- ext4_block_truncate_page(handle, page, mapping, inode->i_size);
+ if (inode->i_size & (sb->s_blocksize - 1))
+ ext4_block_truncate_page(handle, mapping, inode->i_size);
+
+ if (ext4_orphan_add(handle, inode))
+ goto out_stop;
down_write(&EXT4_I(inode)->i_data_sem);
ext4_ext_invalidate_cache(inode);
@@ -2780,8 +2817,6 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
* Probably we need not scan at all,
* because page truncation is enough.
*/
- if (ext4_orphan_add(handle, inode))
- goto out_stop;
/* we have to know where to truncate from in crash case */
EXT4_I(inode)->i_disksize = inode->i_size;
@@ -2798,6 +2833,7 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
handle->h_sync = 1;
out_stop:
+ up_write(&EXT4_I(inode)->i_data_sem);
/*
* If this was a simple ftruncate() and the file will remain alive,
* then we need to clear up the orphan record which we created above.
@@ -2808,7 +2844,6 @@ out_stop:
if (inode->i_nlink)
ext4_orphan_del(handle, inode);
- up_write(&EXT4_I(inode)->i_data_sem);
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
@@ -2911,7 +2946,7 @@ retry:
}
ret = ext4_get_blocks_wrap(handle, inode, block,
max_blocks, &map_bh,
- EXT4_CREATE_UNINITIALIZED_EXT, 0);
+ EXT4_CREATE_UNINITIALIZED_EXT, 0, 0);
if (ret <= 0) {
#ifdef EXT4FS_DEBUG
WARN_ON(ret <= 0);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 4159be6366ab..430eb7978db4 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -123,6 +123,23 @@ force_commit:
return ret;
}
+static struct vm_operations_struct ext4_file_vm_ops = {
+ .fault = filemap_fault,
+ .page_mkwrite = ext4_page_mkwrite,
+};
+
+static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct address_space *mapping = file->f_mapping;
+
+ if (!mapping->a_ops->readpage)
+ return -ENOEXEC;
+ file_accessed(file);
+ vma->vm_ops = &ext4_file_vm_ops;
+ vma->vm_flags |= VM_CAN_NONLINEAR;
+ return 0;
+}
+
const struct file_operations ext4_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
@@ -133,7 +150,7 @@ const struct file_operations ext4_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
#endif
- .mmap = generic_file_mmap,
+ .mmap = ext4_file_mmap,
.open = generic_file_open,
.release = ext4_release_file,
.fsync = ext4_sync_file,
@@ -144,6 +161,7 @@ const struct file_operations ext4_file_operations = {
const struct inode_operations ext4_file_inode_operations = {
.truncate = ext4_truncate,
.setattr = ext4_setattr,
+ .getattr = ext4_getattr,
#ifdef CONFIG_EXT4DEV_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 1c8ba48d4f8d..a45c3737ad31 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -27,6 +27,7 @@
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/jbd2.h>
+#include <linux/blkdev.h>
#include "ext4.h"
#include "ext4_jbd2.h"
@@ -45,6 +46,7 @@
int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
{
struct inode *inode = dentry->d_inode;
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
int ret = 0;
J_ASSERT(ext4_journal_current_handle() == NULL);
@@ -85,6 +87,8 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
.nr_to_write = 0, /* sys_fsync did this */
};
ret = sync_inode(inode, &wbc);
+ if (journal && (journal->j_flags & JBD2_BARRIER))
+ blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
}
out:
return ret;
diff --git a/fs/ext4/group.h b/fs/ext4/group.h
index 7eb0604e7eea..c2c0a8d06d0e 100644
--- a/fs/ext4/group.h
+++ b/fs/ext4/group.h
@@ -13,7 +13,7 @@ extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
struct ext4_group_desc *gdp);
extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
struct ext4_group_desc *gdp);
-struct buffer_head *read_block_bitmap(struct super_block *sb,
+struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
ext4_group_t block_group);
extern unsigned ext4_init_block_bitmap(struct super_block *sb,
struct buffer_head *bh,
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index c6efbab0c801..a92eb305344f 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -157,6 +157,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
struct ext4_super_block * es;
struct ext4_sb_info *sbi;
int fatal = 0, err;
+ ext4_group_t flex_group;
if (atomic_read(&inode->i_count) > 1) {
printk ("ext4_free_inode: inode has count=%d\n",
@@ -232,6 +233,12 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
if (is_directory)
percpu_counter_dec(&sbi->s_dirs_counter);
+ if (sbi->s_log_groups_per_flex) {
+ flex_group = ext4_flex_group(sbi, block_group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_inodes++;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
}
BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
err = ext4_journal_dirty_metadata(handle, bh2);
@@ -286,6 +293,80 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
return ret;
}
+#define free_block_ratio 10
+
+static int find_group_flex(struct super_block *sb, struct inode *parent,
+ ext4_group_t *best_group)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_desc *desc;
+ struct buffer_head *bh;
+ struct flex_groups *flex_group = sbi->s_flex_groups;
+ ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
+ ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
+ ext4_group_t ngroups = sbi->s_groups_count;
+ int flex_size = ext4_flex_bg_size(sbi);
+ ext4_group_t best_flex = parent_fbg_group;
+ int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
+ int flexbg_free_blocks;
+ int flex_freeb_ratio;
+ ext4_group_t n_fbg_groups;
+ ext4_group_t i;
+
+ n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >>
+ sbi->s_log_groups_per_flex;
+
+find_close_to_parent:
+ flexbg_free_blocks = flex_group[best_flex].free_blocks;
+ flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
+ if (flex_group[best_flex].free_inodes &&
+ flex_freeb_ratio > free_block_ratio)
+ goto found_flexbg;
+
+ if (best_flex && best_flex == parent_fbg_group) {
+ best_flex--;
+ goto find_close_to_parent;
+ }
+
+ for (i = 0; i < n_fbg_groups; i++) {
+ if (i == parent_fbg_group || i == parent_fbg_group - 1)
+ continue;
+
+ flexbg_free_blocks = flex_group[i].free_blocks;
+ flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
+
+ if (flex_freeb_ratio > free_block_ratio &&
+ flex_group[i].free_inodes) {
+ best_flex = i;
+ goto found_flexbg;
+ }
+
+ if (best_flex < 0 ||
+ (flex_group[i].free_blocks >
+ flex_group[best_flex].free_blocks &&
+ flex_group[i].free_inodes))
+ best_flex = i;
+ }
+
+ if (!flex_group[best_flex].free_inodes ||
+ !flex_group[best_flex].free_blocks)
+ return -1;
+
+found_flexbg:
+ for (i = best_flex * flex_size; i < ngroups &&
+ i < (best_flex + 1) * flex_size; i++) {
+ desc = ext4_get_group_desc(sb, i, &bh);
+ if (le16_to_cpu(desc->bg_free_inodes_count)) {
+ *best_group = i;
+ goto out;
+ }
+ }
+
+ return -1;
+out:
+ return 0;
+}
+
/*
* Orlov's allocator for directories.
*
@@ -501,6 +582,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
struct inode *ret;
ext4_group_t i;
int free = 0;
+ ext4_group_t flex_group;
/* Cannot create files in a deleted directory */
if (!dir || !dir->i_nlink)
@@ -514,6 +596,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
sbi = EXT4_SB(sb);
es = sbi->s_es;
+
+ if (sbi->s_log_groups_per_flex) {
+ ret2 = find_group_flex(sb, dir, &group);
+ goto got_group;
+ }
+
if (S_ISDIR(mode)) {
if (test_opt (sb, OLDALLOC))
ret2 = find_group_dir(sb, dir, &group);
@@ -522,6 +610,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
} else
ret2 = find_group_other(sb, dir, &group);
+got_group:
err = -ENOSPC;
if (ret2 == -1)
goto out;
@@ -600,7 +689,7 @@ got:
/* We may have to initialize the block bitmap if it isn't already */
if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
- struct buffer_head *block_bh = read_block_bitmap(sb, group);
+ struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group);
BUFFER_TRACE(block_bh, "get block bitmap access");
err = ext4_journal_get_write_access(handle, block_bh);
@@ -676,6 +765,13 @@ got:
percpu_counter_inc(&sbi->s_dirs_counter);
sb->s_dirt = 1;
+ if (sbi->s_log_groups_per_flex) {
+ flex_group = ext4_flex_group(sbi, group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_inodes--;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
+
inode->i_uid = current->fsuid;
if (test_opt (sb, GRPID))
inode->i_gid = dir->i_gid;
@@ -740,14 +836,10 @@ got:
goto fail_free_drop;
if (test_opt(sb, EXTENTS)) {
- /* set extent flag only for diretory, file and normal symlink*/
+ /* set extent flag only for directory, file and normal symlink */
if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
ext4_ext_tree_init(handle, inode);
- err = ext4_update_incompat_feature(handle, sb,
- EXT4_FEATURE_INCOMPAT_EXTENTS);
- if (err)
- goto fail_free_drop;
}
}
@@ -817,6 +909,14 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
if (IS_ERR(inode))
goto iget_failed;
+ /*
+ * If the orphan has i_nlink > 0 then it should be able to be
+ * truncated; otherwise it won't be removed from the orphan list
+ * during processing and an infinite loop will result.
+ */
+ if (inode->i_nlink && !ext4_can_truncate(inode))
+ goto bad_orphan;
+
if (NEXT_ORPHAN(inode) > max_ino)
goto bad_orphan;
brelse(bitmap_bh);
@@ -838,6 +938,7 @@ bad_orphan:
printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
NEXT_ORPHAN(inode));
printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
+ printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
/* Avoid freeing blocks if we got a bad deleted inode */
if (inode->i_nlink == 0)
inode->i_blocks = 0;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8d9707746413..8ca2763df091 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -32,12 +32,23 @@
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
+#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
+#include "ext4_extents.h"
+
+static inline int ext4_begin_ordered_truncate(struct inode *inode,
+ loff_t new_size)
+{
+ return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode,
+ new_size);
+}
+
+static void ext4_invalidatepage(struct page *page, unsigned long offset);
/*
* Test whether an inode is a fast symlink.
@@ -181,6 +192,8 @@ void ext4_delete_inode (struct inode * inode)
{
handle_t *handle;
+ if (ext4_should_order_data(inode))
+ ext4_begin_ordered_truncate(inode, 0);
truncate_inode_pages(&inode->i_data, 0);
if (is_bad_inode(inode))
@@ -508,11 +521,12 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
* direct blocks
*/
static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, int indirect_blks, int blks,
- ext4_fsblk_t new_blocks[4], int *err)
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ int indirect_blks, int blks,
+ ext4_fsblk_t new_blocks[4], int *err)
{
int target, i;
- unsigned long count = 0;
+ unsigned long count = 0, blk_allocated = 0;
int index = 0;
ext4_fsblk_t current_block = 0;
int ret = 0;
@@ -525,12 +539,13 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
* the first direct block of this branch. That's the
* minimum number of blocks need to allocate(required)
*/
- target = blks + indirect_blks;
-
- while (1) {
+ /* first we try to allocate the indirect blocks */
+ target = indirect_blks;
+ while (target > 0) {
count = target;
/* allocating blocks for indirect blocks and direct blocks */
- current_block = ext4_new_blocks(handle,inode,goal,&count,err);
+ current_block = ext4_new_meta_blocks(handle, inode,
+ goal, &count, err);
if (*err)
goto failed_out;
@@ -540,16 +555,48 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
new_blocks[index++] = current_block++;
count--;
}
-
- if (count > 0)
+ if (count > 0) {
+ /*
+ * save the new block number
+ * for the first direct block
+ */
+ new_blocks[index] = current_block;
+ printk(KERN_INFO "%s returned more blocks than "
+ "requested\n", __func__);
+ WARN_ON(1);
break;
+ }
}
- /* save the new block number for the first direct block */
- new_blocks[index] = current_block;
-
+ target = blks - count ;
+ blk_allocated = count;
+ if (!target)
+ goto allocated;
+ /* Now allocate data blocks */
+ count = target;
+ /* allocating blocks for data blocks */
+ current_block = ext4_new_blocks(handle, inode, iblock,
+ goal, &count, err);
+ if (*err && (target == blks)) {
+ /*
+ * if the allocation failed and we didn't allocate
+ * any blocks before
+ */
+ goto failed_out;
+ }
+ if (!*err) {
+ if (target == blks) {
+ /*
+ * save the new block number
+ * for the first direct block
+ */
+ new_blocks[index] = current_block;
+ }
+ blk_allocated += count;
+ }
+allocated:
/* total number of blocks allocated for direct blocks */
- ret = count;
+ ret = blk_allocated;
*err = 0;
return ret;
failed_out:
@@ -584,8 +631,9 @@ failed_out:
* as described above and return 0.
*/
static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
- int indirect_blks, int *blks, ext4_fsblk_t goal,
- ext4_lblk_t *offsets, Indirect *branch)
+ ext4_lblk_t iblock, int indirect_blks,
+ int *blks, ext4_fsblk_t goal,
+ ext4_lblk_t *offsets, Indirect *branch)
{
int blocksize = inode->i_sb->s_blocksize;
int i, n = 0;
@@ -595,7 +643,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
ext4_fsblk_t new_blocks[4];
ext4_fsblk_t current_block;
- num = ext4_alloc_blocks(handle, inode, goal, indirect_blks,
+ num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
*blks, new_blocks, &err);
if (err)
return err;
@@ -799,6 +847,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
struct ext4_inode_info *ei = EXT4_I(inode);
int count = 0;
ext4_fsblk_t first_block = 0;
+ loff_t disksize;
J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
@@ -855,8 +904,9 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
/*
* Block out ext4_truncate while we alter the tree
*/
- err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal,
- offsets + (partial - chain), partial);
+ err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
+ &count, goal,
+ offsets + (partial - chain), partial);
/*
* The ext4_splice_branch call will free and forget any buffers
@@ -873,8 +923,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
* protect it if you're about to implement concurrent
* ext4_get_block() -bzzz
*/
- if (!err && extend_disksize && inode->i_size > ei->i_disksize)
- ei->i_disksize = inode->i_size;
+ if (!err && extend_disksize) {
+ disksize = ((loff_t) iblock + count) << inode->i_blkbits;
+ if (disksize > i_size_read(inode))
+ disksize = i_size_read(inode);
+ if (disksize > ei->i_disksize)
+ ei->i_disksize = disksize;
+ }
if (err)
goto cleanup;
@@ -934,7 +989,7 @@ out:
*/
int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
unsigned long max_blocks, struct buffer_head *bh,
- int create, int extend_disksize)
+ int create, int extend_disksize, int flag)
{
int retval;
@@ -975,6 +1030,15 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
* with create == 1 flag.
*/
down_write((&EXT4_I(inode)->i_data_sem));
+
+ /*
+ * If the call comes from the delayed allocation writeout path,
+ * we have already reserved fs blocks for allocation;
+ * let the underlying get_block() function know, to
+ * avoid double accounting.
+ */
+ if (flag)
+ EXT4_I(inode)->i_delalloc_reserved_flag = 1;
/*
* We need to check for EXT4 here because migrate
* could have changed the inode type in between
@@ -996,6 +1060,18 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
~EXT4_EXT_MIGRATE;
}
}
+
+ if (flag) {
+ EXT4_I(inode)->i_delalloc_reserved_flag = 0;
+ /*
+ * Update reserved blocks/metadata blocks
+ * after successful block allocation
+ * which were deferred till now
+ */
+ if ((retval > 0) && buffer_delay(bh))
+ ext4_da_release_space(inode, retval, 0);
+ }
+
up_write((&EXT4_I(inode)->i_data_sem));
return retval;
}
@@ -1021,7 +1097,7 @@ static int ext4_get_block(struct inode *inode, sector_t iblock,
}
ret = ext4_get_blocks_wrap(handle, inode, iblock,
- max_blocks, bh_result, create, 0);
+ max_blocks, bh_result, create, 0, 0);
if (ret > 0) {
bh_result->b_size = (ret << inode->i_blkbits);
ret = 0;
@@ -1047,7 +1123,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
dummy.b_blocknr = -1000;
buffer_trace_init(&dummy.b_history);
err = ext4_get_blocks_wrap(handle, inode, block, 1,
- &dummy, create, 1);
+ &dummy, create, 1, 0);
/*
* ext4_get_blocks_handle() returns number of blocks
* mapped. 0 in case of a HOLE.
@@ -1203,19 +1279,20 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
to = from + len;
retry:
- page = __grab_cache_page(mapping, index);
- if (!page)
- return -ENOMEM;
- *pagep = page;
-
handle = ext4_journal_start(inode, needed_blocks);
if (IS_ERR(handle)) {
- unlock_page(page);
- page_cache_release(page);
ret = PTR_ERR(handle);
goto out;
}
+ page = __grab_cache_page(mapping, index);
+ if (!page) {
+ ext4_journal_stop(handle);
+ ret = -ENOMEM;
+ goto out;
+ }
+ *pagep = page;
+
ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
ext4_get_block);
@@ -1225,8 +1302,8 @@ retry:
}
if (ret) {
- ext4_journal_stop(handle);
unlock_page(page);
+ ext4_journal_stop(handle);
page_cache_release(page);
}
@@ -1236,15 +1313,6 @@ out:
return ret;
}
-int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
-{
- int err = jbd2_journal_dirty_data(handle, bh);
- if (err)
- ext4_journal_abort_handle(__func__, __func__,
- bh, handle, err);
- return err;
-}
-
/* For write_end() in data=journal mode */
static int write_end_fn(handle_t *handle, struct buffer_head *bh)
{
@@ -1255,29 +1323,6 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
}
/*
- * Generic write_end handler for ordered and writeback ext4 journal modes.
- * We can't use generic_write_end, because that unlocks the page and we need to
- * unlock the page after ext4_journal_stop, but ext4_journal_stop must run
- * after block_write_end.
- */
-static int ext4_generic_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- struct inode *inode = file->f_mapping->host;
-
- copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
-
- if (pos+copied > inode->i_size) {
- i_size_write(inode, pos+copied);
- mark_inode_dirty(inode);
- }
-
- return copied;
-}
-
-/*
* We need to pick up the new inode size which generic_commit_write gave us
* `file' can be NULL - eg, when called from page_symlink().
*
@@ -1290,15 +1335,14 @@ static int ext4_ordered_write_end(struct file *file,
struct page *page, void *fsdata)
{
handle_t *handle = ext4_journal_current_handle();
- struct inode *inode = file->f_mapping->host;
+ struct inode *inode = mapping->host;
unsigned from, to;
int ret = 0, ret2;
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
- ret = walk_page_buffers(handle, page_buffers(page),
- from, to, NULL, ext4_journal_dirty_data);
+ ret = ext4_jbd2_file_inode(handle, inode);
if (ret == 0) {
/*
@@ -1311,7 +1355,7 @@ static int ext4_ordered_write_end(struct file *file,
new_i_size = pos + copied;
if (new_i_size > EXT4_I(inode)->i_disksize)
EXT4_I(inode)->i_disksize = new_i_size;
- ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
+ ret2 = generic_write_end(file, mapping, pos, len, copied,
page, fsdata);
copied = ret2;
if (ret2 < 0)
@@ -1320,8 +1364,6 @@ static int ext4_ordered_write_end(struct file *file,
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- unlock_page(page);
- page_cache_release(page);
return ret ? ret : copied;
}
@@ -1332,7 +1374,7 @@ static int ext4_writeback_write_end(struct file *file,
struct page *page, void *fsdata)
{
handle_t *handle = ext4_journal_current_handle();
- struct inode *inode = file->f_mapping->host;
+ struct inode *inode = mapping->host;
int ret = 0, ret2;
loff_t new_i_size;
@@ -1340,7 +1382,7 @@ static int ext4_writeback_write_end(struct file *file,
if (new_i_size > EXT4_I(inode)->i_disksize)
EXT4_I(inode)->i_disksize = new_i_size;
- ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
+ ret2 = generic_write_end(file, mapping, pos, len, copied,
page, fsdata);
copied = ret2;
if (ret2 < 0)
@@ -1349,8 +1391,6 @@ static int ext4_writeback_write_end(struct file *file,
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- unlock_page(page);
- page_cache_release(page);
return ret ? ret : copied;
}
@@ -1389,14 +1429,965 @@ static int ext4_journalled_write_end(struct file *file,
ret = ret2;
}
+ unlock_page(page);
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- unlock_page(page);
page_cache_release(page);
return ret ? ret : copied;
}
+/*
+ * Worst-case number of mapping (metadata) blocks that have to be
+ * reserved so that @blocks data blocks can be allocated to an
+ * indirect-mapped (non-extent) file.
+ */
+static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
+{
+ const int addrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+ /* indirect blocks needed to address @blocks, rounded up */
+ int nr_ind = (blocks + addrs - 1) / addrs;
+ /* double-indirect blocks needed to address those, rounded up */
+ int nr_dind = (nr_ind + addrs - 1) / addrs;
+
+ /* plus a single triple-indirect block */
+ return nr_ind + nr_dind + 1;
+}
+
+/*
+ * Number of metadata blocks to reserve in order to allocate @blocks
+ * data blocks; uses the extent estimate when the inode has
+ * EXT4_EXTENTS_FL set and the indirect-block estimate otherwise.
+ */
+static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
+{
+ int uses_extents = (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) != 0;
+
+ return uses_extents ? ext4_ext_calc_metadata_amount(inode, blocks) :
+ ext4_indirect_calc_metadata_amount(inode, blocks);
+}
+
+/*
+ * Reserve @nrblocks data blocks, plus whatever additional metadata
+ * blocks the enlarged reservation calls for, out of the filesystem
+ * free-block counter.
+ *
+ * Returns 0 on success, -ENOSPC when not enough blocks are free.
+ */
+static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ unsigned long resv_data, new_meta, extra_meta, wanted;
+
+ spin_lock(&ei->i_block_reservation_lock);
+
+ /*
+ * Re-estimate the metadata needed for the whole reservation
+ * (what is already reserved plus @nrblocks); the worst case is
+ * one extent per block.
+ */
+ resv_data = ei->i_reserved_data_blocks + nrblocks;
+ new_meta = ext4_calc_metadata_amount(inode, resv_data);
+ BUG_ON(new_meta < ei->i_reserved_meta_blocks);
+
+ /* only the growth of the metadata estimate has to be charged */
+ extra_meta = new_meta - ei->i_reserved_meta_blocks;
+ wanted = extra_meta + nrblocks;
+
+ if (ext4_has_free_blocks(sbi, wanted) < wanted) {
+ spin_unlock(&ei->i_block_reservation_lock);
+ return -ENOSPC;
+ }
+
+ /* take the blocks out of the fs-wide free counter */
+ percpu_counter_sub(&sbi->s_freeblocks_counter, wanted);
+
+ ei->i_reserved_data_blocks += nrblocks;
+ ei->i_reserved_meta_blocks = new_meta;
+
+ spin_unlock(&ei->i_block_reservation_lock);
+ return 0;
+}
+
+/*
+ * Drop @used + @to_free blocks from the inode's data reservation and
+ * shrink its metadata reservation to match. @to_free blocks, together
+ * with the metadata blocks that are no longer needed and were not
+ * actually allocated, are handed back to the free-block counter.
+ */
+void ext4_da_release_space(struct inode *inode, int used, int to_free)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ int remaining, meta_needed, meta_free, give_back;
+
+ spin_lock(&ei->i_block_reservation_lock);
+
+ /* metadata still required for the data blocks that stay reserved */
+ remaining = ei->i_reserved_data_blocks - used - to_free;
+ meta_needed = ext4_calc_metadata_amount(inode, remaining);
+
+ /* surplus metadata reservation ... */
+ BUG_ON(meta_needed > ei->i_reserved_meta_blocks);
+ meta_free = ei->i_reserved_meta_blocks - meta_needed;
+
+ /* ... less the metadata blocks that really got allocated */
+ meta_free -= ei->i_allocated_meta_blocks;
+
+ give_back = to_free + meta_free;
+
+ /* return the blocks to the fs-wide free counter (truncate case) */
+ percpu_counter_add(&sbi->s_freeblocks_counter, give_back);
+
+ /* bring the per-inode reservation counters up to date */
+ BUG_ON(used + to_free > ei->i_reserved_data_blocks);
+ ei->i_reserved_data_blocks -= (used + to_free);
+
+ BUG_ON(meta_needed > ei->i_reserved_meta_blocks);
+ ei->i_reserved_meta_blocks = meta_needed;
+ ei->i_allocated_meta_blocks = 0;
+
+ spin_unlock(&ei->i_block_reservation_lock);
+}
+
+/*
+ * Clear BH_Delay on every delayed buffer of @page that starts at or
+ * after byte @offset within the page, and release one reserved block
+ * for each of them.
+ */
+static void ext4_da_page_release_reservation(struct page *page,
+ unsigned long offset)
+{
+ struct buffer_head *first = page_buffers(page);
+ struct buffer_head *cur = first;
+ unsigned int start = 0;
+ int nr_released = 0;
+
+ do {
+ if (start >= offset && buffer_delay(cur)) {
+ nr_released++;
+ clear_buffer_delay(cur);
+ }
+ /* byte offset at which the next buffer begins */
+ start += cur->b_size;
+ cur = cur->b_this_page;
+ } while (cur != first);
+
+ ext4_da_release_space(page->mapping->host, 0, nr_released);
+}
+
+/*
+ * Delayed allocation stuff
+ *
+ * struct mpage_da_data carries the state of one delayed-allocation
+ * writeback pass: the extent of blocks collected so far (@lbh), the
+ * run of consecutive pages being accumulated (@first_page up to, but
+ * not including, @next_page), and the block mapper and writeback
+ * control supplied by the caller.
+ */
+
+struct mpage_da_data {
+ struct inode *inode; /* inode being written back */
+ struct buffer_head lbh; /* extent of blocks */
+ unsigned long first_page, next_page; /* extent of pages */
+ get_block_t *get_block; /* filesystem's block mapper */
+ struct writeback_control *wbc; /* handed on to __mpage_writepage() */
+};
+
+/*
+ * mpage_da_submit_io - walks through extent of pages and try to write
+ * them with __mpage_writepage()
+ *
+ * @mpd->inode: inode
+ * @mpd->first_page: first page of the extent
+ * @mpd->next_page: page after the last page of the extent
+ * @mpd->get_block: the filesystem's block mapper function
+ *
+ * By the time mpage_da_submit_io() is called we expect all blocks
+ * to be allocated. this may be wrong if allocation failed.
+ *
+ * As pages are already locked by write_cache_pages(), we can't use it
+ *
+ * Every page of the extent is passed to __mpage_writepage() even after
+ * an error; a bio that is still pending at the end is submitted.
+ *
+ * Returns 0, or the first error reported by __mpage_writepage().
+ */
+static int mpage_da_submit_io(struct mpage_da_data *mpd)
+{
+ struct address_space *mapping = mpd->inode->i_mapping;
+ struct mpage_data mpd_pp = {
+ .bio = NULL,
+ .last_block_in_bio = 0,
+ .get_block = mpd->get_block,
+ .use_writepage = 1,
+ };
+ int ret = 0, err, nr_pages, i;
+ unsigned long index, end;
+ struct pagevec pvec;
+
+ BUG_ON(mpd->next_page <= mpd->first_page);
+
+ pagevec_init(&pvec, 0);
+ index = mpd->first_page;
+ end = mpd->next_page - 1; /* last page of the extent, inclusive */
+
+ while (index <= end) {
+ /* XXX: optimize tail */
+ nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ index = page->index;
+ if (index > end)
+ break;
+ index++; /* where the next lookup resumes */
+
+ err = __mpage_writepage(page, mpd->wbc, &mpd_pp);
+
+ /*
+ * In error case, we have to continue because
+ * remaining pages are still locked
+ * XXX: unlock and re-dirty them?
+ */
+ if (ret == 0)
+ ret = err;
+ }
+ pagevec_release(&pvec);
+ }
+ if (mpd_pp.bio)
+ mpage_bio_submit(WRITE, mpd_pp.bio);
+
+ return ret;
+}
+
+/*
+ * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
+ *
+ * @mpd->inode - inode to walk through
+ * @exbh->b_blocknr - first block on a disk
+ * @exbh->b_size - amount of space in bytes
+ * @logical - first logical block to start assignment with
+ *
+ * the function goes through all passed space and put actual disk
+ * block numbers into buffer heads, dropping BH_Delay
+ *
+ * Every page found in the range must be locked, must not be under
+ * writeback and must have buffers attached (enforced with BUG_ON).
+ * A buffer that is mapped but not delayed must already carry the
+ * expected disk block number.
+ */
+static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
+ struct buffer_head *exbh)
+{
+ struct inode *inode = mpd->inode;
+ struct address_space *mapping = inode->i_mapping;
+ int blocks = exbh->b_size >> inode->i_blkbits;
+ sector_t pblock = exbh->b_blocknr, cur_logical;
+ struct buffer_head *head, *bh;
+ unsigned long index, end;
+ struct pagevec pvec;
+ int nr_pages, i;
+
+ /* page indexes holding the first and the last block of the extent */
+ index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ /* logical number of the first block of the first page */
+ cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+ pagevec_init(&pvec, 0);
+
+ while (index <= end) {
+ /* XXX: optimize tail */
+ nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ index = page->index;
+ if (index > end)
+ break;
+ index++;
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(PageWriteback(page));
+ BUG_ON(!page_has_buffers(page));
+
+ bh = page_buffers(page);
+ head = bh;
+
+ /* skip blocks out of the range */
+ do {
+ if (cur_logical >= logical)
+ break;
+ cur_logical++;
+ } while ((bh = bh->b_this_page) != head);
+
+ /* assign block numbers until the extent or the page ends */
+ do {
+ if (cur_logical >= logical + blocks)
+ break;
+ if (buffer_delay(bh)) {
+ bh->b_blocknr = pblock;
+ clear_buffer_delay(bh);
+ } else if (buffer_mapped(bh))
+ BUG_ON(bh->b_blocknr != pblock);
+
+ cur_logical++;
+ pblock++;
+ } while ((bh = bh->b_this_page) != head);
+ }
+ pagevec_release(&pvec);
+ }
+}
+
+
+/*
+ * __unmap_underlying_blocks - helper that calls
+ * unmap_underlying_metadata() for every block of the extent
+ * described by @bh (b_blocknr is the first block, b_size the length
+ * in bytes)
+ */
+static inline void __unmap_underlying_blocks(struct inode *inode,
+ struct buffer_head *bh)
+{
+ struct block_device *bdev = inode->i_sb->s_bdev;
+ int nr = bh->b_size >> inode->i_blkbits;
+ int off = 0;
+
+ while (off < nr) {
+ unmap_underlying_metadata(bdev, bh->b_blocknr + off);
+ off++;
+ }
+}
+
+/*
+ * mpage_da_map_blocks - go through given space
+ *
+ * @mpd->lbh - bh describing space
+ * @mpd->get_block - the filesystem's block mapper function
+ *
+ * The function skips space we know is already mapped to disk blocks.
+ *
+ * The function ignores errors ->get_block() returns, thus real
+ * error handling is postponed to __mpage_writepage()
+ *
+ * ->get_block() is called repeatedly, each time for whatever is left
+ * of the extent, until the whole extent has been covered or an error
+ * is returned.
+ */
+static void mpage_da_map_blocks(struct mpage_da_data *mpd)
+{
+ struct buffer_head *lbh = &mpd->lbh;
+ int err = 0, remain = lbh->b_size;
+ sector_t next = lbh->b_blocknr;
+ struct buffer_head new;
+
+ /*
+ * We consider only non-mapped and non-allocated blocks
+ */
+ if (buffer_mapped(lbh) && !buffer_delay(lbh))
+ return;
+
+ while (remain) {
+ new.b_state = lbh->b_state;
+ new.b_blocknr = 0;
+ new.b_size = remain; /* ask for everything that is left */
+ err = mpd->get_block(mpd->inode, next, &new, 1);
+ if (err) {
+ /*
+ * Rather than implement own error handling
+ * here, we just leave remaining blocks
+ * unallocated and try again with ->writepage()
+ */
+ break;
+ }
+ BUG_ON(new.b_size == 0);
+
+ if (buffer_new(&new))
+ __unmap_underlying_blocks(mpd->inode, &new);
+
+ /*
+ * If blocks are delayed marked, we need to
+ * put actual blocknr and drop delayed bit
+ */
+ if (buffer_delay(lbh))
+ mpage_put_bnr_to_bhs(mpd, next, &new);
+
+ /* go for the remaining blocks */
+ next += new.b_size >> mpd->inode->i_blkbits;
+ remain -= new.b_size;
+ }
+}
+
+#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))
+
+/*
+ * mpage_add_bh_to_extent - try to add one more block to extent of blocks
+ *
+ * @mpd->lbh - extent of blocks
+ * @logical - logical number of the block in the file
+ * @bh - bh of the block (used to access block's state)
+ *
+ * the function is used to collect contig. blocks in same state
+ *
+ * Only the BH_FLAGS bits (uptodate, mapped, delay) of the buffer state
+ * are recorded and compared. A block that is not contiguous with the
+ * current extent, or whose BH_FLAGS bits differ, makes the current
+ * extent get mapped via mpage_da_map_blocks() before a new extent is
+ * started with that block.
+ */
+static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
+ sector_t logical, struct buffer_head *bh)
+{
+ struct buffer_head *lbh = &mpd->lbh;
+ sector_t next;
+
+ /* logical block right after the end of the current extent */
+ next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits);
+
+ /*
+ * First block in the extent
+ */
+ if (lbh->b_size == 0) {
+ lbh->b_blocknr = logical;
+ lbh->b_size = bh->b_size;
+ lbh->b_state = bh->b_state & BH_FLAGS;
+ return;
+ }
+
+ /*
+ * Can we merge the block to our big extent?
+ */
+ if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
+ lbh->b_size += bh->b_size;
+ return;
+ }
+
+ /*
+ * We couldn't merge the block to our extent, so we
+ * need to flush current extent and start new one
+ */
+ mpage_da_map_blocks(mpd);
+
+ /*
+ * Now start a new extent
+ */
+ lbh->b_size = bh->b_size;
+ lbh->b_state = bh->b_state & BH_FLAGS;
+ lbh->b_blocknr = logical;
+}
+
+/*
+ * __mpage_da_writepage - finds extent of pages and blocks
+ *
+ * @page: page to consider
+ * @wbc: not used, we just follow rules
+ * @data: context
+ *
+ * The function finds extents of pages and scan them for all blocks.
+ *
+ * A page that does not directly follow the current run of pages makes
+ * the accumulated run get mapped and submitted first; the page then
+ * starts a new run. Always returns 0.
+ */
+static int __mpage_da_writepage(struct page *page,
+ struct writeback_control *wbc, void *data)
+{
+ struct mpage_da_data *mpd = data;
+ struct inode *inode = mpd->inode;
+ struct buffer_head *bh, *head, fake;
+ sector_t logical;
+
+ /*
+ * Can we merge this page to current extent?
+ */
+ if (mpd->next_page != page->index) {
+ /*
+ * Nope, we can't. So, we map non-allocated blocks
+ * and start IO on them using __mpage_writepage()
+ */
+ if (mpd->next_page != mpd->first_page) {
+ mpage_da_map_blocks(mpd);
+ mpage_da_submit_io(mpd);
+ }
+
+ /*
+ * Start next extent of pages ...
+ */
+ mpd->first_page = page->index;
+
+ /*
+ * ... and blocks
+ */
+ mpd->lbh.b_size = 0;
+ mpd->lbh.b_state = 0;
+ mpd->lbh.b_blocknr = 0;
+ }
+
+ mpd->next_page = page->index + 1;
+ /* logical number of the first block of this page */
+ logical = (sector_t) page->index <<
+ (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+ if (!page_has_buffers(page)) {
+ /*
+ * There is no attached buffer heads yet (mmap?)
+ * we treat the page as full of dirty blocks
+ */
+ bh = &fake;
+ bh->b_size = PAGE_CACHE_SIZE;
+ bh->b_state = 0;
+ set_buffer_dirty(bh);
+ set_buffer_uptodate(bh);
+ mpage_add_bh_to_extent(mpd, logical, bh);
+ } else {
+ /*
+ * Page with regular buffer heads, just add all dirty ones
+ */
+ head = page_buffers(page);
+ bh = head;
+ do {
+ BUG_ON(buffer_locked(bh));
+ if (buffer_dirty(bh))
+ mpage_add_bh_to_extent(mpd, logical, bh);
+ logical++;
+ } while ((bh = bh->b_this_page) != head);
+ }
+
+ return 0;
+}
+
+/*
+ * mpage_da_writepages - walk the list of dirty pages of the given
+ * address space, allocates non-allocated blocks, maps newly-allocated
+ * blocks to existing bhs and issue IO for them
+ *
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @get_block: the filesystem's block mapper function.
+ *
+ * This is a library function, which implements the writepages()
+ * address_space_operation.
+ *
+ * In order to avoid duplication of logic that deals with partial pages,
+ * multiple bio per page, etc, we find non-allocated blocks, allocate
+ * them with minimal calls to ->get_block() and re-use __mpage_writepage()
+ *
+ * It's important that we call __mpage_writepage() only once for each
+ * involved page, otherwise we'd have to implement more complicated logic
+ * to deal with pages w/o PG_lock or w/ PG_writeback and so on.
+ *
+ * See comments to mpage_writepages()
+ *
+ * Without a @get_block the work is handed to generic_writepages().
+ * Otherwise the return value is that of write_cache_pages(); the result
+ * of submitting the last extent of pages is not propagated.
+ */
+static int mpage_da_writepages(struct address_space *mapping,
+ struct writeback_control *wbc,
+ get_block_t get_block)
+{
+ struct mpage_da_data mpd;
+ int ret;
+
+ if (!get_block)
+ return generic_writepages(mapping, wbc);
+
+ /* start with an empty extent of blocks and an empty run of pages */
+ mpd.wbc = wbc;
+ mpd.inode = mapping->host;
+ mpd.lbh.b_size = 0;
+ mpd.lbh.b_state = 0;
+ mpd.lbh.b_blocknr = 0;
+ mpd.first_page = 0;
+ mpd.next_page = 0;
+ mpd.get_block = get_block;
+
+ ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
+
+ /*
+ * Handle last extent of pages
+ */
+ if (mpd.next_page != mpd.first_page) {
+ mpage_da_map_blocks(&mpd);
+ mpage_da_submit_io(&mpd);
+ }
+
+ return ret;
+}
+
+/*
+ * this is a special callback for ->write_begin() only
+ * its intention is to return mapped block or reserve space
+ *
+ * @create must be non-zero and @bh_result must be sized to exactly one
+ * filesystem block (both enforced with BUG_ON). When the lookup finds
+ * no block and the buffer is not already delayed, one block is reserved
+ * and the buffer is marked mapped (to block 0), new and delayed.
+ *
+ * Returns 0 on success, or a negative error from the lookup or from
+ * the reservation (-ENOSPC).
+ */
+static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int ret = 0;
+
+ BUG_ON(create == 0);
+ BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
+
+ /*
+ * first, we need to know whether the block is allocated already
+ * preallocated blocks are unmapped but should be treated
+ * the same as allocated blocks.
+ */
+ ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0);
+ if ((ret == 0) && !buffer_delay(bh_result)) {
+ /* the block isn't (pre)allocated yet, let's reserve space */
+ /*
+ * XXX: __block_prepare_write() unmaps passed block,
+ * is it OK?
+ */
+ ret = ext4_da_reserve_space(inode, 1);
+ if (ret)
+ /* not enough space to reserve */
+ return ret;
+
+ map_bh(bh_result, inode->i_sb, 0);
+ set_buffer_new(bh_result);
+ set_buffer_delay(bh_result);
+ } else if (ret > 0) {
+ /* ret is the number of blocks found; report it as a size */
+ bh_result->b_size = (ret << inode->i_blkbits);
+ ret = 0;
+ }
+
+ return ret;
+}
+#define EXT4_DELALLOC_RSVED 1
+/*
+ * get_block callback used while writing back delayed-allocation pages.
+ *
+ * With a running journal handle the blocks are looked up with @create
+ * passed through and the EXT4_DELALLOC_RSVED flag; without one only a
+ * plain lookup is done and it must find something. On success
+ * @bh_result->b_size is set to the mapped length and i_disksize is
+ * pushed forward (never beyond i_size) to cover the mapped range.
+ *
+ * Returns 0 on success or a negative error.
+ */
+static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+ handle_t *handle = ext4_journal_current_handle();
+ loff_t disksize;
+ int ret;
+
+ if (handle) {
+ ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+ bh_result, create, 0, EXT4_DELALLOC_RSVED);
+ } else {
+ ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+ bh_result, 0, 0, 0);
+ BUG_ON(!ret);
+ }
+
+ if (ret <= 0)
+ return ret;
+
+ bh_result->b_size = (ret << inode->i_blkbits);
+
+ /*
+ * Update on-disk size along with block allocation
+ * we don't use 'extend_disksize' as size may change
+ * within already allocated block -bzzz
+ */
+ disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
+ if (disksize > i_size_read(inode))
+ disksize = i_size_read(inode);
+ if (disksize <= ei->i_disksize)
+ return 0;
+
+ /*
+ * XXX: replace with spinlock if seen contended -bzzz
+ */
+ down_write(&ei->i_data_sem);
+ if (disksize > ei->i_disksize)
+ ei->i_disksize = disksize;
+ up_write(&ei->i_data_sem);
+
+ if (ei->i_disksize == disksize)
+ return ext4_mark_inode_dirty(handle, inode);
+
+ return 0;
+}
+
+/*
+ * walk_page_buffers() callback: returns 1 for a dirty buffer that is
+ * either unmapped (possible for holes) or delayed (possible with
+ * delayed allocation), 0 otherwise. @handle is not used.
+ */
+static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
+{
+ /* mapped and already allocated: nothing to report */
+ if (buffer_mapped(bh) && !buffer_delay(bh))
+ return 0;
+
+ return buffer_dirty(bh) != 0;
+}
+
+/*
+ * get_block callback for the writepage paths: looks blocks up but never
+ * allocates (@create is ignored and 0 is passed down instead).
+ *
+ * Returns 0 when blocks were found (with @bh_result->b_size set to the
+ * mapped length) or when nothing is mapped, a negative error otherwise.
+ */
+static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+ int nr;
+
+ /* we don't want to do block allocation in writepage */
+ nr = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks,
+ bh_result, 0, 0, 0);
+ if (nr <= 0)
+ return nr;
+
+ bh_result->b_size = (nr << inode->i_blkbits);
+ return 0;
+}
+
+/*
+ * get called via ext4_da_writepages after taking page lock (have journal handle)
+ * get called via journal_submit_inode_data_buffers (no journal handle)
+ * get called via shrink_page_list via pdflush (no journal handle)
+ * or grab_page_cache when doing write_begin (have journal handle)
+ *
+ * No block allocation is done here: if the part of the page inside
+ * i_size has a dirty buffer that is unmapped or delayed, the page is
+ * redirtied, unlocked and 0 is returned. Otherwise the page is written
+ * with nobh_writepage() or block_write_full_page() and their result is
+ * returned.
+ */
+static int ext4_da_writepage(struct page *page,
+ struct writeback_control *wbc)
+{
+ int ret = 0;
+ loff_t size;
+ unsigned long len;
+ struct buffer_head *page_bufs;
+ struct inode *inode = page->mapping->host;
+
+ /* only the part of the page inside i_size is examined */
+ size = i_size_read(inode);
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ if (page_has_buffers(page)) {
+ page_bufs = page_buffers(page);
+ if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+ ext4_bh_unmapped_or_delay)) {
+ /*
+ * We don't want to do block allocation
+ * So redirty the page and return
+ * We may reach here when we do a journal commit
+ * via journal_submit_inode_data_buffers.
+ * If we don't have mapping block we just ignore
+ * them. We can also reach here via shrink_page_list
+ */
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+ } else {
+ /*
+ * The test for page_has_buffers() is subtle:
+ * We know the page is dirty but it lost buffers. That means
+ * that at some moment in time after write_begin()/write_end()
+ * has been called all buffers have been clean and thus they
+ * must have been written at least once. So they are all
+ * mapped and we can happily proceed with mapping them
+ * and writing the page.
+ *
+ * Try to initialize the buffer_heads and check whether
+ * all are mapped and non delay. We don't want to
+ * do block allocation here.
+ */
+ ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+ ext4_normal_get_block_write);
+ if (!ret) {
+ page_bufs = page_buffers(page);
+ /* check whether all are mapped and non delay */
+ if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+ ext4_bh_unmapped_or_delay)) {
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+ } else {
+ /*
+ * We can't do block allocation here
+ * so just redirty the page and unlock
+ * and return
+ */
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+ }
+
+ if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
+ ret = nobh_writepage(page, ext4_normal_get_block_write, wbc);
+ else
+ ret = block_write_full_page(page,
+ ext4_normal_get_block_write,
+ wbc);
+
+ return ret;
+}
+
+/*
+ * For now just follow the DIO way to estimate the max credits
+ * needed to write out EXT4_MAX_WRITEBACK_PAGES.
+ * todo: need to calculate the max credits need for
+ * extent based files, currently the DIO credits is based on
+ * indirect-blocks mapping way.
+ *
+ * Probably should have a generic way to calculate credits
+ * for DIO, writepages, and truncate
+ *
+ * ext4_da_writepages() writes the mapping out in chunks of at most
+ * EXT4_MAX_WRITEBACK_PAGES pages, each chunk under its own journal
+ * handle with EXT4_MAX_WRITEBACK_CREDITS credits. On return
+ * wbc->nr_to_write holds the part of the request that was not written.
+ * Returns 0 or a negative error.
+ */
+#define EXT4_MAX_WRITEBACK_PAGES DIO_MAX_BLOCKS
+#define EXT4_MAX_WRITEBACK_CREDITS DIO_CREDITS
+
+static int ext4_da_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = mapping->host;
+ handle_t *handle = NULL;
+ int needed_blocks;
+ int ret = 0;
+ long to_write;
+ loff_t range_start = 0;
+
+ /*
+ * No pages to write? This is mainly a kludge to avoid starting
+ * a transaction for special inodes like journal inode on last iput()
+ * because that could violate lock ordering on umount
+ */
+ if (!mapping->nrpages)
+ return 0;
+
+ /*
+ * Estimate the worst case needed credits to write out
+ * EXT4_MAX_WRITEBACK_PAGES pages
+ */
+ needed_blocks = EXT4_MAX_WRITEBACK_CREDITS;
+
+ to_write = wbc->nr_to_write;
+ if (!wbc->range_cyclic) {
+ /*
+ * If range_cyclic is not set force range_cont
+ * and save the old writeback_index
+ */
+ wbc->range_cont = 1;
+ range_start = wbc->range_start;
+ }
+
+ while (!ret && to_write) {
+ /* start a new transaction*/
+ handle = ext4_journal_start(inode, needed_blocks);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_writepages;
+ }
+ if (ext4_should_order_data(inode)) {
+ /*
+ * With ordered mode we need to add
+ * the inode to the journal handle
+ * when we do block allocation.
+ */
+ ret = ext4_jbd2_file_inode(handle, inode);
+ if (ret) {
+ ext4_journal_stop(handle);
+ goto out_writepages;
+ }
+
+ }
+ /*
+ * set the max dirty pages could be write at a time
+ * to fit into the reserved transaction credits
+ */
+ if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
+ wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
+
+ /* to_write now counts what is left after this chunk */
+ to_write -= wbc->nr_to_write;
+ ret = mpage_da_writepages(mapping, wbc,
+ ext4_da_get_block_write);
+ ext4_journal_stop(handle);
+ if (wbc->nr_to_write) {
+ /*
+ * There is no more writeout needed
+ * or we requested for a noblocking writeout
+ * and we found the device congested
+ */
+ to_write += wbc->nr_to_write;
+ break;
+ }
+ wbc->nr_to_write = to_write;
+ }
+
+out_writepages:
+ wbc->nr_to_write = to_write;
+ /* NOTE(review): a saved range_start of 0 is not restored here */
+ if (range_start)
+ wbc->range_start = range_start;
+ return ret;
+}
+
+/*
+ * ext4_da_write_begin - ->write_begin() for delayed allocation
+ *
+ * Starts a one-credit journal handle, grabs the page cache page that
+ * covers @pos and prepares its buffers with ext4_da_get_block_prep(),
+ * which reserves space instead of allocating blocks.
+ *
+ * On success the locked page is returned through @pagep and the handle
+ * is left running. On any failure the handle is stopped and no page is
+ * held. An -ENOSPC failure is retried for as long as
+ * ext4_should_retry_alloc() allows.
+ *
+ * Returns 0 on success or a negative error.
+ */
+static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ int ret, retries = 0;
+ struct page *page;
+ pgoff_t index;
+ struct inode *inode = mapping->host;
+ handle_t *handle;
+
+ index = pos >> PAGE_CACHE_SHIFT;
+
+retry:
+ /*
+ * With delayed allocation, we don't log the i_disksize update
+ * if there is delayed block allocation. But we still need
+ * to journalling the i_disksize update if writes to the end
+ * of file which has an already mapped buffer.
+ */
+ handle = ext4_journal_start(inode, 1);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+
+ page = __grab_cache_page(mapping, index);
+ if (!page) {
+ /* do not leak the handle that was just started */
+ ext4_journal_stop(handle);
+ ret = -ENOMEM;
+ goto out;
+ }
+ *pagep = page;
+
+ ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+ ext4_da_get_block_prep);
+ if (ret < 0) {
+ unlock_page(page);
+ ext4_journal_stop(handle);
+ page_cache_release(page);
+ }
+
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+out:
+ return ret;
+}
+
+/*
+ * Decide whether a write that extends the file should update
+ * i_disksize right away: returns 1 when the buffer covering byte
+ * @offset of @page is mapped and not delayed (no block allocation is
+ * pending for it), 0 otherwise.
+ */
+static int ext4_da_should_update_i_disksize(struct page *page,
+ unsigned long offset)
+{
+ struct inode *inode = page->mapping->host;
+ struct buffer_head *bh = page_buffers(page);
+ unsigned int nr_skip = offset >> inode->i_blkbits;
+
+ /* step to the buffer that contains @offset */
+ while (nr_skip--)
+ bh = bh->b_this_page;
+
+ return buffer_mapped(bh) && !buffer_delay(bh);
+}
+
+/*
+ * ext4_da_write_end - ->write_end() for delayed allocation
+ *
+ * When the write extends past i_disksize into a buffer that is already
+ * mapped and not delayed, i_disksize is updated here (and, in ordered
+ * mode, the inode is filed with the running transaction). The rest is
+ * left to generic_write_end(); the handle found running is stopped
+ * before returning.
+ *
+ * Nothing is extended when @copied is 0. This also keeps the wrapped
+ * value of 'end' (start + 0 - 1) from being used as a buffer index.
+ *
+ * Returns the number of bytes copied, or a negative error.
+ */
+static int ext4_da_write_end(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = mapping->host;
+ int ret = 0, ret2;
+ handle_t *handle = ext4_journal_current_handle();
+ loff_t new_i_size;
+ unsigned long start, end;
+
+ start = pos & (PAGE_CACHE_SIZE - 1);
+ /* offset of the last byte written; only meaningful if copied != 0 */
+ end = start + copied - 1;
+
+ /*
+ * generic_write_end() will run mark_inode_dirty() if i_size
+ * changes. So let's piggyback the i_disksize mark_inode_dirty
+ * into that.
+ */
+
+ new_i_size = pos + copied;
+ if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
+ if (ext4_da_should_update_i_disksize(page, end)) {
+ down_write(&EXT4_I(inode)->i_data_sem);
+ /* recheck now that i_data_sem is held */
+ if (new_i_size > EXT4_I(inode)->i_disksize) {
+ /*
+ * Updating i_disksize when extending file
+ * without needing block allocation
+ */
+ if (ext4_should_order_data(inode))
+ ret = ext4_jbd2_file_inode(handle,
+ inode);
+
+ EXT4_I(inode)->i_disksize = new_i_size;
+ }
+ up_write(&EXT4_I(inode)->i_data_sem);
+ }
+ }
+ ret2 = generic_write_end(file, mapping, pos, len, copied,
+ page, fsdata);
+ copied = ret2;
+ if (ret2 < 0)
+ ret = ret2;
+ ret2 = ext4_journal_stop(handle);
+ if (!ret)
+ ret = ret2;
+
+ return ret ? ret : copied;
+}
+
+/*
+ * ->invalidatepage() for delayed allocation: give back the block
+ * reservations held by the page's delayed buffers from @offset on,
+ * then carry on with the regular ext4_invalidatepage().
+ * The page must be locked.
+ */
+static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
+{
+ BUG_ON(!PageLocked(page));
+
+ /* only a page with buffers can hold reservations */
+ if (page_has_buffers(page))
+ ext4_da_page_release_reservation(page, offset);
+
+ ext4_invalidatepage(page, offset);
+}
+
/*
* bmap() is special. It gets used by applications such as lilo and by
@@ -1418,6 +2409,16 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
journal_t *journal;
int err;
+ if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
+ test_opt(inode->i_sb, DELALLOC)) {
+ /*
+ * With delalloc we want to sync the file
+ * so that we can make sure we allocate
+ * blocks for file
+ */
+ filemap_write_and_wait(mapping);
+ }
+
if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
/*
* This is a REALLY heavyweight approach, but the use of
@@ -1462,21 +2463,17 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
return 0;
}
-static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
-{
- if (buffer_mapped(bh))
- return ext4_journal_dirty_data(handle, bh);
- return 0;
-}
-
/*
- * Note that we always start a transaction even if we're not journalling
- * data. This is to preserve ordering: any hole instantiation within
- * __block_write_full_page -> ext4_get_block() should be journalled
- * along with the data so we don't crash and then get metadata which
- * refers to old data.
+ * Note that we don't need to start a transaction unless we're journaling data
+ * because we should have holes filled from ext4_page_mkwrite(). We don't even
+ * need to file the inode to the transaction's list in ordered mode because if
+ * we are writing back data added by write(), the inode is already there and if
+ * we are writing back data modified via mmap(), no one guarantees in which
+ * transaction the data will hit the disk. In case we are journaling data, we
+ * cannot start transaction directly because transaction start ranks above page
+ * lock so we have to do some magic.
*
- * In all journalling modes block_write_full_page() will start the I/O.
+ * In all journaling modes block_write_full_page() will start the I/O.
*
* Problem:
*
@@ -1518,105 +2515,103 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
* disastrous. Any write() or metadata operation will sync the fs for
* us.
*
- * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
- * we don't need to open a transaction here.
*/
-static int ext4_ordered_writepage(struct page *page,
+static int __ext4_normal_writepage(struct page *page,
struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
- struct buffer_head *page_bufs;
- handle_t *handle = NULL;
- int ret = 0;
- int err;
-
- J_ASSERT(PageLocked(page));
-
- /*
- * We give up here if we're reentered, because it might be for a
- * different filesystem.
- */
- if (ext4_journal_current_handle())
- goto out_fail;
- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+ if (test_opt(inode->i_sb, NOBH))
+ return nobh_writepage(page,
+ ext4_normal_get_block_write, wbc);
+ else
+ return block_write_full_page(page,
+ ext4_normal_get_block_write,
+ wbc);
+}
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out_fail;
- }
+static int ext4_normal_writepage(struct page *page,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = page->mapping->host;
+ loff_t size = i_size_read(inode);
+ loff_t len;
- if (!page_has_buffers(page)) {
- create_empty_buffers(page, inode->i_sb->s_blocksize,
- (1 << BH_Dirty)|(1 << BH_Uptodate));
+ J_ASSERT(PageLocked(page));
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ if (page_has_buffers(page)) {
+ /* if page has buffers it should all be mapped
+ * and allocated. If there are not buffers attached
+ * to the page we know the page is dirty but it lost
+ * buffers. That means that at some moment in time
+ * after write_begin() / write_end() has been called
+ * all buffers have been clean and thus they must have been
+ * written at least once. So they are all mapped and we can
+ * happily proceed with mapping them and writing the page.
+ */
+ BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+ ext4_bh_unmapped_or_delay));
}
- page_bufs = page_buffers(page);
- walk_page_buffers(handle, page_bufs, 0,
- PAGE_CACHE_SIZE, NULL, bget_one);
-
- ret = block_write_full_page(page, ext4_get_block, wbc);
- /*
- * The page can become unlocked at any point now, and
- * truncate can then come in and change things. So we
- * can't touch *page from now on. But *page_bufs is
- * safe due to elevated refcount.
- */
+ if (!ext4_journal_current_handle())
+ return __ext4_normal_writepage(page, wbc);
- /*
- * And attach them to the current transaction. But only if
- * block_write_full_page() succeeded. Otherwise they are unmapped,
- * and generally junk.
- */
- if (ret == 0) {
- err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
- NULL, jbd2_journal_dirty_data_fn);
- if (!ret)
- ret = err;
- }
- walk_page_buffers(handle, page_bufs, 0,
- PAGE_CACHE_SIZE, NULL, bput_one);
- err = ext4_journal_stop(handle);
- if (!ret)
- ret = err;
- return ret;
-
-out_fail:
redirty_page_for_writepage(wbc, page);
unlock_page(page);
- return ret;
+ return 0;
}
-static int ext4_writeback_writepage(struct page *page,
+static int __ext4_journalled_writepage(struct page *page,
struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
+ struct address_space *mapping = page->mapping;
+ struct inode *inode = mapping->host;
+ struct buffer_head *page_bufs;
handle_t *handle = NULL;
int ret = 0;
int err;
- if (ext4_journal_current_handle())
- goto out_fail;
+ ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+ ext4_normal_get_block_write);
+ if (ret != 0)
+ goto out_unlock;
+
+ page_bufs = page_buffers(page);
+ walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
+ bget_one);
+ /* As soon as we unlock the page, it can go away, but we have
+ * references to buffers so we are safe */
+ unlock_page(page);
handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- goto out_fail;
+ goto out;
}
- if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
- ret = nobh_writepage(page, ext4_get_block, wbc);
- else
- ret = block_write_full_page(page, ext4_get_block, wbc);
+ ret = walk_page_buffers(handle, page_bufs, 0,
+ PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
+ err = walk_page_buffers(handle, page_bufs, 0,
+ PAGE_CACHE_SIZE, NULL, write_end_fn);
+ if (ret == 0)
+ ret = err;
err = ext4_journal_stop(handle);
if (!ret)
ret = err;
- return ret;
-out_fail:
- redirty_page_for_writepage(wbc, page);
+ walk_page_buffers(handle, page_bufs, 0,
+ PAGE_CACHE_SIZE, NULL, bput_one);
+ EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+ goto out;
+
+out_unlock:
unlock_page(page);
+out:
return ret;
}
@@ -1624,59 +2619,53 @@ static int ext4_journalled_writepage(struct page *page,
struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
- handle_t *handle = NULL;
- int ret = 0;
- int err;
+ loff_t size = i_size_read(inode);
+ loff_t len;
- if (ext4_journal_current_handle())
- goto no_write;
+ J_ASSERT(PageLocked(page));
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ if (page_has_buffers(page)) {
+ /* if page has buffers it should all be mapped
+ * and allocated. If there are not buffers attached
+ * to the page we know the page is dirty but it lost
+ * buffers. That means that at some moment in time
+ * after write_begin() / write_end() has been called
+ * all buffers have been clean and thus they must have been
+ * written at least once. So they are all mapped and we can
+ * happily proceed with mapping them and writing the page.
+ */
+ BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+ ext4_bh_unmapped_or_delay));
+ }
- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
+ if (ext4_journal_current_handle())
goto no_write;
- }
- if (!page_has_buffers(page) || PageChecked(page)) {
+ if (PageChecked(page)) {
/*
* It's mmapped pagecache. Add buffers and journal it. There
* doesn't seem much point in redirtying the page here.
*/
ClearPageChecked(page);
- ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
- ext4_get_block);
- if (ret != 0) {
- ext4_journal_stop(handle);
- goto out_unlock;
- }
- ret = walk_page_buffers(handle, page_buffers(page), 0,
- PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
-
- err = walk_page_buffers(handle, page_buffers(page), 0,
- PAGE_CACHE_SIZE, NULL, write_end_fn);
- if (ret == 0)
- ret = err;
- EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
- unlock_page(page);
+ return __ext4_journalled_writepage(page, wbc);
} else {
/*
* It may be a page full of checkpoint-mode buffers. We don't
* really know unless we go poke around in the buffer_heads.
* But block_write_full_page will do the right thing.
*/
- ret = block_write_full_page(page, ext4_get_block, wbc);
+ return block_write_full_page(page,
+ ext4_normal_get_block_write,
+ wbc);
}
- err = ext4_journal_stop(handle);
- if (!ret)
- ret = err;
-out:
- return ret;
-
no_write:
redirty_page_for_writepage(wbc, page);
-out_unlock:
unlock_page(page);
- goto out;
+ return 0;
}
static int ext4_readpage(struct file *file, struct page *page)
@@ -1819,7 +2808,7 @@ static int ext4_journalled_set_page_dirty(struct page *page)
static const struct address_space_operations ext4_ordered_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
- .writepage = ext4_ordered_writepage,
+ .writepage = ext4_normal_writepage,
.sync_page = block_sync_page,
.write_begin = ext4_write_begin,
.write_end = ext4_ordered_write_end,
@@ -1833,7 +2822,7 @@ static const struct address_space_operations ext4_ordered_aops = {
static const struct address_space_operations ext4_writeback_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
- .writepage = ext4_writeback_writepage,
+ .writepage = ext4_normal_writepage,
.sync_page = block_sync_page,
.write_begin = ext4_write_begin,
.write_end = ext4_writeback_write_end,
@@ -1857,10 +2846,31 @@ static const struct address_space_operations ext4_journalled_aops = {
.releasepage = ext4_releasepage,
};
+/*
+ * Address space operations that route writepage(s), write_begin/write_end
+ * and invalidatepage through the ext4_da_* handlers.  ext4_set_aops()
+ * installs this table when the DELALLOC mount option is set.
+ */
+static const struct address_space_operations ext4_da_aops = {
+	/* read side */
+	.readpage = ext4_readpage,
+	.readpages = ext4_readpages,
+	.sync_page = block_sync_page,
+	.bmap = ext4_bmap,
+	.direct_IO = ext4_direct_IO,
+	/* write side: ext4_da_* entry points */
+	.write_begin = ext4_da_write_begin,
+	.write_end = ext4_da_write_end,
+	.writepage = ext4_da_writepage,
+	.writepages = ext4_da_writepages,
+	/* page lifetime */
+	.invalidatepage = ext4_da_invalidatepage,
+	.releasepage = ext4_releasepage,
+	.migratepage = buffer_migrate_page,
+};
+
void ext4_set_aops(struct inode *inode)
{
- if (ext4_should_order_data(inode))
+ if (ext4_should_order_data(inode) &&
+ test_opt(inode->i_sb, DELALLOC))
+ inode->i_mapping->a_ops = &ext4_da_aops;
+ else if (ext4_should_order_data(inode))
inode->i_mapping->a_ops = &ext4_ordered_aops;
+ else if (ext4_should_writeback_data(inode) &&
+ test_opt(inode->i_sb, DELALLOC))
+ inode->i_mapping->a_ops = &ext4_da_aops;
else if (ext4_should_writeback_data(inode))
inode->i_mapping->a_ops = &ext4_writeback_aops;
else
@@ -1873,7 +2883,7 @@ void ext4_set_aops(struct inode *inode)
* This required during truncate. We need to physically zero the tail end
* of that block so it doesn't yield old data if the file is later grown.
*/
-int ext4_block_truncate_page(handle_t *handle, struct page *page,
+int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from)
{
ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
@@ -1882,8 +2892,13 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
ext4_lblk_t iblock;
struct inode *inode = mapping->host;
struct buffer_head *bh;
+ struct page *page;
int err = 0;
+ page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT);
+ if (!page)
+ return -EINVAL;
+
blocksize = inode->i_sb->s_blocksize;
length = blocksize - (offset & (blocksize - 1));
iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
@@ -1956,7 +2971,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
err = ext4_journal_dirty_metadata(handle, bh);
} else {
if (ext4_should_order_data(inode))
- err = ext4_journal_dirty_data(handle, bh);
+ err = ext4_jbd2_file_inode(handle, inode);
mark_buffer_dirty(bh);
}
@@ -2179,7 +3194,21 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
if (this_bh) {
BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
- ext4_journal_dirty_metadata(handle, this_bh);
+
+ /*
+ * The buffer head should have an attached journal head at this
+ * point. However, if the data is corrupted and an indirect
+ * block pointed to itself, it would have been detached when
+ * the block was cleared. Check for this instead of OOPSing.
+ */
+ if (bh2jh(this_bh))
+ ext4_journal_dirty_metadata(handle, this_bh);
+ else
+ ext4_error(inode->i_sb, __func__,
+ "circular indirect block detected, "
+ "inode=%lu, block=%llu",
+ inode->i_ino,
+ (unsigned long long) this_bh->b_blocknr);
}
}
@@ -2305,6 +3334,19 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
}
}
+/*
+ * ext4_can_truncate() - report whether @inode may be truncated.
+ *
+ * Returns 0 for append-only or immutable inodes and for anything that is
+ * not a regular file, directory or symlink.  Regular files and directories
+ * yield 1; a symlink yields 1 unless ext4_inode_is_fast_symlink() says it
+ * is a fast symlink.
+ */
+int ext4_can_truncate(struct inode *inode)
+{
+	/* The append-only / immutable flags veto truncation outright. */
+	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+		return 0;
+
+	if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
+		return 1;
+
+	if (!S_ISLNK(inode->i_mode))
+		return 0;
+
+	return !ext4_inode_is_fast_symlink(inode);
+}
+
/*
* ext4_truncate()
*
@@ -2347,51 +3389,25 @@ void ext4_truncate(struct inode *inode)
int n;
ext4_lblk_t last_block;
unsigned blocksize = inode->i_sb->s_blocksize;
- struct page *page;
- if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- S_ISLNK(inode->i_mode)))
- return;
- if (ext4_inode_is_fast_symlink(inode))
- return;
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ if (!ext4_can_truncate(inode))
return;
- /*
- * We have to lock the EOF page here, because lock_page() nests
- * outside jbd2_journal_start().
- */
- if ((inode->i_size & (blocksize - 1)) == 0) {
- /* Block boundary? Nothing to do */
- page = NULL;
- } else {
- page = grab_cache_page(mapping,
- inode->i_size >> PAGE_CACHE_SHIFT);
- if (!page)
- return;
- }
-
if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
- ext4_ext_truncate(inode, page);
+ ext4_ext_truncate(inode);
return;
}
handle = start_transaction(inode);
- if (IS_ERR(handle)) {
- if (page) {
- clear_highpage(page);
- flush_dcache_page(page);
- unlock_page(page);
- page_cache_release(page);
- }
+ if (IS_ERR(handle))
return; /* AKPM: return what? */
- }
last_block = (inode->i_size + blocksize-1)
>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
- if (page)
- ext4_block_truncate_page(handle, page, mapping, inode->i_size);
+ if (inode->i_size & (blocksize - 1))
+ if (ext4_block_truncate_page(handle, mapping, inode->i_size))
+ goto out_stop;
n = ext4_block_to_path(inode, last_block, offsets, NULL);
if (n == 0)
@@ -2410,6 +3426,11 @@ void ext4_truncate(struct inode *inode)
goto out_stop;
/*
+ * From here we block out all ext4_get_block() callers who want to
+ * modify the block allocation tree.
+ */
+ down_write(&ei->i_data_sem);
+ /*
* The orphan list entry will now protect us from any crash which
* occurs before the truncate completes, so it is now safe to propagate
* the new, shorter inode size (held for now in i_size) into the
@@ -2418,12 +3439,6 @@ void ext4_truncate(struct inode *inode)
*/
ei->i_disksize = inode->i_size;
- /*
- * From here we block out all ext4_get_block() callers who want to
- * modify the block allocation tree.
- */
- down_write(&ei->i_data_sem);
-
if (n == 1) { /* direct blocks */
ext4_free_data(handle, inode, NULL, i_data+offsets[0],
i_data + EXT4_NDIR_BLOCKS);
@@ -3107,7 +4122,14 @@ int ext4_write_inode(struct inode *inode, int wait)
* be freed, so we have a strong guarantee that no future commit will
* leave these blocks visible to the user.)
*
- * Called with inode->sem down.
+ * Another thing we have to ensure is that if we are in ordered mode
+ * and inode is still attached to the committing transaction, we must
+ * start writeout of all the dirty pages which are being truncated.
+ * This way we are sure that all the data written in the previous
+ * transaction are already on disk (truncate waits for pages under
+ * writeback).
+ *
+ * Called with inode->i_mutex down.
*/
int ext4_setattr(struct dentry *dentry, struct iattr *attr)
{
@@ -3173,6 +4195,22 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (!error)
error = rc;
ext4_journal_stop(handle);
+
+ if (ext4_should_order_data(inode)) {
+ error = ext4_begin_ordered_truncate(inode,
+ attr->ia_size);
+ if (error) {
+ /* Do as much error cleanup as possible */
+ handle = ext4_journal_start(inode, 3);
+ if (IS_ERR(handle)) {
+ ext4_orphan_del(NULL, inode);
+ goto err_out;
+ }
+ ext4_orphan_del(handle, inode);
+ ext4_journal_stop(handle);
+ goto err_out;
+ }
+ }
}
rc = inode_setattr(inode, attr);
@@ -3193,6 +4231,32 @@ err_out:
return error;
}
+/*
+ * ext4_getattr() - fill @stat for @dentry's inode, counting delayed blocks.
+ * @mnt:    not used by this function
+ * @dentry: dentry whose inode is being queried
+ * @stat:   result buffer; filled by generic_fillattr(), then adjusted
+ *
+ * Always returns 0.
+ */
+int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		 struct kstat *stat)
+{
+	struct inode *inode;
+	/*
+	 * Held in a 64-bit variable: the count is shifted left by
+	 * s_blocksize_bits below, and doing that in an unsigned long
+	 * could wrap on 32-bit builds before the right shift by 9.
+	 */
+	unsigned long long delalloc_blocks;
+
+	inode = dentry->d_inode;
+	generic_fillattr(inode, stat);
+
+	/*
+	 * We can't update i_blocks if the block allocation is delayed,
+	 * otherwise in the case of a system crash before the real block
+	 * allocation is done, we will have i_blocks inconsistent with
+	 * on-disk file blocks.
+	 * We always keep i_blocks updated together with real
+	 * allocation.  But so as not to confuse the user, stat
+	 * will return the blocks that include the delayed allocation
+	 * blocks for this file.
+	 */
+	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+	delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+
+	stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits) >> 9;
+	return 0;
+}
/*
* How many blocks doth make a writepage()?
@@ -3506,3 +4570,64 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
return err;
}
+
+/*
+ * Callback for walk_page_buffers(): returns 1 when @bh is not mapped and
+ * 0 when it is.  @handle is not used.
+ */
+static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
+{
+	if (buffer_mapped(bh))
+		return 0;
+	return 1;
+}
+
+/*
+ * ext4_page_mkwrite() - make sure @page has blocks allocated or reserved.
+ * @vma:  vma the page is mapped through; only vma->vm_file is used
+ * @page: the page cache page in question
+ *
+ * Returns 0 if the page is already mapped to disk, if every buffer in the
+ * in-file part of the page is mapped, or once write_begin()/write_end()
+ * have succeeded.  Returns -EINVAL if the page no longer belongs to the
+ * file's mapping, lies at or beyond i_size, or is not uptodate.  Otherwise
+ * returns the negative error from write_begin() or write_end().
+ */
+int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+	loff_t size;
+	unsigned long len;
+	int ret = -EINVAL;
+	/*
+	 * Private cookie slot for write_begin()/write_end().  write_begin()
+	 * takes a void ** out-parameter; handing it NULL would make any
+	 * implementation that stores through it dereference a NULL pointer.
+	 */
+	void *fsdata = NULL;
+	struct file *file = vma->vm_file;
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct address_space *mapping = inode->i_mapping;
+
+	/*
+	 * Get i_alloc_sem to stop truncates messing with the inode. We cannot
+	 * get i_mutex because we are already holding mmap_sem.
+	 */
+	down_read(&inode->i_alloc_sem);
+	size = i_size_read(inode);
+	if (page->mapping != mapping || size <= page_offset(page)
+	    || !PageUptodate(page)) {
+		/* page got truncated from under us? */
+		goto out_unlock;
+	}
+	ret = 0;
+	if (PageMappedToDisk(page))
+		goto out_unlock;
+
+	/* Only the part of the page that lies inside i_size matters. */
+	if (page->index == size >> PAGE_CACHE_SHIFT)
+		len = size & ~PAGE_CACHE_MASK;
+	else
+		len = PAGE_CACHE_SIZE;
+
+	if (page_has_buffers(page)) {
+		/* return if we have all the buffers mapped */
+		if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+				       ext4_bh_unmapped))
+			goto out_unlock;
+	}
+	/*
+	 * OK, we need to fill the hole... Do write_begin/write_end
+	 * to do block allocation/reservation.  We are not holding
+	 * inode->i_mutex here, which allows parallel write_begin and
+	 * write_end calls.  lock_page prevents this from happening
+	 * on the same page though.
+	 */
+	ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
+			len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
+	if (ret < 0)
+		goto out_unlock;
+	ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
+			len, len, page, fsdata);
+	if (ret < 0)
+		goto out_unlock;
+	/* write_end() gave a non-negative result; callers only want 0. */
+	ret = 0;
+out_unlock:
+	up_read(&inode->i_alloc_sem);
+	return ret;
+}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c9900aade150..8d141a25bbee 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -381,22 +381,28 @@ static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
static inline int mb_find_next_zero_bit(void *addr, int max, int start)
{
- int fix = 0;
+ int fix = 0, ret, tmpmax;
addr = mb_correct_addr_and_bit(&fix, addr);
- max += fix;
+ tmpmax = max + fix;
start += fix;
- return ext4_find_next_zero_bit(addr, max, start) - fix;
+ ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
+ if (ret > max)
+ return max;
+ return ret;
}
static inline int mb_find_next_bit(void *addr, int max, int start)
{
- int fix = 0;
+ int fix = 0, ret, tmpmax;
addr = mb_correct_addr_and_bit(&fix, addr);
- max += fix;
+ tmpmax = max + fix;
start += fix;
- return ext4_find_next_bit(addr, max, start) - fix;
+ ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
+ if (ret > max)
+ return max;
+ return ret;
}
static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
@@ -803,6 +809,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
if (!buffer_uptodate(bh[i]))
goto out;
+ err = 0;
first_block = page->index * blocks_per_page;
for (i = 0; i < blocks_per_page; i++) {
int group;
@@ -883,6 +890,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
int pnum;
int poff;
struct page *page;
+ int ret;
mb_debug("load group %lu\n", group);
@@ -914,15 +922,21 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
if (page) {
BUG_ON(page->mapping != inode->i_mapping);
if (!PageUptodate(page)) {
- ext4_mb_init_cache(page, NULL);
+ ret = ext4_mb_init_cache(page, NULL);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
mb_cmp_bitmaps(e4b, page_address(page) +
(poff * sb->s_blocksize));
}
unlock_page(page);
}
}
- if (page == NULL || !PageUptodate(page))
+ if (page == NULL || !PageUptodate(page)) {
+ ret = -EIO;
goto err;
+ }
e4b->bd_bitmap_page = page;
e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
mark_page_accessed(page);
@@ -938,14 +952,20 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
if (page) {
BUG_ON(page->mapping != inode->i_mapping);
- if (!PageUptodate(page))
- ext4_mb_init_cache(page, e4b->bd_bitmap);
-
+ if (!PageUptodate(page)) {
+ ret = ext4_mb_init_cache(page, e4b->bd_bitmap);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
+ }
unlock_page(page);
}
}
- if (page == NULL || !PageUptodate(page))
+ if (page == NULL || !PageUptodate(page)) {
+ ret = -EIO;
goto err;
+ }
e4b->bd_buddy_page = page;
e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
mark_page_accessed(page);
@@ -962,7 +982,7 @@ err:
page_cache_release(e4b->bd_buddy_page);
e4b->bd_buddy = NULL;
e4b->bd_bitmap = NULL;
- return -EIO;
+ return ret;
}
static void ext4_mb_release_desc(struct ext4_buddy *e4b)
@@ -1031,7 +1051,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
}
}
-static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
+static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
int first, int count)
{
int block = 0;
@@ -1071,11 +1091,12 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
blocknr += block;
blocknr +=
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
-
+ ext4_unlock_group(sb, e4b->bd_group);
ext4_error(sb, __func__, "double-free of inode"
" %lu's block %llu(bit %u in group %lu)\n",
inode ? inode->i_ino : 0, blocknr, block,
e4b->bd_group);
+ ext4_lock_group(sb, e4b->bd_group);
}
mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
e4b->bd_info->bb_counters[order]++;
@@ -1113,8 +1134,6 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
} while (1);
}
mb_check_buddy(e4b);
-
- return 0;
}
static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
@@ -1730,10 +1749,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
spin_unlock(&sbi->s_md_lock);
}
-
- /* searching for the right group start from the goal value specified */
- group = ac->ac_g_ex.fe_group;
-
/* Let's just scan groups to find more-less suitable blocks */
cr = ac->ac_2order ? 0 : 1;
/*
@@ -1743,6 +1758,12 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
repeat:
for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
ac->ac_criteria = cr;
+ /*
+ * searching for the right group start
+ * from the goal value specified
+ */
+ group = ac->ac_g_ex.fe_group;
+
for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
struct ext4_group_info *grp;
struct ext4_group_desc *desc;
@@ -1963,6 +1984,8 @@ static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
int rc;
int size;
+ if (unlikely(sbi->s_mb_history == NULL))
+ return -ENOMEM;
s = kmalloc(sizeof(*s), GFP_KERNEL);
if (s == NULL)
return -ENOMEM;
@@ -2165,9 +2188,7 @@ static void ext4_mb_history_init(struct super_block *sb)
sbi->s_mb_history_cur = 0;
spin_lock_init(&sbi->s_mb_history_lock);
i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
- sbi->s_mb_history = kmalloc(i, GFP_KERNEL);
- if (likely(sbi->s_mb_history != NULL))
- memset(sbi->s_mb_history, 0, i);
+ sbi->s_mb_history = kzalloc(i, GFP_KERNEL);
/* if we can't allocate history, then we simple won't use it */
}
@@ -2215,21 +2236,192 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)
#define ext4_mb_history_init(sb)
#endif
+
+/*
+ * Create and initialize ext4_group_info data for the given group.
+ *
+ * @sb:    superblock of the filesystem
+ * @group: number of the group to set up
+ * @desc:  descriptor of that group; supplies bg_flags and the free count
+ *
+ * When @group is the first group of a descriptor block, a new table of
+ * ext4_group_info pointers is allocated and stored in sbi->s_group_info
+ * before the group's own structure is allocated.
+ *
+ * Returns 0 on success, -ENOMEM if either allocation fails.
+ */
+int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
+			  struct ext4_group_desc *desc)
+{
+	int i, len;
+	int metalen = 0;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	/* Slot in sbi->s_group_info that holds this group's pointer table. */
+	ext4_group_t idx = group >> EXT4_DESC_PER_BLOCK_BITS(sb);
+	struct ext4_group_info **meta_group_info;
+
+	/*
+	 * First check if this group is the first of a reserved block.
+	 * If it's true, we have to allocate a new table of pointers
+	 * to ext4_group_info structures
+	 */
+	if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
+		metalen = sizeof(*meta_group_info) <<
+			EXT4_DESC_PER_BLOCK_BITS(sb);
+		meta_group_info = kmalloc(metalen, GFP_KERNEL);
+		if (meta_group_info == NULL) {
+			printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
+			       "buddy group\n");
+			goto exit_meta_group_info;
+		}
+		sbi->s_group_info[idx] = meta_group_info;
+	}
+
+	/*
+	 * calculate needed size. if change bb_counters size,
+	 * don't forget about ext4_mb_generate_buddy()
+	 */
+	len = offsetof(typeof(**meta_group_info),
+		       bb_counters[sb->s_blocksize_bits + 2]);
+
+	meta_group_info = sbi->s_group_info[idx];
+	i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
+
+	meta_group_info[i] = kzalloc(len, GFP_KERNEL);
+	if (meta_group_info[i] == NULL) {
+		printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
+		goto exit_group_info;
+	}
+	set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
+		&(meta_group_info[i]->bb_state));
+
+	/*
+	 * initialize bb_free to be able to skip
+	 * empty groups without initialization
+	 */
+	if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+		meta_group_info[i]->bb_free =
+			ext4_free_blocks_after_init(sb, group, desc);
+	} else {
+		meta_group_info[i]->bb_free =
+			le16_to_cpu(desc->bg_free_blocks_count);
+	}
+
+	INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+
+#ifdef DOUBLE_CHECK
+	{
+		struct buffer_head *bh;
+		meta_group_info[i]->bb_bitmap =
+			kmalloc(sb->s_blocksize, GFP_KERNEL);
+		BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
+		bh = ext4_read_block_bitmap(sb, group);
+		BUG_ON(bh == NULL);
+		memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
+		       sb->s_blocksize);
+		put_bh(bh);
+	}
+#endif
+
+	return 0;
+
+exit_group_info:
+	/*
+	 * If a meta_group_info table has been allocated, release it now and
+	 * clear its slot: a stale pointer left in s_group_info could be
+	 * dereferenced or freed a second time by later code that walks
+	 * the array.
+	 */
+	if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
+		kfree(sbi->s_group_info[idx]);
+		sbi->s_group_info[idx] = NULL;
+	}
+exit_meta_group_info:
+	return -ENOMEM;
+} /* ext4_mb_add_groupinfo */
+
+/*
+ * Add a group to the existing groups.
+ * This function is used for online resize.
+ *
+ * Sets up the group's ext4_group_info through ext4_mb_add_groupinfo(), then
+ * clears the uptodate flag on whichever buddy-cache pages already exist for
+ * the group's two cache blocks (group * 2 and group * 2 + 1), so that they
+ * are re-initialized during the next call to ext4_mb_load_buddy.
+ *
+ * Returns 0 on success, or the error from ext4_mb_add_groupinfo().
+ */
+int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
+			       struct ext4_group_desc *desc)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct inode *buddy_inode = sbi->s_buddy_cache;
+	int blocks_per_page;
+	int first_block;
+	int ret;
+	int i;
+
+	/* Add group based on group descriptor */
+	ret = ext4_mb_add_groupinfo(sb, group, desc);
+	if (ret)
+		return ret;
+
+	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+	first_block = group * 2;
+
+	/* Two consecutive cache blocks per group; both may share one page. */
+	for (i = 0; i < 2; i++) {
+		struct page *page;
+
+		page = find_get_page(buddy_inode->i_mapping,
+				     (first_block + i) / blocks_per_page);
+		if (page == NULL)
+			continue;
+		ClearPageUptodate(page);
+		page_cache_release(page);
+	}
+
+	return 0;
+}
+
+/*
+ * Update an existing group: adjust its bb_free count by @add.
+ * This function is used for online resize.
+ */
+void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
+{
+	grp->bb_free = grp->bb_free + add;
+}
+
static int ext4_mb_init_backend(struct super_block *sb)
{
ext4_group_t i;
- int j, len, metalen;
+ int metalen;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- int num_meta_group_infos =
- (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >>
- EXT4_DESC_PER_BLOCK_BITS(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ int num_meta_group_infos;
+ int num_meta_group_infos_max;
+ int array_size;
struct ext4_group_info **meta_group_info;
+ struct ext4_group_desc *desc;
+
+ /* This is the number of blocks used by GDT */
+ num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) -
+ 1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
+
+ /*
+ * This is the total number of blocks used by GDT including
+ * the number of reserved blocks for GDT.
+ * The s_group_info array is allocated with this value
+ * to allow a clean online resize without a complex
+ * manipulation of pointer.
+ * The drawback is the unused memory when no resize
+ * occurs but it's very low in terms of pages
+ * (see comments below)
+ * Need to handle this properly when META_BG resizing is allowed
+ */
+ num_meta_group_infos_max = num_meta_group_infos +
+ le16_to_cpu(es->s_reserved_gdt_blocks);
+ /*
+ * array_size is the size of s_group_info array. We round it
+ * to the next power of two because this approximation is done
+ * internally by kmalloc so we can have some more memory
+ * for free here (e.g. may be used for META_BG resize).
+ */
+ array_size = 1;
+ while (array_size < sizeof(*sbi->s_group_info) *
+ num_meta_group_infos_max)
+ array_size = array_size << 1;
/* An 8TB filesystem with 64-bit pointers requires a 4096 byte
* kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
* So a two level scheme suffices for now. */
- sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) *
- num_meta_group_infos, GFP_KERNEL);
+ sbi->s_group_info = kmalloc(array_size, GFP_KERNEL);
if (sbi->s_group_info == NULL) {
printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
return -ENOMEM;
@@ -2256,63 +2448,15 @@ static int ext4_mb_init_backend(struct super_block *sb)
sbi->s_group_info[i] = meta_group_info;
}
- /*
- * calculate needed size. if change bb_counters size,
- * don't forget about ext4_mb_generate_buddy()
- */
- len = sizeof(struct ext4_group_info);
- len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
for (i = 0; i < sbi->s_groups_count; i++) {
- struct ext4_group_desc *desc;
-
- meta_group_info =
- sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)];
- j = i & (EXT4_DESC_PER_BLOCK(sb) - 1);
-
- meta_group_info[j] = kzalloc(len, GFP_KERNEL);
- if (meta_group_info[j] == NULL) {
- printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
- goto err_freebuddy;
- }
desc = ext4_get_group_desc(sb, i, NULL);
if (desc == NULL) {
printk(KERN_ERR
"EXT4-fs: can't read descriptor %lu\n", i);
- i++;
goto err_freebuddy;
}
- memset(meta_group_info[j], 0, len);
- set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
- &(meta_group_info[j]->bb_state));
-
- /*
- * initialize bb_free to be able to skip
- * empty groups without initialization
- */
- if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
- meta_group_info[j]->bb_free =
- ext4_free_blocks_after_init(sb, i, desc);
- } else {
- meta_group_info[j]->bb_free =
- le16_to_cpu(desc->bg_free_blocks_count);
- }
-
- INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list);
-
-#ifdef DOUBLE_CHECK
- {
- struct buffer_head *bh;
- meta_group_info[j]->bb_bitmap =
- kmalloc(sb->s_blocksize, GFP_KERNEL);
- BUG_ON(meta_group_info[j]->bb_bitmap == NULL);
- bh = read_block_bitmap(sb, i);
- BUG_ON(bh == NULL);
- memcpy(meta_group_info[j]->bb_bitmap, bh->b_data,
- sb->s_blocksize);
- put_bh(bh);
- }
-#endif
-
+ if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
+ goto err_freebuddy;
}
return 0;
@@ -2336,6 +2480,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
unsigned i;
unsigned offset;
unsigned max;
+ int ret;
if (!test_opt(sb, MBALLOC))
return 0;
@@ -2370,12 +2515,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
} while (i <= sb->s_blocksize_bits + 1);
/* init file for buddy data */
- i = ext4_mb_init_backend(sb);
- if (i) {
+ ret = ext4_mb_init_backend(sb);
+ if (ret != 0) {
clear_opt(sbi->s_mount_opt, MBALLOC);
kfree(sbi->s_mb_offsets);
kfree(sbi->s_mb_maxs);
- return i;
+ return ret;
}
spin_lock_init(&sbi->s_md_lock);
@@ -2548,8 +2693,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
ext4_lock_group(sb, md->group);
for (i = 0; i < md->num; i++) {
mb_debug(" %u", md->blocks[i]);
- err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
- BUG_ON(err != 0);
+ mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
}
mb_debug("\n");
ext4_unlock_group(sb, md->group);
@@ -2575,25 +2719,24 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
-#define MB_PROC_VALUE_READ(name) \
-static int ext4_mb_read_##name(char *page, char **start, \
- off_t off, int count, int *eof, void *data) \
+#define MB_PROC_FOPS(name) \
+static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v) \
{ \
- struct ext4_sb_info *sbi = data; \
- int len; \
- *eof = 1; \
- if (off != 0) \
- return 0; \
- len = sprintf(page, "%ld\n", sbi->s_mb_##name); \
- *start = page; \
- return len; \
-}
-
-#define MB_PROC_VALUE_WRITE(name) \
-static int ext4_mb_write_##name(struct file *file, \
- const char __user *buf, unsigned long cnt, void *data) \
+ struct ext4_sb_info *sbi = m->private; \
+ \
+ seq_printf(m, "%ld\n", sbi->s_mb_##name); \
+ return 0; \
+} \
+ \
+static int ext4_mb_##name##_proc_open(struct inode *inode, struct file *file)\
+{ \
+ return single_open(file, ext4_mb_##name##_proc_show, PDE(inode)->data);\
+} \
+ \
+static ssize_t ext4_mb_##name##_proc_write(struct file *file, \
+ const char __user *buf, size_t cnt, loff_t *ppos) \
{ \
- struct ext4_sb_info *sbi = data; \
+ struct ext4_sb_info *sbi = PDE(file->f_path.dentry->d_inode)->data;\
char str[32]; \
long value; \
if (cnt >= sizeof(str)) \
@@ -2605,31 +2748,32 @@ static int ext4_mb_write_##name(struct file *file, \
return -ERANGE; \
sbi->s_mb_##name = value; \
return cnt; \
-}
+} \
+ \
+static const struct file_operations ext4_mb_##name##_proc_fops = { \
+ .owner = THIS_MODULE, \
+ .open = ext4_mb_##name##_proc_open, \
+ .read = seq_read, \
+ .llseek = seq_lseek, \
+ .release = single_release, \
+ .write = ext4_mb_##name##_proc_write, \
+};
-MB_PROC_VALUE_READ(stats);
-MB_PROC_VALUE_WRITE(stats);
-MB_PROC_VALUE_READ(max_to_scan);
-MB_PROC_VALUE_WRITE(max_to_scan);
-MB_PROC_VALUE_READ(min_to_scan);
-MB_PROC_VALUE_WRITE(min_to_scan);
-MB_PROC_VALUE_READ(order2_reqs);
-MB_PROC_VALUE_WRITE(order2_reqs);
-MB_PROC_VALUE_READ(stream_request);
-MB_PROC_VALUE_WRITE(stream_request);
-MB_PROC_VALUE_READ(group_prealloc);
-MB_PROC_VALUE_WRITE(group_prealloc);
+MB_PROC_FOPS(stats);
+MB_PROC_FOPS(max_to_scan);
+MB_PROC_FOPS(min_to_scan);
+MB_PROC_FOPS(order2_reqs);
+MB_PROC_FOPS(stream_request);
+MB_PROC_FOPS(group_prealloc);
#define MB_PROC_HANDLER(name, var) \
do { \
- proc = create_proc_entry(name, mode, sbi->s_mb_proc); \
+ proc = proc_create_data(name, mode, sbi->s_mb_proc, \
+ &ext4_mb_##var##_proc_fops, sbi); \
if (proc == NULL) { \
printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \
goto err_out; \
} \
- proc->data = sbi; \
- proc->read_proc = ext4_mb_read_##var ; \
- proc->write_proc = ext4_mb_write_##var; \
} while (0)
static int ext4_mb_init_per_dev_proc(struct super_block *sb)
@@ -2639,6 +2783,10 @@ static int ext4_mb_init_per_dev_proc(struct super_block *sb)
struct proc_dir_entry *proc;
char devname[64];
+ if (proc_root_ext4 == NULL) {
+ sbi->s_mb_proc = NULL;
+ return -EINVAL;
+ }
bdevname(sb->s_bdev, devname);
sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
@@ -2747,7 +2895,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
err = -EIO;
- bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group);
+ bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
if (!bitmap_bh)
goto out_err;
@@ -2816,7 +2964,23 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
- percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
+
+ /*
+ * free blocks account has already be reduced/reserved
+ * at write_begin() time for delayed allocation
+ * do not double accounting
+ */
+ if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
+ percpu_counter_sub(&sbi->s_freeblocks_counter,
+ ac->ac_b_ex.fe_len);
+
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi,
+ ac->ac_b_ex.fe_group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
err = ext4_journal_dirty_metadata(handle, bitmap_bh);
if (err)
@@ -3473,8 +3637,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
if (bit >= end)
break;
next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
- if (next > end)
- next = end;
start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
le32_to_cpu(sbi->s_es->s_first_data_block);
mb_debug(" free preallocated %u/%u in group %u\n",
@@ -3569,7 +3731,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
if (list_empty(&grp->bb_prealloc_list))
return 0;
- bitmap_bh = read_block_bitmap(sb, group);
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
if (bitmap_bh == NULL) {
/* error handling here */
ext4_mb_release_desc(&e4b);
@@ -3743,7 +3905,7 @@ repeat:
err = ext4_mb_load_buddy(sb, group, &e4b);
BUG_ON(err != 0); /* error handling here */
- bitmap_bh = read_block_bitmap(sb, group);
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
if (bitmap_bh == NULL) {
/* error handling here */
ext4_mb_release_desc(&e4b);
@@ -4011,10 +4173,21 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
sbi = EXT4_SB(sb);
if (!test_opt(sb, MBALLOC)) {
- block = ext4_new_blocks_old(handle, ar->inode, ar->goal,
+ block = ext4_old_new_blocks(handle, ar->inode, ar->goal,
&(ar->len), errp);
return block;
}
+ if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
+ /*
+ * No delalloc reservation was made, so check the free
+ * blocks now. With delalloc we already reserved the blocks.
+ */
+ ar->len = ext4_has_free_blocks(sbi, ar->len);
+ }
+
+ if (ar->len == 0) {
+ *errp = -ENOSPC;
+ return 0;
+ }
while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
ar->flags |= EXT4_MB_HINT_NOPREALLOC;
@@ -4026,10 +4199,14 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
}
inquota = ar->len;
+ if (EXT4_I(ar->inode)->i_delalloc_reserved_flag)
+ ar->flags |= EXT4_MB_DELALLOC_RESERVED;
+
ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
if (!ac) {
+ ar->len = 0;
*errp = -ENOMEM;
- return 0;
+ goto out1;
}
ext4_mb_poll_new_transaction(sb, handle);
@@ -4037,12 +4214,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
*errp = ext4_mb_initialize_context(ac, ar);
if (*errp) {
ar->len = 0;
- goto out;
+ goto out2;
}
ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
if (!ext4_mb_use_preallocated(ac)) {
-
ac->ac_op = EXT4_MB_HISTORY_ALLOC;
ext4_mb_normalize_request(ac, ar);
repeat:
@@ -4085,11 +4261,12 @@ repeat:
ext4_mb_release_context(ac);
-out:
+out2:
+ kmem_cache_free(ext4_ac_cachep, ac);
+out1:
if (ar->len < inquota)
DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
- kmem_cache_free(ext4_ac_cachep, ac);
return block;
}
static void ext4_mb_poll_new_transaction(struct super_block *sb,
@@ -4242,7 +4419,7 @@ do_more:
overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
count -= overflow;
}
- bitmap_bh = read_block_bitmap(sb, block_group);
+ bitmap_bh = ext4_read_block_bitmap(sb, block_group);
if (!bitmap_bh)
goto error_return;
gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
@@ -4309,10 +4486,9 @@ do_more:
ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
} else {
ext4_lock_group(sb, block_group);
- err = mb_free_blocks(inode, &e4b, bit, count);
+ mb_free_blocks(inode, &e4b, bit, count);
ext4_mb_return_to_preallocation(inode, &e4b, block, count);
ext4_unlock_group(sb, block_group);
- BUG_ON(err != 0);
}
spin_lock(sb_bgl_lock(sbi, block_group));
@@ -4321,6 +4497,13 @@ do_more:
spin_unlock(sb_bgl_lock(sbi, block_group));
percpu_counter_add(&sbi->s_freeblocks_counter, count);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_blocks += count;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
+
ext4_mb_release_desc(&e4b);
*freed += count;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index ab16beaa830d..387ad98350c3 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -183,6 +183,16 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
struct inode *inode);
/*
+ * p is at least 6 bytes before the end of page
+ */
+static inline struct ext4_dir_entry_2 *
+ext4_next_entry(struct ext4_dir_entry_2 *p)
+{
+ return (struct ext4_dir_entry_2 *)((char *)p +
+ ext4_rec_len_from_disk(p->rec_len));
+}
+
+/*
* Future: use high four bits of block for coalesce-on-delete flags
* Mask them off for now.
*/
@@ -231,13 +241,13 @@ static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
{
unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
EXT4_DIR_REC_LEN(2) - infosize;
- return 0? 20: entry_space / sizeof(struct dx_entry);
+ return entry_space / sizeof(struct dx_entry);
}
static inline unsigned dx_node_limit (struct inode *dir)
{
unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
- return 0? 22: entry_space / sizeof(struct dx_entry);
+ return entry_space / sizeof(struct dx_entry);
}
/*
@@ -554,15 +564,6 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
/*
- * p is at least 6 bytes before the end of page
- */
-static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p)
-{
- return (struct ext4_dir_entry_2 *)((char *)p +
- ext4_rec_len_from_disk(p->rec_len));
-}
-
-/*
* This function fills a red-black tree with information from a
* directory block. It returns the number directory entries loaded
* into the tree. If there is an error it is returned in err.
@@ -993,19 +994,21 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
de = (struct ext4_dir_entry_2 *) bh->b_data;
top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
EXT4_DIR_REC_LEN(0));
- for (; de < top; de = ext4_next_entry(de))
- if (ext4_match (namelen, name, de)) {
- if (!ext4_check_dir_entry("ext4_find_entry",
- dir, de, bh,
- (block<<EXT4_BLOCK_SIZE_BITS(sb))
- +((char *)de - bh->b_data))) {
- brelse (bh);
+ for (; de < top; de = ext4_next_entry(de)) {
+ int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
+ + ((char *) de - bh->b_data);
+
+ if (!ext4_check_dir_entry(__func__, dir, de, bh, off)) {
+ brelse(bh);
*err = ERR_BAD_DX_DIR;
goto errout;
}
- *res_dir = de;
- dx_release (frames);
- return bh;
+
+ if (ext4_match(namelen, name, de)) {
+ *res_dir = de;
+ dx_release(frames);
+ return bh;
+ }
}
brelse (bh);
/* Check to see if we should continue to search */
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 9ff7b1c04239..f000fbe2cd93 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -866,6 +866,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
/*
+ * We can allocate memory for mb_alloc based on the new group
+ * descriptor
+ */
+ if (test_opt(sb, MBALLOC)) {
+ err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
+ if (err)
+ goto exit_journal;
+ }
+ /*
* Make the new blocks and inodes valid next. We do this before
* increasing the group count so that once the group is enabled,
* all of its blocks and inodes are already valid.
@@ -957,6 +966,8 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
handle_t *handle;
int err;
unsigned long freed_blocks;
+ ext4_group_t group;
+ struct ext4_group_info *grp;
/* We don't need to worry about locking wrt other resizers just
* yet: we're going to revalidate es->s_blocks_count after
@@ -988,7 +999,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
}
/* Handle the remaining blocks in the last group only. */
- ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last);
+ ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
if (last == 0) {
ext4_warning(sb, __func__,
@@ -1060,6 +1071,45 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
o_blocks_count + add);
if ((err = ext4_journal_stop(handle)))
goto exit_put;
+
+ /*
+ * Mark mballoc pages as not up to date so that they will be updated
+ * next time they are loaded by ext4_mb_load_buddy.
+ */
+ if (test_opt(sb, MBALLOC)) {
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode = sbi->s_buddy_cache;
+ int blocks_per_page;
+ int block;
+ int pnum;
+ struct page *page;
+
+ /* Set buddy page as not up to date */
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ block = group * 2;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ /* Set bitmap page as not up to date */
+ block++;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ /* Get the info on the last group */
+ grp = ext4_get_group_info(sb, group);
+
+ /* Update free blocks in group info */
+ ext4_mb_update_group_info(grp, add);
+ }
+
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
ext4_blocks_count(es));
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 02bf24343979..1cb371dcd609 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -506,6 +506,7 @@ static void ext4_put_super (struct super_block * sb)
ext4_ext_release(sb);
ext4_xattr_put_super(sb);
jbd2_journal_destroy(sbi->s_journal);
+ sbi->s_journal = NULL;
if (!(sb->s_flags & MS_RDONLY)) {
EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -517,6 +518,7 @@ static void ext4_put_super (struct super_block * sb)
for (i = 0; i < sbi->s_gdb_count; i++)
brelse(sbi->s_group_desc[i]);
kfree(sbi->s_group_desc);
+ kfree(sbi->s_flex_groups);
percpu_counter_destroy(&sbi->s_freeblocks_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -571,6 +573,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
INIT_LIST_HEAD(&ei->i_prealloc_list);
spin_lock_init(&ei->i_prealloc_lock);
+ jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
+ ei->i_reserved_data_blocks = 0;
+ ei->i_reserved_meta_blocks = 0;
+ ei->i_allocated_meta_blocks = 0;
+ ei->i_delalloc_reserved_flag = 0;
+ spin_lock_init(&(ei->i_block_reservation_lock));
return &ei->vfs_inode;
}
@@ -635,6 +643,8 @@ static void ext4_clear_inode(struct inode *inode)
EXT4_I(inode)->i_block_alloc_info = NULL;
if (unlikely(rsv))
kfree(rsv);
+ jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
+ &EXT4_I(inode)->jinode);
}
static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb)
@@ -671,7 +681,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
unsigned long def_mount_opts;
struct super_block *sb = vfs->mnt_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- journal_t *journal = sbi->s_journal;
struct ext4_super_block *es = sbi->s_es;
def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
@@ -747,6 +756,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, ",nomballoc");
if (test_opt(sb, I_VERSION))
seq_puts(seq, ",i_version");
+ if (!test_opt(sb, DELALLOC))
+ seq_puts(seq, ",nodelalloc");
+
if (sbi->s_stripe)
seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
@@ -894,7 +906,7 @@ enum {
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
- Opt_mballoc, Opt_nomballoc, Opt_stripe,
+ Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
};
static match_table_t tokens = {
@@ -953,6 +965,8 @@ static match_table_t tokens = {
{Opt_nomballoc, "nomballoc"},
{Opt_stripe, "stripe=%u"},
{Opt_resize, "resize"},
+ {Opt_delalloc, "delalloc"},
+ {Opt_nodelalloc, "nodelalloc"},
{Opt_err, NULL},
};
@@ -990,6 +1004,7 @@ static int parse_options (char *options, struct super_block *sb,
int qtype, qfmt;
char *qname;
#endif
+ ext4_fsblk_t last_block;
if (!options)
return 1;
@@ -1309,15 +1324,39 @@ set_qf_format:
clear_opt(sbi->s_mount_opt, NOBH);
break;
case Opt_extents:
+ if (!EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ ext4_warning(sb, __func__,
+ "extents feature not enabled "
+ "on this filesystem, use tune2fs\n");
+ return 0;
+ }
set_opt (sbi->s_mount_opt, EXTENTS);
break;
case Opt_noextents:
+ /*
+ * When e2fsprogs supports resizing an already existing
+ * ext3 file system to greater than 2**32 blocks, we need
+ * to add support to the block allocator to handle growing
+ * already existing block-mapped inodes so that blocks
+ * allocated for them fall within 2**32.
+ */
+ last_block = ext4_blocks_count(sbi->s_es) - 1;
+ if (last_block > 0xffffffffULL) {
+ printk(KERN_ERR "EXT4-fs: Filesystem too "
+ "large to mount with "
+ "-o noextents options\n");
+ return 0;
+ }
clear_opt (sbi->s_mount_opt, EXTENTS);
break;
case Opt_i_version:
set_opt(sbi->s_mount_opt, I_VERSION);
sb->s_flags |= MS_I_VERSION;
break;
+ case Opt_nodelalloc:
+ clear_opt(sbi->s_mount_opt, DELALLOC);
+ break;
case Opt_mballoc:
set_opt(sbi->s_mount_opt, MBALLOC);
break;
@@ -1331,6 +1370,9 @@ set_qf_format:
return 0;
sbi->s_stripe = option;
break;
+ case Opt_delalloc:
+ set_opt(sbi->s_mount_opt, DELALLOC);
+ break;
default:
printk (KERN_ERR
"EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1443,6 +1485,54 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
return res;
}
+/*
+ * ext4_fill_flex_info - build the in-memory flex group summary
+ * @sb: super block being set up
+ *
+ * Copies s_log_groups_per_flex from the on-disk super block, allocates
+ * the zeroed sbi->s_flex_groups array (one entry per flex group) and adds
+ * every block group's free inode and free block counts to the entry of
+ * the flex group it belongs to.
+ *
+ * Returns 1 on success, also when the super block records no flex group
+ * size; returns 0 if the array cannot be allocated or a group descriptor
+ * cannot be looked up.  On failure sbi->s_flex_groups is left NULL.
+ */
+static int ext4_fill_flex_info(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_group_desc *gdp;
+	struct buffer_head *bh;
+	ext4_group_t flex_group_count;
+	ext4_group_t flex_group;
+	ext4_group_t i;
+	int groups_per_flex;
+
+	if (!sbi->s_es->s_log_groups_per_flex) {
+		sbi->s_log_groups_per_flex = 0;
+		return 1;
+	}
+
+	sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
+	groups_per_flex = 1 << sbi->s_log_groups_per_flex;
+
+	/* round up so a partial trailing flex group gets its own entry */
+	flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) /
+				groups_per_flex;
+	/* kcalloc() zeroes the array and checks the size multiplication */
+	sbi->s_flex_groups = kcalloc(flex_group_count,
+				     sizeof(struct flex_groups), GFP_KERNEL);
+	if (sbi->s_flex_groups == NULL) {
+		printk(KERN_ERR "EXT4-fs: not enough memory\n");
+		return 0;
+	}
+
+	for (i = 0; i < sbi->s_groups_count; i++) {
+		gdp = ext4_get_group_desc(sb, i, &bh);
+		if (gdp == NULL) {
+			/* do not leave a half-filled table behind */
+			kfree(sbi->s_flex_groups);
+			sbi->s_flex_groups = NULL;
+			return 0;
+		}
+
+		flex_group = ext4_flex_group(sbi, i);
+		sbi->s_flex_groups[flex_group].free_inodes +=
+			le16_to_cpu(gdp->bg_free_inodes_count);
+		sbi->s_flex_groups[flex_group].free_blocks +=
+			le16_to_cpu(gdp->bg_free_blocks_count);
+	}
+
+	return 1;
+}
+
__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
struct ext4_group_desc *gdp)
{
@@ -1810,8 +1900,8 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
}
static int ext4_fill_super (struct super_block *sb, void *data, int silent)
- __releases(kernel_sem)
- __acquires(kernel_sem)
+ __releases(kernel_lock)
+ __acquires(kernel_lock)
{
struct buffer_head * bh;
@@ -1851,11 +1941,6 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
goto out_fail;
}
- if (!sb_set_blocksize(sb, blocksize)) {
- printk(KERN_ERR "EXT4-fs: bad blocksize %d.\n", blocksize);
- goto out_fail;
- }
-
/*
* The ext4 superblock will not be buffer aligned for other than 1kB
* block sizes. We need to calculate the offset from buffer start.
@@ -1919,15 +2004,28 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
/*
* turn on extents feature by default in ext4 filesystem
- * User -o noextents to turn it off
+ * only if feature flag already set by mkfs or tune2fs.
+ * Use -o noextents to turn it off
*/
- set_opt(sbi->s_mount_opt, EXTENTS);
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
+ set_opt(sbi->s_mount_opt, EXTENTS);
+ else
+ ext4_warning(sb, __func__,
+ "extents feature not enabled on this filesystem, "
+ "use tune2fs.\n");
/*
- * turn on mballoc feature by default in ext4 filesystem
- * User -o nomballoc to turn it off
+ * turn on mballoc code by default in ext4 filesystem
+ * Use -o nomballoc to turn it off
*/
set_opt(sbi->s_mount_opt, MBALLOC);
+ /*
+ * enable delayed allocation by default
+ * Use -o nodelalloc to turn it off
+ */
+ set_opt(sbi->s_mount_opt, DELALLOC);
+
+
if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
NULL, 0))
goto failed_mount;
@@ -2138,6 +2236,14 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
goto failed_mount2;
}
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+ if (!ext4_fill_flex_info(sb)) {
+ printk(KERN_ERR
+ "EXT4-fs: unable to initialize "
+ "flex_bg meta info!\n");
+ goto failed_mount2;
+ }
+
sbi->s_gdb_count = db_count;
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
spin_lock_init(&sbi->s_next_gen_lock);
@@ -2358,6 +2464,13 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
"writeback");
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+ printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
+ "requested data journaling mode\n");
+ clear_opt(sbi->s_mount_opt, DELALLOC);
+ } else if (test_opt(sb, DELALLOC))
+ printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
+
ext4_ext_init(sb);
ext4_mb_init(sb, needs_recovery);
@@ -2372,6 +2485,7 @@ cantfind_ext4:
failed_mount4:
jbd2_journal_destroy(sbi->s_journal);
+ sbi->s_journal = NULL;
failed_mount3:
percpu_counter_destroy(&sbi->s_freeblocks_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
@@ -3325,7 +3439,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
err = ext4_journal_dirty_metadata(handle, bh);
else {
/* Always do at least ordered writes for quotas */
- err = ext4_journal_dirty_data(handle, bh);
+ err = ext4_jbd2_file_inode(handle, inode);
mark_buffer_dirty(bh);
}
brelse(bh);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index ff08633f398e..93c5fdcdad2e 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -810,7 +810,7 @@ inserted:
/* We need to allocate a new block */
ext4_fsblk_t goal = ext4_group_first_block_no(sb,
EXT4_I(inode)->i_block_group);
- ext4_fsblk_t block = ext4_new_block(handle, inode,
+ ext4_fsblk_t block = ext4_new_meta_block(handle, inode,
goal, &error);
if (error)
goto cleanup;
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index fff33382cadc..ac1a52cf2a37 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -13,13 +13,11 @@
#include "ext4.h"
#include "xattr.h"
-#define XATTR_TRUSTED_PREFIX "trusted."
-
static size_t
ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
const char *name, size_t name_len)
{
- const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
+ const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
if (!capable(CAP_SYS_ADMIN))
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index 67be723fcc4e..d91aa61b42aa 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -12,13 +12,11 @@
#include "ext4.h"
#include "xattr.h"
-#define XATTR_USER_PREFIX "user."
-
static size_t
ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size,
const char *name, size_t name_len)
{
- const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1;
+ const size_t prefix_len = XATTR_USER_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
if (!test_opt(inode->i_sb, XATTR_USER))
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index fda25479af26..3a9ecac8d61f 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -61,7 +61,7 @@ void fat_cache_destroy(void)
static inline struct fat_cache *fat_cache_alloc(struct inode *inode)
{
- return kmem_cache_alloc(fat_cache_cachep, GFP_KERNEL);
+ return kmem_cache_alloc(fat_cache_cachep, GFP_NOFS);
}
static inline void fat_cache_free(struct fat_cache *cache)
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 486725ee99ae..34541d06e626 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -472,7 +472,7 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
loff_t cpos;
int ret = 0;
- lock_kernel();
+ lock_super(sb);
cpos = filp->f_pos;
/* Fake . and .. for the root directory. */
@@ -654,7 +654,7 @@ FillFailed:
if (unicode)
__putname(unicode);
out:
- unlock_kernel();
+ unlock_super(sb);
return ret;
}
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 771326b8047e..c672df4036e9 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -11,7 +11,6 @@
#include <linux/mount.h>
#include <linux/time.h>
#include <linux/msdos_fs.h>
-#include <linux/smp_lock.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
@@ -242,9 +241,7 @@ void fat_truncate(struct inode *inode)
nr_clusters = (inode->i_size + (cluster_size - 1)) >> sbi->cluster_bits;
- lock_kernel();
fat_free(inode, nr_clusters);
- unlock_kernel();
fat_flush_inodes(inode->i_sb, inode, NULL);
}
@@ -310,8 +307,6 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
int error = 0;
unsigned int ia_valid;
- lock_kernel();
-
/*
* Expand the file. Since inode_setattr() updates ->i_size
* before calling the ->truncate(), but FAT needs to fill the
@@ -366,7 +361,6 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
error = inode_setattr(inode, attr);
out:
- unlock_kernel();
return error;
}
EXPORT_SYMBOL_GPL(fat_setattr);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 4e0a3dd9d677..46a4508ffd2e 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -440,14 +440,13 @@ static void fat_delete_inode(struct inode *inode)
static void fat_clear_inode(struct inode *inode)
{
- struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+ struct super_block *sb = inode->i_sb;
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
- lock_kernel();
spin_lock(&sbi->inode_hash_lock);
fat_cache_inval_inode(inode);
hlist_del_init(&MSDOS_I(inode)->i_fat_hash);
spin_unlock(&sbi->inode_hash_lock);
- unlock_kernel();
}
static void fat_write_super(struct super_block *sb)
@@ -485,7 +484,7 @@ static struct kmem_cache *fat_inode_cachep;
static struct inode *fat_alloc_inode(struct super_block *sb)
{
struct msdos_inode_info *ei;
- ei = kmem_cache_alloc(fat_inode_cachep, GFP_KERNEL);
+ ei = kmem_cache_alloc(fat_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
return &ei->vfs_inode;
@@ -567,7 +566,7 @@ retry:
if (inode->i_ino == MSDOS_ROOT_INO || !i_pos)
return 0;
- lock_kernel();
+ lock_super(sb);
bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits);
if (!bh) {
printk(KERN_ERR "FAT: unable to read inode block "
@@ -579,7 +578,7 @@ retry:
if (i_pos != MSDOS_I(inode)->i_pos) {
spin_unlock(&sbi->inode_hash_lock);
brelse(bh);
- unlock_kernel();
+ unlock_super(sb);
goto retry;
}
@@ -606,7 +605,7 @@ retry:
err = sync_dirty_buffer(bh);
brelse(bh);
out:
- unlock_kernel();
+ unlock_super(sb);
return err;
}
@@ -736,6 +735,7 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable)
static struct dentry *fat_get_parent(struct dentry *child)
{
+ struct super_block *sb = child->d_sb;
struct buffer_head *bh;
struct msdos_dir_entry *de;
loff_t i_pos;
@@ -743,14 +743,14 @@ static struct dentry *fat_get_parent(struct dentry *child)
struct inode *inode;
int err;
- lock_kernel();
+ lock_super(sb);
err = fat_get_dotdot_entry(child->d_inode, &bh, &de, &i_pos);
if (err) {
parent = ERR_PTR(err);
goto out;
}
- inode = fat_build_inode(child->d_sb, de, i_pos);
+ inode = fat_build_inode(sb, de, i_pos);
brelse(bh);
if (IS_ERR(inode)) {
parent = ERR_CAST(inode);
@@ -762,7 +762,7 @@ static struct dentry *fat_get_parent(struct dentry *child)
parent = ERR_PTR(-ENOMEM);
}
out:
- unlock_kernel();
+ unlock_super(sb);
return parent;
}
@@ -1172,6 +1172,12 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
long error;
char buf[50];
+ /*
+ * GFP_KERNEL is ok here, because while we do hold the
+ * superblock lock, memory pressure can't call back into
+ * the filesystem, since we're only just about to mount
+ * it and have no inodes etc active!
+ */
sbi = kzalloc(sizeof(struct msdos_sb_info), GFP_KERNEL);
if (!sbi)
return -ENOMEM;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index bfd776509a72..330a7d782591 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -12,7 +12,6 @@
#include <linux/fdtable.h>
#include <linux/capability.h>
#include <linux/dnotify.h>
-#include <linux/smp_lock.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/security.h>
@@ -227,7 +226,6 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
if (error)
return error;
- lock_kernel();
if ((arg ^ filp->f_flags) & FASYNC) {
if (filp->f_op && filp->f_op->fasync) {
error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
@@ -238,7 +236,6 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
out:
- unlock_kernel();
return error;
}
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 7f7947e3dfbb..ab2f57e3fb87 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -14,23 +14,11 @@ config GFS2_FS
GFS is perfect consistency -- changes made to the filesystem on one
machine show up immediately on all other machines in the cluster.
- To use the GFS2 filesystem, you will need to enable one or more of
- the below locking modules. Documentation and utilities for GFS2 can
+ To use the GFS2 filesystem in a cluster, you will need to enable
+ the locking module below. Documentation and utilities for GFS2 can
be found here: http://sources.redhat.com/cluster
-config GFS2_FS_LOCKING_NOLOCK
- tristate "GFS2 \"nolock\" locking module"
- depends on GFS2_FS
- help
- Single node locking module for GFS2.
-
- Use this module if you want to use GFS2 on a single node without
- its clustering features. You can still take advantage of the
- large file support, and upgrade to running a full cluster later on
- if required.
-
- If you will only be using GFS2 in cluster mode, you do not need this
- module.
+ The "nolock" lock module is now built in to GFS2 by default.
config GFS2_FS_LOCKING_DLM
tristate "GFS2 DLM locking module"
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index e2350df02a07..ec65851ec80a 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -5,6 +5,5 @@ gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \
ops_fstype.o ops_inode.o ops_super.o quota.o \
recovery.o rgrp.o super.o sys.o trans.o util.o
-obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/
obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/
diff --git a/fs/gfs2/gfs2.h b/fs/gfs2/gfs2.h
index 3bb11c0f8b56..ef606e3a5cf4 100644
--- a/fs/gfs2/gfs2.h
+++ b/fs/gfs2/gfs2.h
@@ -16,11 +16,6 @@ enum {
};
enum {
- NO_WAIT = 0,
- WAIT = 1,
-};
-
-enum {
NO_FORCE = 0,
FORCE = 1,
};
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index d636b3e80f5d..13391e546616 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -45,21 +45,19 @@ struct gfs2_gl_hash_bucket {
struct hlist_head hb_list;
};
-struct glock_iter {
- int hash; /* hash bucket index */
- struct gfs2_sbd *sdp; /* incore superblock */
- struct gfs2_glock *gl; /* current glock struct */
- struct seq_file *seq; /* sequence file for debugfs */
- char string[512]; /* scratch space */
+struct gfs2_glock_iter {
+ int hash; /* hash bucket index */
+ struct gfs2_sbd *sdp; /* incore superblock */
+ struct gfs2_glock *gl; /* current glock struct */
+ char string[512]; /* scratch space */
};
typedef void (*glock_examiner) (struct gfs2_glock * gl);
static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
-static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl);
-static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh);
-static void gfs2_glock_drop_th(struct gfs2_glock *gl);
-static void run_queue(struct gfs2_glock *gl);
+static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
+#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0)
+static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
static DECLARE_RWSEM(gfs2_umount_flush_sem);
static struct dentry *gfs2_root;
@@ -123,33 +121,6 @@ static inline rwlock_t *gl_lock_addr(unsigned int x)
#endif
/**
- * relaxed_state_ok - is a requested lock compatible with the current lock mode?
- * @actual: the current state of the lock
- * @requested: the lock state that was requested by the caller
- * @flags: the modifier flags passed in by the caller
- *
- * Returns: 1 if the locks are compatible, 0 otherwise
- */
-
-static inline int relaxed_state_ok(unsigned int actual, unsigned requested,
- int flags)
-{
- if (actual == requested)
- return 1;
-
- if (flags & GL_EXACT)
- return 0;
-
- if (actual == LM_ST_EXCLUSIVE && requested == LM_ST_SHARED)
- return 1;
-
- if (actual != LM_ST_UNLOCKED && (flags & LM_FLAG_ANY))
- return 1;
-
- return 0;
-}
-
-/**
* gl_hash() - Turn glock number into hash bucket number
* @lock: The glock number
*
@@ -182,7 +153,7 @@ static void glock_free(struct gfs2_glock *gl)
struct gfs2_sbd *sdp = gl->gl_sbd;
struct inode *aspace = gl->gl_aspace;
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ if (sdp->sd_lockstruct.ls_ops->lm_put_lock)
sdp->sd_lockstruct.ls_ops->lm_put_lock(gl->gl_lock);
if (aspace)
@@ -211,17 +182,14 @@ static void gfs2_glock_hold(struct gfs2_glock *gl)
int gfs2_glock_put(struct gfs2_glock *gl)
{
int rv = 0;
- struct gfs2_sbd *sdp = gl->gl_sbd;
write_lock(gl_lock_addr(gl->gl_hash));
if (atomic_dec_and_test(&gl->gl_ref)) {
hlist_del(&gl->gl_list);
write_unlock(gl_lock_addr(gl->gl_hash));
- gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED);
- gfs2_assert(sdp, list_empty(&gl->gl_reclaim));
- gfs2_assert(sdp, list_empty(&gl->gl_holders));
- gfs2_assert(sdp, list_empty(&gl->gl_waiters1));
- gfs2_assert(sdp, list_empty(&gl->gl_waiters3));
+ GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED);
+ GLOCK_BUG_ON(gl, !list_empty(&gl->gl_reclaim));
+ GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
glock_free(gl);
rv = 1;
goto out;
@@ -281,22 +249,401 @@ static struct gfs2_glock *gfs2_glock_find(const struct gfs2_sbd *sdp,
return gl;
}
+/**
+ * may_grant - check if it's OK to grant a new lock
+ * @gl: The glock
+ * @gh: The lock request which we wish to grant
+ *
+ * The request is judged against the glock's current state and against the
+ * entry at the head of gl_holders (gh_head).
+ * NOTE(review): gl_holders.next is used without an emptiness check, so this
+ * presumably relies on callers only calling it with a non-empty list - confirm.
+ *
+ * Returns: true if it's OK to grant the lock
+ */
+
+static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh)
+{
+ const struct gfs2_holder *gh_head = list_entry(gl->gl_holders.next, const struct gfs2_holder, gh_list);
+ if ((gh->gh_state == LM_ST_EXCLUSIVE ||
+ gh_head->gh_state == LM_ST_EXCLUSIVE) && gh != gh_head)
+ return 0; /* exclusive involved: only the head entry itself may be granted */
+ if (gl->gl_state == gh->gh_state)
+ return 1; /* glock is already in exactly the requested state */
+ if (gh->gh_flags & GL_EXACT)
+ return 0; /* GL_EXACT accepts no state other than the requested one */
+ if (gl->gl_state == LM_ST_EXCLUSIVE) {
+ if (gh->gh_state == LM_ST_SHARED && gh_head->gh_state == LM_ST_SHARED)
+ return 1; /* shared request, shared head, glock held exclusive */
+ if (gh->gh_state == LM_ST_DEFERRED && gh_head->gh_state == LM_ST_DEFERRED)
+ return 1; /* deferred request, deferred head, glock held exclusive */
+ }
+ if (gl->gl_state != LM_ST_UNLOCKED && (gh->gh_flags & LM_FLAG_ANY))
+ return 1; /* LM_FLAG_ANY accepts any state other than unlocked */
+ return 0;
+}
+
+static void gfs2_holder_wake(struct gfs2_holder *gh) /* clear HIF_WAIT on gh and wake anyone waiting on that bit */
+{
+ clear_bit(HIF_WAIT, &gh->gh_iflags);
+ smp_mb__after_clear_bit(); /* barrier between clearing the bit and the wake-up below */
+ wake_up_bit(&gh->gh_iflags, HIF_WAIT);
+}
+
+/**
+ * do_promote - promote as many requests as possible on the current queue
+ * @gl: The glock
+ *
+ * Walks gl_holders and marks every entry that may_grant() accepts as a
+ * holder (HIF_HOLDER), waking it. For the entry at the head of the queue the
+ * glock's go_lock operation, if any, is called first; gl->gl_spin is released
+ * around that call and retaken afterwards, so it is expected to be held on
+ * entry. If go_lock fails, the error is stored in gh_error and the entry is
+ * removed from the queue and woken.
+ *
+ * Returns: true if there is a blocked holder at the head of the list
+ */
+
+static int do_promote(struct gfs2_glock *gl)
+{
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+ struct gfs2_holder *gh, *tmp;
+ int ret;
+
+restart:
+ list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
+ if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+ continue; /* already granted */
+ if (may_grant(gl, gh)) {
+ if (gh->gh_list.prev == &gl->gl_holders &&
+ glops->go_lock) { /* gh is the first entry in the queue */
+ spin_unlock(&gl->gl_spin);
+ /* FIXME: eliminate this eventually */
+ ret = glops->go_lock(gh);
+ spin_lock(&gl->gl_spin);
+ if (ret) {
+ gh->gh_error = ret;
+ list_del_init(&gh->gh_list);
+ gfs2_holder_wake(gh);
+ goto restart; /* spinlock was dropped: rescan from the head */
+ }
+ set_bit(HIF_HOLDER, &gh->gh_iflags);
+ gfs2_holder_wake(gh);
+ goto restart; /* spinlock was dropped: rescan from the head */
+ }
+ set_bit(HIF_HOLDER, &gh->gh_iflags);
+ gfs2_holder_wake(gh);
+ continue;
+ }
+ if (gh->gh_list.prev == &gl->gl_holders)
+ return 1; /* the ungrantable request is at the head of the queue */
+ break; /* stop at the first request that cannot be granted */
+ }
+ return 0;
+}
+
+/**
+ * do_error - Something unexpected has happened during a lock request
+ * @gl: The glock whose queued requests are to be failed
+ * @ret: Status value; if LM_OUT_ERROR is set every waiter fails with -EIO
+ *
+ * Entries that already hold the lock (HIF_HOLDER) are skipped. Without
+ * LM_OUT_ERROR only LM_FLAG_TRY / LM_FLAG_TRY_1CB requests are failed, with
+ * GLR_TRYFAILED; other waiters stay queued. Each failed entry is removed
+ * from the queue and woken.
+ */
+
+static inline void do_error(struct gfs2_glock *gl, const int ret)
+{
+ struct gfs2_holder *gh, *tmp;
+
+ list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
+ if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+ continue;
+ if (ret & LM_OUT_ERROR)
+ gh->gh_error = -EIO;
+ else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
+ gh->gh_error = GLR_TRYFAILED;
+ else
+ continue; /* ordinary waiter and no hard error: leave it queued */
+ list_del_init(&gh->gh_list);
+ gfs2_holder_wake(gh);
+ }
+}
+
+/**
+ * find_first_waiter - find the first gh that's waiting for the glock
+ * @gl: the glock
+ *
+ * Returns: the first entry on gl_holders without HIF_HOLDER set, or NULL if
+ * every queued entry is a holder (or the queue is empty)
+ */
+
+static inline struct gfs2_holder *find_first_waiter(const struct gfs2_glock *gl)
+{
+ struct gfs2_holder *gh;
+
+ list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+ if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
+ return gh;
+ }
+ return NULL;
+}
+
+/**
+ * state_change - record that the glock is now in a different state
+ * @gl: the glock
+ * @new_state: the new state
+ *
+ * A glock reference is taken when the glock leaves LM_ST_UNLOCKED and
+ * dropped when it returns to LM_ST_UNLOCKED. gl_tchange is set to the
+ * current jiffies value on every call.
+ */
+
+static void state_change(struct gfs2_glock *gl, unsigned int new_state)
+{
+ int held1, held2;
+
+ held1 = (gl->gl_state != LM_ST_UNLOCKED); /* was locked before the change */
+ held2 = (new_state != LM_ST_UNLOCKED); /* is locked after the change */
+
+ if (held1 != held2) {
+ if (held2)
+ gfs2_glock_hold(gl);
+ else
+ gfs2_glock_put(gl);
+ }
+
+ gl->gl_state = new_state;
+ gl->gl_tchange = jiffies;
+}
+
+static void gfs2_demote_wake(struct gfs2_glock *gl) /* end a demote request and wake anyone waiting on GLF_DEMOTE */
+{
+ gl->gl_demote_state = LM_ST_EXCLUSIVE; /* same value gfs2_glock_get() initialises it to */
+ clear_bit(GLF_DEMOTE, &gl->gl_flags);
+ smp_mb__after_clear_bit(); /* barrier between clearing the bit and the wake-up below */
+ wake_up_bit(&gl->gl_flags, GLF_DEMOTE);
+}
+
+/**
+ * finish_xmote - The DLM has replied to one of our lock requests
+ * @gl: The glock
+ * @ret: The status from the DLM
+ *
+ * Records the state reported in @ret (masked with LM_OUT_ST_MASK) and then
+ * either issues a follow-up request via do_xmote() when that state is not
+ * gl_target, or promotes waiters via do_promote(). Takes gl->gl_spin itself
+ * and drops one glock reference on every path except the -EAGAIN return
+ * from go_xmote_bh.
+ */
+
+static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
+{
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+ struct gfs2_holder *gh;
+ unsigned state = ret & LM_OUT_ST_MASK;
+
+ spin_lock(&gl->gl_spin);
+ state_change(gl, state);
+ gh = find_first_waiter(gl);
+
+ /* Demote to UN request arrived during demote to SH or DF */
+ if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
+ state != LM_ST_UNLOCKED && gl->gl_demote_state == LM_ST_UNLOCKED)
+ gl->gl_target = LM_ST_UNLOCKED;
+
+ /* Check for state != intended state */
+ if (unlikely(state != gl->gl_target)) {
+ if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) {
+ /* move to back of queue and try next entry */
+ if (ret & LM_OUT_CANCELED) {
+ if ((gh->gh_flags & LM_FLAG_PRIORITY) == 0)
+ list_move_tail(&gh->gh_list, &gl->gl_holders);
+ gh = find_first_waiter(gl); /* gh was only moved, so a waiter still exists */
+ gl->gl_target = gh->gh_state;
+ goto retry;
+ }
+ /* Some error or failed "try lock" - report it */
+ if ((ret & LM_OUT_ERROR) ||
+ (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) {
+ gl->gl_target = gl->gl_state;
+ do_error(gl, ret);
+ goto out;
+ }
+ }
+ switch(state) {
+ /* Unlocked due to conversion deadlock, try again */
+ case LM_ST_UNLOCKED:
+retry:
+ do_xmote(gl, gh, gl->gl_target);
+ break;
+ /* Conversion fails, unlock and try again */
+ case LM_ST_SHARED:
+ case LM_ST_DEFERRED:
+ do_xmote(gl, gh, LM_ST_UNLOCKED);
+ break;
+ default: /* Everything else */
+ printk(KERN_ERR "GFS2: wanted %u got %u\n", gl->gl_target, state);
+ GLOCK_BUG_ON(gl, 1);
+ }
+ spin_unlock(&gl->gl_spin); /* GLF_LOCK is not cleared on this path */
+ gfs2_glock_put(gl);
+ return;
+ }
+
+ /* Fast path - we got what we asked for */
+ if (test_and_clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags))
+ gfs2_demote_wake(gl);
+ if (state != LM_ST_UNLOCKED) {
+ if (glops->go_xmote_bh) {
+ int rv;
+ spin_unlock(&gl->gl_spin);
+ rv = glops->go_xmote_bh(gl, gh);
+ if (rv == -EAGAIN)
+ return; /* leaves GLF_LOCK set and keeps the reference; presumably completed later - confirm */
+ spin_lock(&gl->gl_spin);
+ if (rv) {
+ do_error(gl, rv); /* NOTE(review): do_error() tests LM_OUT_* bits but gets go_xmote_bh()'s return code - confirm */
+ goto out;
+ }
+ }
+ do_promote(gl);
+ }
+out:
+ clear_bit(GLF_LOCK, &gl->gl_flags);
+ spin_unlock(&gl->gl_spin);
+ gfs2_glock_put(gl);
+}
+
+static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock, /* forward a state change request to the lock module's lm_lock */
+ unsigned int cur_state, unsigned int req_state,
+ unsigned int flags)
+{
+ int ret = LM_OUT_ERROR; /* returned unchanged when SDF_SHUTDOWN is set */
+
+ if (!sdp->sd_lockstruct.ls_ops->lm_lock)
+ return req_state == LM_ST_UNLOCKED ? 0 : req_state; /* no lm_lock operation: report the request as granted */
+
+ if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
+ req_state, flags);
+ return ret;
+}
+
+/**
+ * do_xmote - Calls the DLM to change the state of a lock
+ * @gl: The glock
+ * @gh: The holder (only for promotes)
+ * @target: The target lock state
+ *
+ * Releases gl->gl_spin while calling the glock operations and the lock
+ * module, and retakes it before returning, so it is expected to be held on
+ * entry. A glock reference is taken before the request is issued. When the
+ * lock module does not answer with LM_OUT_ASYNC the reply is processed at
+ * once through finish_xmote() and the glock work item is queued.
+ */
+
+static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target)
+{
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ unsigned int lck_flags = gh ? gh->gh_flags : 0;
+ int ret;
+
+ lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
+ LM_FLAG_PRIORITY); /* only these holder flags are passed to the lock module */
+ BUG_ON(gl->gl_state == target);
+ BUG_ON(gl->gl_state == gl->gl_target);
+ if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) &&
+ glops->go_inval) {
+ set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
+ do_error(gl, 0); /* Fail queued try locks */
+ }
+ spin_unlock(&gl->gl_spin);
+ if (glops->go_xmote_th)
+ glops->go_xmote_th(gl);
+ if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
+ glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA);
+ clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
+
+ gfs2_glock_hold(gl); /* reference for the request; finish_xmote() drops one */
+ if (target != LM_ST_UNLOCKED && (gl->gl_state == LM_ST_SHARED ||
+ gl->gl_state == LM_ST_DEFERRED) &&
+ !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
+ lck_flags |= LM_FLAG_TRY_1CB; /* conversions from SH/DF are issued as try-1CB requests */
+ ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, target, lck_flags);
+
+ if (!(ret & LM_OUT_ASYNC)) {
+ finish_xmote(gl, ret);
+ gfs2_glock_hold(gl); /* reference for the queued work item */
+ if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+ gfs2_glock_put(gl); /* nothing was queued, so give the reference back */
+ } else {
+ GLOCK_BUG_ON(gl, ret != LM_OUT_ASYNC);
+ }
+ spin_lock(&gl->gl_spin);
+}
+
+/**
+ * find_first_holder - find the first "holder" gh
+ * @gl: the glock
+ *
+ * Only the entry at the head of gl_holders is examined.
+ *
+ * Returns: the head entry if it has HIF_HOLDER set, otherwise NULL (also
+ * when the queue is empty)
+ */
+
+static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
+{
+ struct gfs2_holder *gh;
+
+ if (!list_empty(&gl->gl_holders)) {
+ gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
+ if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+ return gh;
+ }
+ return NULL;
+}
+
+/**
+ * run_queue - do all outstanding tasks related to a glock
+ * @gl: The glock in question
+ * @nonblock: True if we must not block in run_queue
+ *
+ * Does nothing if GLF_LOCK is already set. Otherwise GLF_LOCK is set and
+ * either a requested demote is started or queued requests are promoted; if
+ * a state change is needed do_xmote() is called and GLF_LOCK stays set on
+ * return. With @nonblock set a needed demote is not started here but handed
+ * to the glock workqueue.
+ */
+
+static void run_queue(struct gfs2_glock *gl, const int nonblock)
+{
+ struct gfs2_holder *gh = NULL;
+
+ if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
+ return;
+
+ GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags));
+
+ if (test_bit(GLF_DEMOTE, &gl->gl_flags) &&
+ gl->gl_demote_state != gl->gl_state) {
+ if (find_first_holder(gl))
+ goto out; /* head of the queue is a granted holder: no demote now */
+ if (nonblock)
+ goto out_sched;
+ set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
+ GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE);
+ gl->gl_target = gl->gl_demote_state;
+ } else {
+ if (test_bit(GLF_DEMOTE, &gl->gl_flags))
+ gfs2_demote_wake(gl); /* already in the demote state: request is complete */
+ if (do_promote(gl) == 0)
+ goto out; /* no blocked request at the head of the queue */
+ gh = find_first_waiter(gl);
+ gl->gl_target = gh->gh_state;
+ if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
+ do_error(gl, 0); /* Fail queued try locks */
+ }
+ do_xmote(gl, gh, gl->gl_target);
+ return;
+
+out_sched:
+ gfs2_glock_hold(gl); /* reference for the queued work item */
+ if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+ gfs2_glock_put(gl); /* nothing was queued, so give the reference back */
+out:
+ clear_bit(GLF_LOCK, &gl->gl_flags);
+}
+
static void glock_work_func(struct work_struct *work)
{
+ unsigned long delay = 0; /* jiffies still to wait before a pending demote may proceed */
struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work);
+ if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags))
+ finish_xmote(gl, gl->gl_reply); /* process the reply stored in gl_reply */
spin_lock(&gl->gl_spin);
- if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags))
- set_bit(GLF_DEMOTE, &gl->gl_flags);
- run_queue(gl);
+ if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
+ gl->gl_state != LM_ST_UNLOCKED &&
+ gl->gl_demote_state != LM_ST_EXCLUSIVE) {
+ unsigned long holdtime, now = jiffies;
+ holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; /* earliest time the demote may start */
+ if (time_before(now, holdtime))
+ delay = holdtime - now;
+ set_bit(delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE, &gl->gl_flags); /* stay pending until the hold time has passed */
+ }
+ run_queue(gl, 0);
spin_unlock(&gl->gl_spin);
- gfs2_glock_put(gl);
+ if (!delay ||
+ queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
+ gfs2_glock_put(gl); /* reference is kept only when the work was requeued */
}
static int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
void **lockp)
{
int error = -EIO;
+ if (!sdp->sd_lockstruct.ls_ops->lm_get_lock)
+ return 0;
if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
error = sdp->sd_lockstruct.ls_ops->lm_get_lock(
sdp->sd_lockstruct.ls_lockspace, name, lockp);
@@ -342,12 +689,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
gl->gl_name = name;
atomic_set(&gl->gl_ref, 1);
gl->gl_state = LM_ST_UNLOCKED;
+ gl->gl_target = LM_ST_UNLOCKED;
gl->gl_demote_state = LM_ST_EXCLUSIVE;
gl->gl_hash = hash;
- gl->gl_owner_pid = NULL;
- gl->gl_ip = 0;
gl->gl_ops = glops;
- gl->gl_req_gh = NULL;
gl->gl_stamp = jiffies;
gl->gl_tchange = jiffies;
gl->gl_object = NULL;
@@ -447,13 +792,6 @@ void gfs2_holder_uninit(struct gfs2_holder *gh)
gh->gh_ip = 0;
}
-static void gfs2_holder_wake(struct gfs2_holder *gh)
-{
- clear_bit(HIF_WAIT, &gh->gh_iflags);
- smp_mb__after_clear_bit();
- wake_up_bit(&gh->gh_iflags, HIF_WAIT);
-}
-
static int just_schedule(void *word)
{
schedule();
@@ -466,14 +804,6 @@ static void wait_on_holder(struct gfs2_holder *gh)
wait_on_bit(&gh->gh_iflags, HIF_WAIT, just_schedule, TASK_UNINTERRUPTIBLE);
}
-static void gfs2_demote_wake(struct gfs2_glock *gl)
-{
- gl->gl_demote_state = LM_ST_EXCLUSIVE;
- clear_bit(GLF_DEMOTE, &gl->gl_flags);
- smp_mb__after_clear_bit();
- wake_up_bit(&gl->gl_flags, GLF_DEMOTE);
-}
-
static void wait_on_demote(struct gfs2_glock *gl)
{
might_sleep();
@@ -481,217 +811,6 @@ static void wait_on_demote(struct gfs2_glock *gl)
}
/**
- * rq_mutex - process a mutex request in the queue
- * @gh: the glock holder
- *
- * Returns: 1 if the queue is blocked
- */
-
-static int rq_mutex(struct gfs2_holder *gh)
-{
- struct gfs2_glock *gl = gh->gh_gl;
-
- list_del_init(&gh->gh_list);
- /* gh->gh_error never examined. */
- set_bit(GLF_LOCK, &gl->gl_flags);
- clear_bit(HIF_WAIT, &gh->gh_iflags);
- smp_mb();
- wake_up_bit(&gh->gh_iflags, HIF_WAIT);
-
- return 1;
-}
-
-/**
- * rq_promote - process a promote request in the queue
- * @gh: the glock holder
- *
- * Acquire a new inter-node lock, or change a lock state to more restrictive.
- *
- * Returns: 1 if the queue is blocked
- */
-
-static int rq_promote(struct gfs2_holder *gh)
-{
- struct gfs2_glock *gl = gh->gh_gl;
-
- if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
- if (list_empty(&gl->gl_holders)) {
- gl->gl_req_gh = gh;
- set_bit(GLF_LOCK, &gl->gl_flags);
- spin_unlock(&gl->gl_spin);
- gfs2_glock_xmote_th(gh->gh_gl, gh);
- spin_lock(&gl->gl_spin);
- }
- return 1;
- }
-
- if (list_empty(&gl->gl_holders)) {
- set_bit(HIF_FIRST, &gh->gh_iflags);
- set_bit(GLF_LOCK, &gl->gl_flags);
- } else {
- struct gfs2_holder *next_gh;
- if (gh->gh_state == LM_ST_EXCLUSIVE)
- return 1;
- next_gh = list_entry(gl->gl_holders.next, struct gfs2_holder,
- gh_list);
- if (next_gh->gh_state == LM_ST_EXCLUSIVE)
- return 1;
- }
-
- list_move_tail(&gh->gh_list, &gl->gl_holders);
- gh->gh_error = 0;
- set_bit(HIF_HOLDER, &gh->gh_iflags);
-
- gfs2_holder_wake(gh);
-
- return 0;
-}
-
-/**
- * rq_demote - process a demote request in the queue
- * @gh: the glock holder
- *
- * Returns: 1 if the queue is blocked
- */
-
-static int rq_demote(struct gfs2_glock *gl)
-{
- if (!list_empty(&gl->gl_holders))
- return 1;
-
- if (gl->gl_state == gl->gl_demote_state ||
- gl->gl_state == LM_ST_UNLOCKED) {
- gfs2_demote_wake(gl);
- return 0;
- }
-
- set_bit(GLF_LOCK, &gl->gl_flags);
- set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
-
- if (gl->gl_demote_state == LM_ST_UNLOCKED ||
- gl->gl_state != LM_ST_EXCLUSIVE) {
- spin_unlock(&gl->gl_spin);
- gfs2_glock_drop_th(gl);
- } else {
- spin_unlock(&gl->gl_spin);
- gfs2_glock_xmote_th(gl, NULL);
- }
-
- spin_lock(&gl->gl_spin);
- clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
-
- return 0;
-}
-
-/**
- * run_queue - process holder structures on a glock
- * @gl: the glock
- *
- */
-static void run_queue(struct gfs2_glock *gl)
-{
- struct gfs2_holder *gh;
- int blocked = 1;
-
- for (;;) {
- if (test_bit(GLF_LOCK, &gl->gl_flags))
- break;
-
- if (!list_empty(&gl->gl_waiters1)) {
- gh = list_entry(gl->gl_waiters1.next,
- struct gfs2_holder, gh_list);
- blocked = rq_mutex(gh);
- } else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
- blocked = rq_demote(gl);
- if (test_bit(GLF_WAITERS2, &gl->gl_flags) &&
- !blocked) {
- set_bit(GLF_DEMOTE, &gl->gl_flags);
- gl->gl_demote_state = LM_ST_UNLOCKED;
- }
- clear_bit(GLF_WAITERS2, &gl->gl_flags);
- } else if (!list_empty(&gl->gl_waiters3)) {
- gh = list_entry(gl->gl_waiters3.next,
- struct gfs2_holder, gh_list);
- blocked = rq_promote(gh);
- } else
- break;
-
- if (blocked)
- break;
- }
-}
-
-/**
- * gfs2_glmutex_lock - acquire a local lock on a glock
- * @gl: the glock
- *
- * Gives caller exclusive access to manipulate a glock structure.
- */
-
-static void gfs2_glmutex_lock(struct gfs2_glock *gl)
-{
- spin_lock(&gl->gl_spin);
- if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
- struct gfs2_holder gh;
-
- gfs2_holder_init(gl, 0, 0, &gh);
- set_bit(HIF_WAIT, &gh.gh_iflags);
- list_add_tail(&gh.gh_list, &gl->gl_waiters1);
- spin_unlock(&gl->gl_spin);
- wait_on_holder(&gh);
- gfs2_holder_uninit(&gh);
- } else {
- gl->gl_owner_pid = get_pid(task_pid(current));
- gl->gl_ip = (unsigned long)__builtin_return_address(0);
- spin_unlock(&gl->gl_spin);
- }
-}
-
-/**
- * gfs2_glmutex_trylock - try to acquire a local lock on a glock
- * @gl: the glock
- *
- * Returns: 1 if the glock is acquired
- */
-
-static int gfs2_glmutex_trylock(struct gfs2_glock *gl)
-{
- int acquired = 1;
-
- spin_lock(&gl->gl_spin);
- if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
- acquired = 0;
- } else {
- gl->gl_owner_pid = get_pid(task_pid(current));
- gl->gl_ip = (unsigned long)__builtin_return_address(0);
- }
- spin_unlock(&gl->gl_spin);
-
- return acquired;
-}
-
-/**
- * gfs2_glmutex_unlock - release a local lock on a glock
- * @gl: the glock
- *
- */
-
-static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
-{
- struct pid *pid;
-
- spin_lock(&gl->gl_spin);
- clear_bit(GLF_LOCK, &gl->gl_flags);
- pid = gl->gl_owner_pid;
- gl->gl_owner_pid = NULL;
- gl->gl_ip = 0;
- run_queue(gl);
- spin_unlock(&gl->gl_spin);
-
- put_pid(pid);
-}
-
-/**
* handle_callback - process a demote request
* @gl: the glock
* @state: the state the caller wants us to change to
@@ -705,398 +824,45 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
{
int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE;
- spin_lock(&gl->gl_spin);
set_bit(bit, &gl->gl_flags);
if (gl->gl_demote_state == LM_ST_EXCLUSIVE) {
gl->gl_demote_state = state;
gl->gl_demote_time = jiffies;
if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN &&
- gl->gl_object) {
+ gl->gl_object)
gfs2_glock_schedule_for_reclaim(gl);
- spin_unlock(&gl->gl_spin);
- return;
- }
} else if (gl->gl_demote_state != LM_ST_UNLOCKED &&
gl->gl_demote_state != state) {
- if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags))
- set_bit(GLF_WAITERS2, &gl->gl_flags);
- else
- gl->gl_demote_state = LM_ST_UNLOCKED;
- }
- spin_unlock(&gl->gl_spin);
-}
-
-/**
- * state_change - record that the glock is now in a different state
- * @gl: the glock
- * @new_state the new state
- *
- */
-
-static void state_change(struct gfs2_glock *gl, unsigned int new_state)
-{
- int held1, held2;
-
- held1 = (gl->gl_state != LM_ST_UNLOCKED);
- held2 = (new_state != LM_ST_UNLOCKED);
-
- if (held1 != held2) {
- if (held2)
- gfs2_glock_hold(gl);
- else
- gfs2_glock_put(gl);
+ gl->gl_demote_state = LM_ST_UNLOCKED;
}
-
- gl->gl_state = new_state;
- gl->gl_tchange = jiffies;
}
/**
- * drop_bh - Called after a lock module unlock completes
- * @gl: the glock
- * @ret: the return status
- *
- * Doesn't wake up the process waiting on the struct gfs2_holder (if any)
- * Doesn't drop the reference on the glock the top half took out
- *
- */
-
-static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
-{
- struct gfs2_sbd *sdp = gl->gl_sbd;
- struct gfs2_holder *gh = gl->gl_req_gh;
-
- gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
- gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
- gfs2_assert_warn(sdp, !ret);
-
- state_change(gl, LM_ST_UNLOCKED);
-
- if (test_and_clear_bit(GLF_CONV_DEADLK, &gl->gl_flags)) {
- spin_lock(&gl->gl_spin);
- gh->gh_error = 0;
- spin_unlock(&gl->gl_spin);
- gfs2_glock_xmote_th(gl, gl->gl_req_gh);
- gfs2_glock_put(gl);
- return;
- }
-
- spin_lock(&gl->gl_spin);
- gfs2_demote_wake(gl);
- clear_bit(GLF_LOCK, &gl->gl_flags);
- spin_unlock(&gl->gl_spin);
- gfs2_glock_put(gl);
-}
-
-/**
- * xmote_bh - Called after the lock module is done acquiring a lock
- * @gl: The glock in question
- * @ret: the int returned from the lock module
- *
- */
-
-static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
-{
- struct gfs2_sbd *sdp = gl->gl_sbd;
- const struct gfs2_glock_operations *glops = gl->gl_ops;
- struct gfs2_holder *gh = gl->gl_req_gh;
- int op_done = 1;
-
- if (!gh && (ret & LM_OUT_ST_MASK) == LM_ST_UNLOCKED) {
- drop_bh(gl, ret);
- return;
- }
-
- gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
- gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
- gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC));
-
- state_change(gl, ret & LM_OUT_ST_MASK);
-
- /* Deal with each possible exit condition */
-
- if (!gh) {
- gl->gl_stamp = jiffies;
- if (ret & LM_OUT_CANCELED) {
- op_done = 0;
- } else {
- spin_lock(&gl->gl_spin);
- if (gl->gl_state != gl->gl_demote_state) {
- spin_unlock(&gl->gl_spin);
- gfs2_glock_drop_th(gl);
- gfs2_glock_put(gl);
- return;
- }
- gfs2_demote_wake(gl);
- spin_unlock(&gl->gl_spin);
- }
- } else {
- spin_lock(&gl->gl_spin);
- if (ret & LM_OUT_CONV_DEADLK) {
- gh->gh_error = 0;
- set_bit(GLF_CONV_DEADLK, &gl->gl_flags);
- spin_unlock(&gl->gl_spin);
- gfs2_glock_drop_th(gl);
- gfs2_glock_put(gl);
- return;
- }
- list_del_init(&gh->gh_list);
- gh->gh_error = -EIO;
- if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- goto out;
- gh->gh_error = GLR_CANCELED;
- if (ret & LM_OUT_CANCELED)
- goto out;
- if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
- list_add_tail(&gh->gh_list, &gl->gl_holders);
- gh->gh_error = 0;
- set_bit(HIF_HOLDER, &gh->gh_iflags);
- set_bit(HIF_FIRST, &gh->gh_iflags);
- op_done = 0;
- goto out;
- }
- gh->gh_error = GLR_TRYFAILED;
- if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
- goto out;
- gh->gh_error = -EINVAL;
- if (gfs2_assert_withdraw(sdp, 0) == -1)
- fs_err(sdp, "ret = 0x%.8X\n", ret);
-out:
- spin_unlock(&gl->gl_spin);
- }
-
- if (glops->go_xmote_bh)
- glops->go_xmote_bh(gl);
-
- if (op_done) {
- spin_lock(&gl->gl_spin);
- gl->gl_req_gh = NULL;
- clear_bit(GLF_LOCK, &gl->gl_flags);
- spin_unlock(&gl->gl_spin);
- }
-
- gfs2_glock_put(gl);
-
- if (gh)
- gfs2_holder_wake(gh);
-}
-
-static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
- unsigned int cur_state, unsigned int req_state,
- unsigned int flags)
-{
- int ret = 0;
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
- req_state, flags);
- return ret;
-}
-
-/**
- * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock
- * @gl: The glock in question
- * @state: the requested state
- * @flags: modifier flags to the lock call
- *
- */
-
-static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh)
-{
- struct gfs2_sbd *sdp = gl->gl_sbd;
- int flags = gh ? gh->gh_flags : 0;
- unsigned state = gh ? gh->gh_state : gl->gl_demote_state;
- const struct gfs2_glock_operations *glops = gl->gl_ops;
- int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB |
- LM_FLAG_NOEXP | LM_FLAG_ANY |
- LM_FLAG_PRIORITY);
- unsigned int lck_ret;
-
- if (glops->go_xmote_th)
- glops->go_xmote_th(gl);
- if (state == LM_ST_DEFERRED && glops->go_inval)
- glops->go_inval(gl, DIO_METADATA);
-
- gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
- gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
- gfs2_assert_warn(sdp, state != LM_ST_UNLOCKED);
- gfs2_assert_warn(sdp, state != gl->gl_state);
-
- gfs2_glock_hold(gl);
-
- lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state, lck_flags);
-
- if (gfs2_assert_withdraw(sdp, !(lck_ret & LM_OUT_ERROR)))
- return;
-
- if (lck_ret & LM_OUT_ASYNC)
- gfs2_assert_warn(sdp, lck_ret == LM_OUT_ASYNC);
- else
- xmote_bh(gl, lck_ret);
-}
-
-static unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock,
- unsigned int cur_state)
-{
- int ret = 0;
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- ret = sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state);
- return ret;
-}
-
-/**
- * gfs2_glock_drop_th - call into the lock module to unlock a lock
- * @gl: the glock
- *
- */
-
-static void gfs2_glock_drop_th(struct gfs2_glock *gl)
-{
- struct gfs2_sbd *sdp = gl->gl_sbd;
- const struct gfs2_glock_operations *glops = gl->gl_ops;
- unsigned int ret;
-
- if (glops->go_xmote_th)
- glops->go_xmote_th(gl);
- if (glops->go_inval)
- glops->go_inval(gl, DIO_METADATA);
-
- gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
- gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
- gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED);
-
- gfs2_glock_hold(gl);
-
- ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state);
-
- if (gfs2_assert_withdraw(sdp, !(ret & LM_OUT_ERROR)))
- return;
-
- if (!ret)
- drop_bh(gl, ret);
- else
- gfs2_assert_warn(sdp, ret == LM_OUT_ASYNC);
-}
-
-/**
- * do_cancels - cancel requests for locks stuck waiting on an expire flag
- * @gh: the LM_FLAG_PRIORITY holder waiting to acquire the lock
- *
- * Don't cancel GL_NOCANCEL requests.
- */
-
-static void do_cancels(struct gfs2_holder *gh)
-{
- struct gfs2_glock *gl = gh->gh_gl;
- struct gfs2_sbd *sdp = gl->gl_sbd;
-
- spin_lock(&gl->gl_spin);
-
- while (gl->gl_req_gh != gh &&
- !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
- !list_empty(&gh->gh_list)) {
- if (!(gl->gl_req_gh && (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) {
- spin_unlock(&gl->gl_spin);
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock);
- msleep(100);
- spin_lock(&gl->gl_spin);
- } else {
- spin_unlock(&gl->gl_spin);
- msleep(100);
- spin_lock(&gl->gl_spin);
- }
- }
-
- spin_unlock(&gl->gl_spin);
-}
-
-/**
- * glock_wait_internal - wait on a glock acquisition
+ * gfs2_glock_wait - wait on a glock acquisition
* @gh: the glock holder
*
* Returns: 0 on success
*/
-static int glock_wait_internal(struct gfs2_holder *gh)
+int gfs2_glock_wait(struct gfs2_holder *gh)
{
- struct gfs2_glock *gl = gh->gh_gl;
- struct gfs2_sbd *sdp = gl->gl_sbd;
- const struct gfs2_glock_operations *glops = gl->gl_ops;
-
- if (test_bit(HIF_ABORTED, &gh->gh_iflags))
- return -EIO;
-
- if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
- spin_lock(&gl->gl_spin);
- if (gl->gl_req_gh != gh &&
- !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
- !list_empty(&gh->gh_list)) {
- list_del_init(&gh->gh_list);
- gh->gh_error = GLR_TRYFAILED;
- run_queue(gl);
- spin_unlock(&gl->gl_spin);
- return gh->gh_error;
- }
- spin_unlock(&gl->gl_spin);
- }
-
- if (gh->gh_flags & LM_FLAG_PRIORITY)
- do_cancels(gh);
-
wait_on_holder(gh);
- if (gh->gh_error)
- return gh->gh_error;
-
- gfs2_assert_withdraw(sdp, test_bit(HIF_HOLDER, &gh->gh_iflags));
- gfs2_assert_withdraw(sdp, relaxed_state_ok(gl->gl_state, gh->gh_state,
- gh->gh_flags));
-
- if (test_bit(HIF_FIRST, &gh->gh_iflags)) {
- gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
-
- if (glops->go_lock) {
- gh->gh_error = glops->go_lock(gh);
- if (gh->gh_error) {
- spin_lock(&gl->gl_spin);
- list_del_init(&gh->gh_list);
- spin_unlock(&gl->gl_spin);
- }
- }
-
- spin_lock(&gl->gl_spin);
- gl->gl_req_gh = NULL;
- clear_bit(GLF_LOCK, &gl->gl_flags);
- run_queue(gl);
- spin_unlock(&gl->gl_spin);
- }
-
return gh->gh_error;
}
-static inline struct gfs2_holder *
-find_holder_by_owner(struct list_head *head, struct pid *pid)
-{
- struct gfs2_holder *gh;
-
- list_for_each_entry(gh, head, gh_list) {
- if (gh->gh_owner_pid == pid)
- return gh;
- }
-
- return NULL;
-}
-
-static void print_dbg(struct glock_iter *gi, const char *fmt, ...)
+/**
+ * gfs2_print_dbg - emit formatted debug text to a seq_file or the kernel log
+ * @seq: the seq_file to write to, or NULL to print with printk
+ * @fmt: printf-style format string, followed by its arguments
+ *
+ * With a seq_file the text is first formatted into the string buffer of the
+ * gfs2_glock_iter found in seq->private and then written to @seq. Without
+ * one it is printed at KERN_ERR level.
+ * NOTE(review): vsprintf() does not bound the write into gi->string; the
+ * buffer's size is not visible here - confirm it is large enough.
+ */
+void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
{
va_list args;
va_start(args, fmt);
- if (gi) {
+ if (seq) {
+ struct gfs2_glock_iter *gi = seq->private;
vsprintf(gi->string, fmt, args);
- seq_printf(gi->seq, gi->string);
- }
- else
+ seq_printf(seq, "%s", gi->string); /* already-formatted text is data, never a format string */
+ } else {
+ printk(KERN_ERR " ");
vprintk(fmt, args);
+ }
va_end(args);
}
@@ -1104,50 +870,76 @@ static void print_dbg(struct glock_iter *gi, const char *fmt, ...)
* add_to_queue - Add a holder to the wait queue (but look for recursion)
* @gh: the holder structure to add
*
+ * Eventually we should move the recursive locking trap to a
+ * debugging option or something like that. This is the fast
+ * path and needs to have the minimum number of distractions.
+ *
*/
-static void add_to_queue(struct gfs2_holder *gh)
+static inline void add_to_queue(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
- struct gfs2_holder *existing;
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct list_head *insert_pt = NULL; /* entry before which a priority request is inserted */
+ struct gfs2_holder *gh2;
+ int try_lock = 0; /* set for a try lock queued while GLF_LOCK is set */
BUG_ON(gh->gh_owner_pid == NULL);
if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags))
BUG();
- if (!(gh->gh_flags & GL_FLOCK)) {
- existing = find_holder_by_owner(&gl->gl_holders,
- gh->gh_owner_pid);
- if (existing) {
- print_symbol(KERN_WARNING "original: %s\n",
- existing->gh_ip);
- printk(KERN_INFO "pid : %d\n",
- pid_nr(existing->gh_owner_pid));
- printk(KERN_INFO "lock type : %d lock state : %d\n",
- existing->gh_gl->gl_name.ln_type,
- existing->gh_gl->gl_state);
- print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
- printk(KERN_INFO "pid : %d\n",
- pid_nr(gh->gh_owner_pid));
- printk(KERN_INFO "lock type : %d lock state : %d\n",
- gl->gl_name.ln_type, gl->gl_state);
- BUG();
- }
-
- existing = find_holder_by_owner(&gl->gl_waiters3,
- gh->gh_owner_pid);
- if (existing) {
- print_symbol(KERN_WARNING "original: %s\n",
- existing->gh_ip);
- print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
- BUG();
+ if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
+ if (test_bit(GLF_LOCK, &gl->gl_flags))
+ try_lock = 1;
+ if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
+ goto fail; /* try locks fail at once while an invalidate is in progress */
+ }
+
+ list_for_each_entry(gh2, &gl->gl_holders, gh_list) {
+ if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid &&
+ (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK)))
+ goto trap_recursive; /* same owner is already queued on this glock */
+ if (try_lock &&
+ !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) &&
+ !may_grant(gl, gh)) {
+fail:
+ gh->gh_error = GLR_TRYFAILED; /* gh is never added to the queue on this path */
+ gfs2_holder_wake(gh);
+ return;
}
+ if (test_bit(HIF_HOLDER, &gh2->gh_iflags))
+ continue;
+ if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt))
+ insert_pt = &gh2->gh_list; /* first waiter: a priority request goes in front of it */
+ }
+ if (likely(insert_pt == NULL)) {
+ list_add_tail(&gh->gh_list, &gl->gl_holders);
+ if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY))
+ goto do_cancel;
+ return;
+ }
+ list_add_tail(&gh->gh_list, insert_pt); /* list_add_tail() inserts before insert_pt */
+do_cancel:
+ gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); /* gh now refers to the head of the queue */
+ if (!(gh->gh_flags & LM_FLAG_PRIORITY)) {
+ spin_unlock(&gl->gl_spin); /* gl_spin is dropped around the lm_cancel call */
+ if (sdp->sd_lockstruct.ls_ops->lm_cancel)
+ sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock);
+ spin_lock(&gl->gl_spin);
}
+ return;
- if (gh->gh_flags & LM_FLAG_PRIORITY)
- list_add(&gh->gh_list, &gl->gl_waiters3);
- else
- list_add_tail(&gh->gh_list, &gl->gl_waiters3);
+trap_recursive:
+ print_symbol(KERN_ERR "original: %s\n", gh2->gh_ip);
+ printk(KERN_ERR "pid: %d\n", pid_nr(gh2->gh_owner_pid));
+ printk(KERN_ERR "lock type: %d req lock state : %d\n",
+ gh2->gh_gl->gl_name.ln_type, gh2->gh_state);
+ print_symbol(KERN_ERR "new: %s\n", gh->gh_ip);
+ printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid));
+ printk(KERN_ERR "lock type: %d req lock state : %d\n",
+ gh->gh_gl->gl_name.ln_type, gh->gh_state);
+ __dump_glock(NULL, gl);
+ BUG(); /* recursive locking is treated as fatal */
}
/**
@@ -1165,24 +957,16 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
struct gfs2_sbd *sdp = gl->gl_sbd;
int error = 0;
-restart:
- if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
- set_bit(HIF_ABORTED, &gh->gh_iflags);
+ if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
return -EIO;
- }
spin_lock(&gl->gl_spin);
add_to_queue(gh);
- run_queue(gl);
+ run_queue(gl, 1);
spin_unlock(&gl->gl_spin);
- if (!(gh->gh_flags & GL_ASYNC)) {
- error = glock_wait_internal(gh);
- if (error == GLR_CANCELED) {
- msleep(100);
- goto restart;
- }
- }
+ if (!(gh->gh_flags & GL_ASYNC))
+ error = gfs2_glock_wait(gh);
return error;
}
@@ -1196,48 +980,7 @@ restart:
int gfs2_glock_poll(struct gfs2_holder *gh)
{
- struct gfs2_glock *gl = gh->gh_gl;
- int ready = 0;
-
- spin_lock(&gl->gl_spin);
-
- if (test_bit(HIF_HOLDER, &gh->gh_iflags))
- ready = 1;
- else if (list_empty(&gh->gh_list)) {
- if (gh->gh_error == GLR_CANCELED) {
- spin_unlock(&gl->gl_spin);
- msleep(100);
- if (gfs2_glock_nq(gh))
- return 1;
- return 0;
- } else
- ready = 1;
- }
-
- spin_unlock(&gl->gl_spin);
-
- return ready;
-}
-
-/**
- * gfs2_glock_wait - wait for a lock acquisition that ended in a GLR_ASYNC
- * @gh: the holder structure
- *
- * Returns: 0, GLR_TRYFAILED, or errno on failure
- */
-
-int gfs2_glock_wait(struct gfs2_holder *gh)
-{
- int error;
-
- error = glock_wait_internal(gh);
- if (error == GLR_CANCELED) {
- msleep(100);
- gh->gh_flags &= ~GL_ASYNC;
- error = gfs2_glock_nq(gh);
- }
-
- return error;
+ return test_bit(HIF_WAIT, &gh->gh_iflags) ? 0 : 1; /* 1 once HIF_WAIT is clear; the bit is cleared by gfs2_holder_wake() */
}
/**
@@ -1251,26 +994,30 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
struct gfs2_glock *gl = gh->gh_gl;
const struct gfs2_glock_operations *glops = gl->gl_ops;
unsigned delay = 0;
+ int fast_path = 0;
+ spin_lock(&gl->gl_spin);
if (gh->gh_flags & GL_NOCACHE)
handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
- gfs2_glmutex_lock(gl);
-
- spin_lock(&gl->gl_spin);
list_del_init(&gh->gh_list);
-
- if (list_empty(&gl->gl_holders)) {
+ if (find_first_holder(gl) == NULL) {
if (glops->go_unlock) {
+ GLOCK_BUG_ON(gl, test_and_set_bit(GLF_LOCK, &gl->gl_flags));
spin_unlock(&gl->gl_spin);
glops->go_unlock(gh);
spin_lock(&gl->gl_spin);
+ clear_bit(GLF_LOCK, &gl->gl_flags);
}
gl->gl_stamp = jiffies;
+ if (list_empty(&gl->gl_holders) &&
+ !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
+ !test_bit(GLF_DEMOTE, &gl->gl_flags))
+ fast_path = 1;
}
-
- clear_bit(GLF_LOCK, &gl->gl_flags);
spin_unlock(&gl->gl_spin);
+ if (likely(fast_path))
+ return;
gfs2_glock_hold(gl);
if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
@@ -1454,6 +1201,8 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
static int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp)
{
int error = -EIO;
+ if (!sdp->sd_lockstruct.ls_ops->lm_hold_lvb)
+ return 0;
if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp);
return error;
@@ -1469,20 +1218,14 @@ int gfs2_lvb_hold(struct gfs2_glock *gl)
{
int error;
- gfs2_glmutex_lock(gl);
-
if (!atomic_read(&gl->gl_lvb_count)) {
error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb);
- if (error) {
- gfs2_glmutex_unlock(gl);
+ if (error)
return error;
- }
gfs2_glock_hold(gl);
}
atomic_inc(&gl->gl_lvb_count);
- gfs2_glmutex_unlock(gl);
-
return 0;
}
@@ -1497,17 +1240,13 @@ void gfs2_lvb_unhold(struct gfs2_glock *gl)
struct gfs2_sbd *sdp = gl->gl_sbd;
gfs2_glock_hold(gl);
- gfs2_glmutex_lock(gl);
-
gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0);
if (atomic_dec_and_test(&gl->gl_lvb_count)) {
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ if (sdp->sd_lockstruct.ls_ops->lm_unhold_lvb)
sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(gl->gl_lock, gl->gl_lvb);
gl->gl_lvb = NULL;
gfs2_glock_put(gl);
}
-
- gfs2_glmutex_unlock(gl);
gfs2_glock_put(gl);
}
@@ -1527,7 +1266,9 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
if (time_before(now, holdtime))
delay = holdtime - now;
+ spin_lock(&gl->gl_spin);
handle_callback(gl, state, 1, delay);
+ spin_unlock(&gl->gl_spin);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
gfs2_glock_put(gl);
}
@@ -1568,7 +1309,8 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
gl = gfs2_glock_find(sdp, &async->lc_name);
if (gfs2_assert_warn(sdp, gl))
return;
- xmote_bh(gl, async->lc_ret);
+ gl->gl_reply = async->lc_ret;
+ set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
gfs2_glock_put(gl);
up_read(&gfs2_umount_flush_sem);
@@ -1581,11 +1323,6 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
wake_up_process(sdp->sd_recoverd_process);
return;
- case LM_CB_DROPLOCKS:
- gfs2_gl_hash_clear(sdp, NO_WAIT);
- gfs2_quota_scan(sdp);
- return;
-
default:
gfs2_assert_warn(sdp, 0);
return;
@@ -1646,6 +1383,7 @@ void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
{
struct gfs2_glock *gl;
+ int done_callback = 0;
spin_lock(&sdp->sd_reclaim_lock);
if (list_empty(&sdp->sd_reclaim_list)) {
@@ -1660,14 +1398,16 @@ void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
atomic_dec(&sdp->sd_reclaim_count);
atomic_inc(&sdp->sd_reclaimed);
- if (gfs2_glmutex_trylock(gl)) {
- if (list_empty(&gl->gl_holders) &&
- gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
- handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
- gfs2_glmutex_unlock(gl);
+ spin_lock(&gl->gl_spin);
+ if (find_first_holder(gl) == NULL &&
+ gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) {
+ handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
+ done_callback = 1;
}
-
- gfs2_glock_put(gl);
+ spin_unlock(&gl->gl_spin);
+ if (!done_callback ||
+ queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+ gfs2_glock_put(gl);
}
/**
@@ -1724,18 +1464,14 @@ static void scan_glock(struct gfs2_glock *gl)
{
if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object)
return;
+ if (test_bit(GLF_LOCK, &gl->gl_flags))
+ return;
- if (gfs2_glmutex_trylock(gl)) {
- if (list_empty(&gl->gl_holders) &&
- gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
- goto out_schedule;
- gfs2_glmutex_unlock(gl);
- }
- return;
-
-out_schedule:
- gfs2_glmutex_unlock(gl);
- gfs2_glock_schedule_for_reclaim(gl);
+ spin_lock(&gl->gl_spin);
+ if (find_first_holder(gl) == NULL &&
+ gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
+ gfs2_glock_schedule_for_reclaim(gl);
+ spin_unlock(&gl->gl_spin);
}
/**
@@ -1760,12 +1496,13 @@ static void clear_glock(struct gfs2_glock *gl)
spin_unlock(&sdp->sd_reclaim_lock);
}
- if (gfs2_glmutex_trylock(gl)) {
- if (list_empty(&gl->gl_holders) &&
- gl->gl_state != LM_ST_UNLOCKED)
- handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
- gfs2_glmutex_unlock(gl);
- }
+ spin_lock(&gl->gl_spin);
+ if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED)
+ handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
+ spin_unlock(&gl->gl_spin);
+ gfs2_glock_hold(gl);
+ if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+ gfs2_glock_put(gl);
}
/**
@@ -1773,11 +1510,10 @@ static void clear_glock(struct gfs2_glock *gl)
* @sdp: the filesystem
* @wait: wait until it's all gone
*
- * Called when unmounting the filesystem, or when inter-node lock manager
- * requests DROPLOCKS because it is running out of capacity.
+ * Called when unmounting the filesystem.
*/
-void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
+void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
{
unsigned long t;
unsigned int x;
@@ -1792,7 +1528,7 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
cont = 1;
}
- if (!wait || !cont)
+ if (!cont)
break;
if (time_after_eq(jiffies,
@@ -1810,180 +1546,164 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
}
}
-/*
- * Diagnostic routines to help debug distributed deadlock
- */
-
-static void gfs2_print_symbol(struct glock_iter *gi, const char *fmt,
- unsigned long address)
+static const char *state2str(unsigned state)
{
- char buffer[KSYM_SYMBOL_LEN];
-
- sprint_symbol(buffer, address);
- print_dbg(gi, fmt, buffer);
+ switch(state) {
+ case LM_ST_UNLOCKED:
+ return "UN";
+ case LM_ST_SHARED:
+ return "SH";
+ case LM_ST_DEFERRED:
+ return "DF";
+ case LM_ST_EXCLUSIVE:
+ return "EX";
+ }
+ return "??";
+}
+
+static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
+{
+ char *p = buf;
+ if (flags & LM_FLAG_TRY)
+ *p++ = 't';
+ if (flags & LM_FLAG_TRY_1CB)
+ *p++ = 'T';
+ if (flags & LM_FLAG_NOEXP)
+ *p++ = 'e';
+ if (flags & LM_FLAG_ANY)
+ *p++ = 'a';
+ if (flags & LM_FLAG_PRIORITY)
+ *p++ = 'p';
+ if (flags & GL_ASYNC)
+ *p++ = 'a';
+ if (flags & GL_EXACT)
+ *p++ = 'E';
+ if (flags & GL_ATIME)
+ *p++ = 'a';
+ if (flags & GL_NOCACHE)
+ *p++ = 'c';
+ if (test_bit(HIF_HOLDER, &iflags))
+ *p++ = 'H';
+ if (test_bit(HIF_WAIT, &iflags))
+ *p++ = 'W';
+ if (test_bit(HIF_FIRST, &iflags))
+ *p++ = 'F';
+ *p = 0;
+ return buf;
}
/**
* dump_holder - print information about a glock holder
- * @str: a string naming the type of holder
+ * @seq: the seq_file struct
* @gh: the glock holder
*
* Returns: 0 on success, -ENOBUFS when we run out of space
*/
-static int dump_holder(struct glock_iter *gi, char *str,
- struct gfs2_holder *gh)
+static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
{
- unsigned int x;
- struct task_struct *gh_owner;
+ struct task_struct *gh_owner = NULL;
+ char buffer[KSYM_SYMBOL_LEN];
+ char flags_buf[32];
- print_dbg(gi, " %s\n", str);
- if (gh->gh_owner_pid) {
- print_dbg(gi, " owner = %ld ",
- (long)pid_nr(gh->gh_owner_pid));
+ sprint_symbol(buffer, gh->gh_ip);
+ if (gh->gh_owner_pid)
gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
- if (gh_owner)
- print_dbg(gi, "(%s)\n", gh_owner->comm);
- else
- print_dbg(gi, "(ended)\n");
- } else
- print_dbg(gi, " owner = -1\n");
- print_dbg(gi, " gh_state = %u\n", gh->gh_state);
- print_dbg(gi, " gh_flags =");
- for (x = 0; x < 32; x++)
- if (gh->gh_flags & (1 << x))
- print_dbg(gi, " %u", x);
- print_dbg(gi, " \n");
- print_dbg(gi, " error = %d\n", gh->gh_error);
- print_dbg(gi, " gh_iflags =");
- for (x = 0; x < 32; x++)
- if (test_bit(x, &gh->gh_iflags))
- print_dbg(gi, " %u", x);
- print_dbg(gi, " \n");
- gfs2_print_symbol(gi, " initialized at: %s\n", gh->gh_ip);
-
+ gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %s\n",
+ state2str(gh->gh_state),
+ hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
+ gh->gh_error,
+ gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
+ gh_owner ? gh_owner->comm : "(ended)", buffer);
return 0;
}
-/**
- * dump_inode - print information about an inode
- * @ip: the inode
- *
- * Returns: 0 on success, -ENOBUFS when we run out of space
- */
-
-static int dump_inode(struct glock_iter *gi, struct gfs2_inode *ip)
-{
- unsigned int x;
-
- print_dbg(gi, " Inode:\n");
- print_dbg(gi, " num = %llu/%llu\n",
- (unsigned long long)ip->i_no_formal_ino,
- (unsigned long long)ip->i_no_addr);
- print_dbg(gi, " type = %u\n", IF2DT(ip->i_inode.i_mode));
- print_dbg(gi, " i_flags =");
- for (x = 0; x < 32; x++)
- if (test_bit(x, &ip->i_flags))
- print_dbg(gi, " %u", x);
- print_dbg(gi, " \n");
- return 0;
+static const char *gflags2str(char *buf, const unsigned long *gflags)
+{
+ char *p = buf;
+ if (test_bit(GLF_LOCK, gflags))
+ *p++ = 'l';
+ if (test_bit(GLF_STICKY, gflags))
+ *p++ = 's';
+ if (test_bit(GLF_DEMOTE, gflags))
+ *p++ = 'D';
+ if (test_bit(GLF_PENDING_DEMOTE, gflags))
+ *p++ = 'd';
+ if (test_bit(GLF_DEMOTE_IN_PROGRESS, gflags))
+ *p++ = 'p';
+ if (test_bit(GLF_DIRTY, gflags))
+ *p++ = 'y';
+ if (test_bit(GLF_LFLUSH, gflags))
+ *p++ = 'f';
+ if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags))
+ *p++ = 'i';
+ if (test_bit(GLF_REPLY_PENDING, gflags))
+ *p++ = 'r';
+ *p = 0;
+ return buf;
}
/**
- * dump_glock - print information about a glock
+ * __dump_glock - print information about a glock
+ * @seq: The seq_file struct
* @gl: the glock
- * @count: where we are in the buffer
+ *
+ * The file format is as follows:
+ * One line per object, capital letters are used to indicate objects
+ * G = glock, I = Inode, R = rgrp, H = holder. Glocks are not indented,
+ * other objects are indented by a single space and follow the glock to
+ * which they are related. Fields are indicated by lower case letters
+ * followed by a colon and the field value, except for strings which are in
+ * [] so that it's possible to see if they are composed of spaces for
+ * example. The fields are n = number (id of the object), f = flags,
+ * t = type, s = state, r = refcount, e = error, p = pid.
*
* Returns: 0 on success, -ENOBUFS when we run out of space
*/
-static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl)
+static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
{
- struct gfs2_holder *gh;
- unsigned int x;
- int error = -ENOBUFS;
- struct task_struct *gl_owner;
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+ unsigned long long dtime;
+ const struct gfs2_holder *gh;
+ char gflags_buf[32];
+ int error = 0;
- spin_lock(&gl->gl_spin);
+ dtime = jiffies - gl->gl_demote_time;
+ dtime *= 1000000/HZ; /* demote time in uSec */
+ if (!test_bit(GLF_DEMOTE, &gl->gl_flags))
+ dtime = 0;
+ gfs2_print_dbg(seq, "G: s:%s n:%u/%llu f:%s t:%s d:%s/%llu l:%d a:%d r:%d\n",
+ state2str(gl->gl_state),
+ gl->gl_name.ln_type,
+ (unsigned long long)gl->gl_name.ln_number,
+ gflags2str(gflags_buf, &gl->gl_flags),
+ state2str(gl->gl_target),
+ state2str(gl->gl_demote_state), dtime,
+ atomic_read(&gl->gl_lvb_count),
+ atomic_read(&gl->gl_ail_count),
+ atomic_read(&gl->gl_ref));
- print_dbg(gi, "Glock 0x%p (%u, 0x%llx)\n", gl, gl->gl_name.ln_type,
- (unsigned long long)gl->gl_name.ln_number);
- print_dbg(gi, " gl_flags =");
- for (x = 0; x < 32; x++) {
- if (test_bit(x, &gl->gl_flags))
- print_dbg(gi, " %u", x);
- }
- if (!test_bit(GLF_LOCK, &gl->gl_flags))
- print_dbg(gi, " (unlocked)");
- print_dbg(gi, " \n");
- print_dbg(gi, " gl_ref = %d\n", atomic_read(&gl->gl_ref));
- print_dbg(gi, " gl_state = %u\n", gl->gl_state);
- if (gl->gl_owner_pid) {
- gl_owner = pid_task(gl->gl_owner_pid, PIDTYPE_PID);
- if (gl_owner)
- print_dbg(gi, " gl_owner = pid %d (%s)\n",
- pid_nr(gl->gl_owner_pid), gl_owner->comm);
- else
- print_dbg(gi, " gl_owner = %d (ended)\n",
- pid_nr(gl->gl_owner_pid));
- } else
- print_dbg(gi, " gl_owner = -1\n");
- print_dbg(gi, " gl_ip = %lu\n", gl->gl_ip);
- print_dbg(gi, " req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
- print_dbg(gi, " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
- print_dbg(gi, " object = %s\n", (gl->gl_object) ? "yes" : "no");
- print_dbg(gi, " reclaim = %s\n",
- (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
- if (gl->gl_aspace)
- print_dbg(gi, " aspace = 0x%p nrpages = %lu\n", gl->gl_aspace,
- gl->gl_aspace->i_mapping->nrpages);
- else
- print_dbg(gi, " aspace = no\n");
- print_dbg(gi, " ail = %d\n", atomic_read(&gl->gl_ail_count));
- if (gl->gl_req_gh) {
- error = dump_holder(gi, "Request", gl->gl_req_gh);
- if (error)
- goto out;
- }
list_for_each_entry(gh, &gl->gl_holders, gh_list) {
- error = dump_holder(gi, "Holder", gh);
+ error = dump_holder(seq, gh);
if (error)
goto out;
}
- list_for_each_entry(gh, &gl->gl_waiters1, gh_list) {
- error = dump_holder(gi, "Waiter1", gh);
- if (error)
- goto out;
- }
- list_for_each_entry(gh, &gl->gl_waiters3, gh_list) {
- error = dump_holder(gi, "Waiter3", gh);
- if (error)
- goto out;
- }
- if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
- print_dbg(gi, " Demotion req to state %u (%llu uS ago)\n",
- gl->gl_demote_state, (unsigned long long)
- (jiffies - gl->gl_demote_time)*(1000000/HZ));
- }
- if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) {
- if (!test_bit(GLF_LOCK, &gl->gl_flags) &&
- list_empty(&gl->gl_holders)) {
- error = dump_inode(gi, gl->gl_object);
- if (error)
- goto out;
- } else {
- error = -ENOBUFS;
- print_dbg(gi, " Inode: busy\n");
- }
- }
-
- error = 0;
-
+ if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump)
+ error = glops->go_dump(seq, gl);
out:
- spin_unlock(&gl->gl_spin);
return error;
}
+static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
+{
+ int ret;
+ spin_lock(&gl->gl_spin);
+ ret = __dump_glock(seq, gl);
+ spin_unlock(&gl->gl_spin);
+ return ret;
+}
+
/**
* gfs2_dump_lockstate - print out the current lockstate
* @sdp: the filesystem
@@ -2086,7 +1806,7 @@ void gfs2_glock_exit(void)
module_param(scand_secs, uint, S_IRUGO|S_IWUSR);
MODULE_PARM_DESC(scand_secs, "The number of seconds between scand runs");
-static int gfs2_glock_iter_next(struct glock_iter *gi)
+static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
{
struct gfs2_glock *gl;
@@ -2104,7 +1824,7 @@ restart:
gfs2_glock_put(gl);
if (gl && gi->gl == NULL)
gi->hash++;
- while(gi->gl == NULL) {
+ while (gi->gl == NULL) {
if (gi->hash >= GFS2_GL_HASH_SIZE)
return 1;
read_lock(gl_lock_addr(gi->hash));
@@ -2122,58 +1842,34 @@ restart:
return 0;
}
-static void gfs2_glock_iter_free(struct glock_iter *gi)
+static void gfs2_glock_iter_free(struct gfs2_glock_iter *gi)
{
if (gi->gl)
gfs2_glock_put(gi->gl);
- kfree(gi);
-}
-
-static struct glock_iter *gfs2_glock_iter_init(struct gfs2_sbd *sdp)
-{
- struct glock_iter *gi;
-
- gi = kmalloc(sizeof (*gi), GFP_KERNEL);
- if (!gi)
- return NULL;
-
- gi->sdp = sdp;
- gi->hash = 0;
- gi->seq = NULL;
gi->gl = NULL;
- memset(gi->string, 0, sizeof(gi->string));
-
- if (gfs2_glock_iter_next(gi)) {
- gfs2_glock_iter_free(gi);
- return NULL;
- }
-
- return gi;
}
-static void *gfs2_glock_seq_start(struct seq_file *file, loff_t *pos)
+static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos)
{
- struct glock_iter *gi;
+ struct gfs2_glock_iter *gi = seq->private;
loff_t n = *pos;
- gi = gfs2_glock_iter_init(file->private);
- if (!gi)
- return NULL;
+ gi->hash = 0;
- while(n--) {
+ do {
if (gfs2_glock_iter_next(gi)) {
gfs2_glock_iter_free(gi);
return NULL;
}
- }
+ } while (n--);
- return gi;
+ return gi->gl;
}
-static void *gfs2_glock_seq_next(struct seq_file *file, void *iter_ptr,
+static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr,
loff_t *pos)
{
- struct glock_iter *gi = iter_ptr;
+ struct gfs2_glock_iter *gi = seq->private;
(*pos)++;
@@ -2182,24 +1878,18 @@ static void *gfs2_glock_seq_next(struct seq_file *file, void *iter_ptr,
return NULL;
}
- return gi;
+ return gi->gl;
}
-static void gfs2_glock_seq_stop(struct seq_file *file, void *iter_ptr)
+static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
{
- struct glock_iter *gi = iter_ptr;
- if (gi)
- gfs2_glock_iter_free(gi);
+ struct gfs2_glock_iter *gi = seq->private;
+ gfs2_glock_iter_free(gi);
}
-static int gfs2_glock_seq_show(struct seq_file *file, void *iter_ptr)
+static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
{
- struct glock_iter *gi = iter_ptr;
-
- gi->seq = file;
- dump_glock(gi, gi->gl);
-
- return 0;
+ return dump_glock(seq, iter_ptr);
}
static const struct seq_operations gfs2_glock_seq_ops = {
@@ -2211,17 +1901,14 @@ static const struct seq_operations gfs2_glock_seq_ops = {
static int gfs2_debugfs_open(struct inode *inode, struct file *file)
{
- struct seq_file *seq;
- int ret;
-
- ret = seq_open(file, &gfs2_glock_seq_ops);
- if (ret)
- return ret;
-
- seq = file->private_data;
- seq->private = inode->i_private;
-
- return 0;
+ int ret = seq_open_private(file, &gfs2_glock_seq_ops,
+ sizeof(struct gfs2_glock_iter));
+ if (ret == 0) {
+ struct seq_file *seq = file->private_data;
+ struct gfs2_glock_iter *gi = seq->private;
+ gi->sdp = inode->i_private;
+ }
+ return ret;
}
static const struct file_operations gfs2_debug_fops = {
@@ -2229,7 +1916,7 @@ static const struct file_operations gfs2_debug_fops = {
.open = gfs2_debugfs_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release
+ .release = seq_release_private,
};
int gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index cdad3e6f8150..971d92af70fc 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -26,11 +26,8 @@
#define GL_SKIP 0x00000100
#define GL_ATIME 0x00000200
#define GL_NOCACHE 0x00000400
-#define GL_FLOCK 0x00000800
-#define GL_NOCANCEL 0x00001000
#define GLR_TRYFAILED 13
-#define GLR_CANCELED 14
static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
{
@@ -41,6 +38,8 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *
spin_lock(&gl->gl_spin);
pid = task_pid(current);
list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+ if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
+ break;
if (gh->gh_owner_pid == pid)
goto out;
}
@@ -70,7 +69,7 @@ static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
{
int ret;
spin_lock(&gl->gl_spin);
- ret = test_bit(GLF_DEMOTE, &gl->gl_flags) || !list_empty(&gl->gl_waiters3);
+ ret = test_bit(GLF_DEMOTE, &gl->gl_flags);
spin_unlock(&gl->gl_spin);
return ret;
}
@@ -98,6 +97,7 @@ int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
+void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
/**
* gfs2_glock_nq_init - intialize a holder and enqueue it on a glock
@@ -130,10 +130,9 @@ int gfs2_lvb_hold(struct gfs2_glock *gl);
void gfs2_lvb_unhold(struct gfs2_glock *gl);
void gfs2_glock_cb(void *cb_data, unsigned int type, void *data);
-
void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
-void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait);
+void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
int __init gfs2_glock_init(void);
void gfs2_glock_exit(void);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 07d84d16cda4..c6c318c2a0f6 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -13,6 +13,7 @@
#include <linux/buffer_head.h>
#include <linux/gfs2_ondisk.h>
#include <linux/lm_interface.h>
+#include <linux/bio.h>
#include "gfs2.h"
#include "incore.h"
@@ -172,26 +173,6 @@ static void inode_go_sync(struct gfs2_glock *gl)
}
/**
- * inode_go_xmote_bh - After promoting/demoting a glock
- * @gl: the glock
- *
- */
-
-static void inode_go_xmote_bh(struct gfs2_glock *gl)
-{
- struct gfs2_holder *gh = gl->gl_req_gh;
- struct buffer_head *bh;
- int error;
-
- if (gl->gl_state != LM_ST_UNLOCKED &&
- (!gh || !(gh->gh_flags & GL_SKIP))) {
- error = gfs2_meta_read(gl, gl->gl_name.ln_number, 0, &bh);
- if (!error)
- brelse(bh);
- }
-}
-
-/**
* inode_go_inval - prepare a inode glock to be released
* @gl: the glock
* @flags:
@@ -267,6 +248,26 @@ static int inode_go_lock(struct gfs2_holder *gh)
}
/**
+ * inode_go_dump - print information about an inode
+ * @seq: The iterator
+ * @gl: the glock (its gl_object is the inode to print)
+ *
+ * Returns: 0 (both return paths in this function return 0)
+ */
+
+static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
+{
+ const struct gfs2_inode *ip = gl->gl_object;
+ if (ip == NULL)
+ return 0;
+ gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%08lx\n",
+ (unsigned long long)ip->i_no_formal_ino,
+ (unsigned long long)ip->i_no_addr,
+ IF2DT(ip->i_inode.i_mode), ip->i_flags);
+ return 0;
+}
+
+/**
* rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
* @gl: the glock
*
@@ -306,6 +307,22 @@ static void rgrp_go_unlock(struct gfs2_holder *gh)
}
/**
+ * rgrp_go_dump - print out an rgrp
+ * @seq: The iterator
+ * @gl: The glock in question
+ *
+ */
+
+static int rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
+{
+ const struct gfs2_rgrpd *rgd = gl->gl_object;
+ if (rgd == NULL)
+ return 0;
+ gfs2_print_dbg(seq, " R: n:%llu\n", (unsigned long long)rgd->rd_addr);
+ return 0;
+}
+
+/**
* trans_go_sync - promote/demote the transaction glock
* @gl: the glock
* @state: the requested state
@@ -330,7 +347,7 @@ static void trans_go_sync(struct gfs2_glock *gl)
*
*/
-static void trans_go_xmote_bh(struct gfs2_glock *gl)
+static int trans_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh)
{
struct gfs2_sbd *sdp = gl->gl_sbd;
struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
@@ -338,8 +355,7 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl)
struct gfs2_log_header_host head;
int error;
- if (gl->gl_state != LM_ST_UNLOCKED &&
- test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
+ if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
error = gfs2_find_jhead(sdp->sd_jdesc, &head);
@@ -354,6 +370,7 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl)
gfs2_log_pointers_init(sdp, head.lh_blkno);
}
}
+ return 0;
}
/**
@@ -375,12 +392,12 @@ const struct gfs2_glock_operations gfs2_meta_glops = {
const struct gfs2_glock_operations gfs2_inode_glops = {
.go_xmote_th = inode_go_sync,
- .go_xmote_bh = inode_go_xmote_bh,
.go_inval = inode_go_inval,
.go_demote_ok = inode_go_demote_ok,
.go_lock = inode_go_lock,
+ .go_dump = inode_go_dump,
.go_type = LM_TYPE_INODE,
- .go_min_hold_time = HZ / 10,
+ .go_min_hold_time = HZ / 5,
};
const struct gfs2_glock_operations gfs2_rgrp_glops = {
@@ -389,8 +406,9 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
.go_demote_ok = rgrp_go_demote_ok,
.go_lock = rgrp_go_lock,
.go_unlock = rgrp_go_unlock,
+ .go_dump = rgrp_go_dump,
.go_type = LM_TYPE_RGRP,
- .go_min_hold_time = HZ / 10,
+ .go_min_hold_time = HZ / 5,
};
const struct gfs2_glock_operations gfs2_trans_glops = {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index eabe5eac41da..448697a5c462 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -77,7 +77,6 @@ struct gfs2_rgrp_host {
struct gfs2_rgrpd {
struct list_head rd_list; /* Link with superblock */
struct list_head rd_list_mru;
- struct list_head rd_recent; /* Recently used rgrps */
struct gfs2_glock *rd_gl; /* Glock for this rgrp */
u64 rd_addr; /* grp block disk address */
u64 rd_data0; /* first data location */
@@ -128,20 +127,20 @@ struct gfs2_bufdata {
struct gfs2_glock_operations {
void (*go_xmote_th) (struct gfs2_glock *gl);
- void (*go_xmote_bh) (struct gfs2_glock *gl);
+ int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh);
void (*go_inval) (struct gfs2_glock *gl, int flags);
int (*go_demote_ok) (struct gfs2_glock *gl);
int (*go_lock) (struct gfs2_holder *gh);
void (*go_unlock) (struct gfs2_holder *gh);
+ int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
const int go_type;
const unsigned long go_min_hold_time;
};
enum {
/* States */
- HIF_HOLDER = 6,
+ HIF_HOLDER = 6, /* Set for gh that "holds" the glock */
HIF_FIRST = 7,
- HIF_ABORTED = 9,
HIF_WAIT = 10,
};
@@ -154,20 +153,20 @@ struct gfs2_holder {
unsigned gh_flags;
int gh_error;
- unsigned long gh_iflags;
+ unsigned long gh_iflags; /* HIF_... */
unsigned long gh_ip;
};
enum {
- GLF_LOCK = 1,
- GLF_STICKY = 2,
- GLF_DEMOTE = 3,
- GLF_PENDING_DEMOTE = 4,
- GLF_DIRTY = 5,
- GLF_DEMOTE_IN_PROGRESS = 6,
- GLF_LFLUSH = 7,
- GLF_WAITERS2 = 8,
- GLF_CONV_DEADLK = 9,
+ GLF_LOCK = 1,
+ GLF_STICKY = 2,
+ GLF_DEMOTE = 3,
+ GLF_PENDING_DEMOTE = 4,
+ GLF_DEMOTE_IN_PROGRESS = 5,
+ GLF_DIRTY = 6,
+ GLF_LFLUSH = 7,
+ GLF_INVALIDATE_IN_PROGRESS = 8,
+ GLF_REPLY_PENDING = 9,
};
struct gfs2_glock {
@@ -179,19 +178,14 @@ struct gfs2_glock {
spinlock_t gl_spin;
unsigned int gl_state;
+ unsigned int gl_target;
+ unsigned int gl_reply;
unsigned int gl_hash;
unsigned int gl_demote_state; /* state requested by remote node */
unsigned long gl_demote_time; /* time of first demote request */
- struct pid *gl_owner_pid;
- unsigned long gl_ip;
struct list_head gl_holders;
- struct list_head gl_waiters1; /* HIF_MUTEX */
- struct list_head gl_waiters3; /* HIF_PROMOTE */
const struct gfs2_glock_operations *gl_ops;
-
- struct gfs2_holder *gl_req_gh;
-
void *gl_lock;
char *gl_lvb;
atomic_t gl_lvb_count;
@@ -427,7 +421,6 @@ struct gfs2_tune {
unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
unsigned int gt_atime_quantum; /* Min secs between atime updates */
unsigned int gt_new_files_jdata;
- unsigned int gt_new_files_directio;
unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
unsigned int gt_stall_secs; /* Detects trouble! */
unsigned int gt_complain_secs;
@@ -534,7 +527,6 @@ struct gfs2_sbd {
struct mutex sd_rindex_mutex;
struct list_head sd_rindex_list;
struct list_head sd_rindex_mru_list;
- struct list_head sd_rindex_recent_list;
struct gfs2_rgrpd *sd_rindex_forward;
unsigned int sd_rgrps;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 09453d057e41..6da0ab355b8a 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -504,7 +504,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
}
if (!is_root) {
- error = permission(dir, MAY_EXEC, NULL);
+ error = gfs2_permission(dir, MAY_EXEC);
if (error)
goto out;
}
@@ -667,7 +667,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
{
int error;
- error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL);
+ error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
if (error)
return error;
@@ -789,13 +789,8 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) ||
gfs2_tune_get(sdp, gt_new_files_jdata))
di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
- if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_DIRECTIO) ||
- gfs2_tune_get(sdp, gt_new_files_directio))
- di->di_flags |= cpu_to_be32(GFS2_DIF_DIRECTIO);
} else if (S_ISDIR(mode)) {
di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
- GFS2_DIF_INHERIT_DIRECTIO);
- di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
GFS2_DIF_INHERIT_JDATA);
}
@@ -1134,7 +1129,7 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
if (IS_APPEND(&dip->i_inode))
return -EPERM;
- error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL);
+ error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
if (error)
return error;
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 580da454b38f..6074c2506f75 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -72,7 +72,6 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
}
-void gfs2_inode_attr_in(struct gfs2_inode *ip);
void gfs2_set_iop(struct inode *inode);
struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
u64 no_addr, u64 no_formal_ino,
@@ -91,6 +90,7 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
struct gfs2_inode *ip);
int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
const struct gfs2_inode *ip);
+int gfs2_permission(struct inode *inode, int mask);
int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to);
int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
int gfs2_glock_nq_atime(struct gfs2_holder *gh);
diff --git a/fs/gfs2/locking.c b/fs/gfs2/locking.c
index 663fee728783..523243a13a21 100644
--- a/fs/gfs2/locking.c
+++ b/fs/gfs2/locking.c
@@ -23,12 +23,54 @@ struct lmh_wrapper {
const struct lm_lockops *lw_ops;
};
+static int nolock_mount(char *table_name, char *host_data,
+ lm_callback_t cb, void *cb_data,
+ unsigned int min_lvb_size, int flags,
+ struct lm_lockstruct *lockstruct,
+ struct kobject *fskobj);
+
/* List of registered low-level locking protocols. A file system selects one
of them by name at mount time, e.g. lock_nolock, lock_dlm. */
+static const struct lm_lockops nolock_ops = {
+ .lm_proto_name = "lock_nolock",
+ .lm_mount = nolock_mount,
+};
+
+static struct lmh_wrapper nolock_proto = {
+ .lw_list = LIST_HEAD_INIT(nolock_proto.lw_list),
+ .lw_ops = &nolock_ops,
+};
+
static LIST_HEAD(lmh_list);
static DEFINE_MUTEX(lmh_lock);
+static int nolock_mount(char *table_name, char *host_data,
+ lm_callback_t cb, void *cb_data,
+ unsigned int min_lvb_size, int flags,
+ struct lm_lockstruct *lockstruct,
+ struct kobject *fskobj)
+{
+ char *c;
+ unsigned int jid;
+
+ c = strstr(host_data, "jid=");
+ if (!c)
+ jid = 0;
+ else {
+ c += 4;
+ sscanf(c, "%u", &jid);
+ }
+
+ lockstruct->ls_jid = jid;
+ lockstruct->ls_first = 1;
+ lockstruct->ls_lvb_size = min_lvb_size;
+ lockstruct->ls_ops = &nolock_ops;
+ lockstruct->ls_flags = LM_LSFLAG_LOCAL;
+
+ return 0;
+}
+
/**
* gfs2_register_lockproto - Register a low-level locking protocol
* @proto: the protocol definition
@@ -116,9 +158,13 @@ int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data,
int try = 0;
int error, found;
+
retry:
mutex_lock(&lmh_lock);
+ if (list_empty(&nolock_proto.lw_list))
+ list_add(&nolock_proto.lw_list, &lmh_list);
+
found = 0;
list_for_each_entry(lw, &lmh_list, lw_list) {
if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) {
@@ -139,7 +185,8 @@ retry:
goto out;
}
- if (!try_module_get(lw->lw_ops->lm_owner)) {
+ if (lw->lw_ops->lm_owner &&
+ !try_module_get(lw->lw_ops->lm_owner)) {
try = 0;
mutex_unlock(&lmh_lock);
msleep(1000);
@@ -158,7 +205,8 @@ out:
void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct)
{
mutex_lock(&lmh_lock);
- lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace);
+ if (lockstruct->ls_ops->lm_unmount)
+ lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace);
if (lockstruct->ls_ops->lm_owner)
module_put(lockstruct->ls_ops->lm_owner);
mutex_unlock(&lmh_lock);
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
index cf7ea8abec87..2482c9047505 100644
--- a/fs/gfs2/locking/dlm/lock.c
+++ b/fs/gfs2/locking/dlm/lock.c
@@ -11,46 +11,60 @@
static char junk_lvb[GDLM_LVB_SIZE];
-static void queue_complete(struct gdlm_lock *lp)
+
+/* convert dlm lock-mode to gfs lock-state */
+
+static s16 gdlm_make_lmstate(s16 dlmmode)
{
- struct gdlm_ls *ls = lp->ls;
+ switch (dlmmode) {
+ case DLM_LOCK_IV:
+ case DLM_LOCK_NL:
+ return LM_ST_UNLOCKED;
+ case DLM_LOCK_EX:
+ return LM_ST_EXCLUSIVE;
+ case DLM_LOCK_CW:
+ return LM_ST_DEFERRED;
+ case DLM_LOCK_PR:
+ return LM_ST_SHARED;
+ }
+ gdlm_assert(0, "unknown DLM mode %d", dlmmode);
+ return -1;
+}
- clear_bit(LFL_ACTIVE, &lp->flags);
+/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm
+ thread gets to it. */
+
+static void queue_submit(struct gdlm_lock *lp)
+{
+ struct gdlm_ls *ls = lp->ls;
spin_lock(&ls->async_lock);
- list_add_tail(&lp->clist, &ls->complete);
+ list_add_tail(&lp->delay_list, &ls->submit);
spin_unlock(&ls->async_lock);
wake_up(&ls->thread_wait);
}
-static inline void gdlm_ast(void *astarg)
+static void wake_up_ast(struct gdlm_lock *lp)
{
- queue_complete(astarg);
+ clear_bit(LFL_AST_WAIT, &lp->flags);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&lp->flags, LFL_AST_WAIT);
}
-static inline void gdlm_bast(void *astarg, int mode)
+static void gdlm_delete_lp(struct gdlm_lock *lp)
{
- struct gdlm_lock *lp = astarg;
struct gdlm_ls *ls = lp->ls;
- if (!mode) {
- printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n",
- lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number);
- return;
- }
-
spin_lock(&ls->async_lock);
- if (!lp->bast_mode) {
- list_add_tail(&lp->blist, &ls->blocking);
- lp->bast_mode = mode;
- } else if (lp->bast_mode < mode)
- lp->bast_mode = mode;
+ if (!list_empty(&lp->delay_list))
+ list_del_init(&lp->delay_list);
+ ls->all_locks_count--;
spin_unlock(&ls->async_lock);
- wake_up(&ls->thread_wait);
+
+ kfree(lp);
}
-void gdlm_queue_delayed(struct gdlm_lock *lp)
+static void gdlm_queue_delayed(struct gdlm_lock *lp)
{
struct gdlm_ls *ls = lp->ls;
@@ -59,6 +73,236 @@ void gdlm_queue_delayed(struct gdlm_lock *lp)
spin_unlock(&ls->async_lock);
}
+static void process_complete(struct gdlm_lock *lp)
+{
+ struct gdlm_ls *ls = lp->ls;
+ struct lm_async_cb acb;
+
+ memset(&acb, 0, sizeof(acb));
+
+ if (lp->lksb.sb_status == -DLM_ECANCEL) {
+ log_info("complete dlm cancel %x,%llx flags %lx",
+ lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number,
+ lp->flags);
+
+ lp->req = lp->cur;
+ acb.lc_ret |= LM_OUT_CANCELED;
+ if (lp->cur == DLM_LOCK_IV)
+ lp->lksb.sb_lkid = 0;
+ goto out;
+ }
+
+ if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
+ if (lp->lksb.sb_status != -DLM_EUNLOCK) {
+ log_info("unlock sb_status %d %x,%llx flags %lx",
+ lp->lksb.sb_status, lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number,
+ lp->flags);
+ return;
+ }
+
+ lp->cur = DLM_LOCK_IV;
+ lp->req = DLM_LOCK_IV;
+ lp->lksb.sb_lkid = 0;
+
+ if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) {
+ gdlm_delete_lp(lp);
+ return;
+ }
+ goto out;
+ }
+
+ if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID)
+ memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
+
+ if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) {
+ if (lp->req == DLM_LOCK_PR)
+ lp->req = DLM_LOCK_CW;
+ else if (lp->req == DLM_LOCK_CW)
+ lp->req = DLM_LOCK_PR;
+ }
+
+ /*
+ * A canceled lock request. The lock was just taken off the delayed
+ * list and was never even submitted to dlm.
+ */
+
+ if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
+ log_info("complete internal cancel %x,%llx",
+ lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number);
+ lp->req = lp->cur;
+ acb.lc_ret |= LM_OUT_CANCELED;
+ goto out;
+ }
+
+ /*
+ * An error occurred.
+ */
+
+ if (lp->lksb.sb_status) {
+ /* a "normal" error */
+ if ((lp->lksb.sb_status == -EAGAIN) &&
+ (lp->lkf & DLM_LKF_NOQUEUE)) {
+ lp->req = lp->cur;
+ if (lp->cur == DLM_LOCK_IV)
+ lp->lksb.sb_lkid = 0;
+ goto out;
+ }
+
+ /* this could only happen with cancels I think */
+ log_info("ast sb_status %d %x,%llx flags %lx",
+ lp->lksb.sb_status, lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number,
+ lp->flags);
+ return;
+ }
+
+ /*
+ * This is an AST for an EX->EX conversion for sync_lvb from GFS.
+ */
+
+ if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
+ wake_up_ast(lp);
+ return;
+ }
+
+ /*
+ * A lock has been demoted to NL because it initially completed during
+ * BLOCK_LOCKS. Now it must be requested in the originally requested
+ * mode.
+ */
+
+ if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
+ gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx",
+ lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number);
+ gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx",
+ lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number);
+
+ lp->cur = DLM_LOCK_NL;
+ lp->req = lp->prev_req;
+ lp->prev_req = DLM_LOCK_IV;
+ lp->lkf &= ~DLM_LKF_CONVDEADLK;
+
+ set_bit(LFL_NOCACHE, &lp->flags);
+
+ if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
+ !test_bit(LFL_NOBLOCK, &lp->flags))
+ gdlm_queue_delayed(lp);
+ else
+ queue_submit(lp);
+ return;
+ }
+
+ /*
+ * A request is granted during dlm recovery. It may be granted
+ * because the locks of a failed node were cleared. In that case,
+ * there may be inconsistent data beneath this lock and we must wait
+ * for recovery to complete to use it. When gfs recovery is done this
+ * granted lock will be converted to NL and then reacquired in this
+ * granted state.
+ */
+
+ if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
+ !test_bit(LFL_NOBLOCK, &lp->flags) &&
+ lp->req != DLM_LOCK_NL) {
+
+ lp->cur = lp->req;
+ lp->prev_req = lp->req;
+ lp->req = DLM_LOCK_NL;
+ lp->lkf |= DLM_LKF_CONVERT;
+ lp->lkf &= ~DLM_LKF_CONVDEADLK;
+
+ log_debug("rereq %x,%llx id %x %d,%d",
+ lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number,
+ lp->lksb.sb_lkid, lp->cur, lp->req);
+
+ set_bit(LFL_REREQUEST, &lp->flags);
+ queue_submit(lp);
+ return;
+ }
+
+ /*
+ * DLM demoted the lock to NL before it was granted so GFS must be
+ * told it cannot cache data for this lock.
+ */
+
+ if (lp->lksb.sb_flags & DLM_SBF_DEMOTED)
+ set_bit(LFL_NOCACHE, &lp->flags);
+
+out:
+ /*
+ * This is an internal lock_dlm lock
+ */
+
+ if (test_bit(LFL_INLOCK, &lp->flags)) {
+ clear_bit(LFL_NOBLOCK, &lp->flags);
+ lp->cur = lp->req;
+ wake_up_ast(lp);
+ return;
+ }
+
+ /*
+ * Normal completion of a lock request. Tell GFS it now has the lock.
+ */
+
+ clear_bit(LFL_NOBLOCK, &lp->flags);
+ lp->cur = lp->req;
+
+ acb.lc_name = lp->lockname;
+ acb.lc_ret |= gdlm_make_lmstate(lp->cur);
+
+ ls->fscb(ls->sdp, LM_CB_ASYNC, &acb);
+}
+
+static void gdlm_ast(void *astarg)
+{
+ struct gdlm_lock *lp = astarg;
+ clear_bit(LFL_ACTIVE, &lp->flags);
+ process_complete(lp);
+}
+
+static void process_blocking(struct gdlm_lock *lp, int bast_mode)
+{
+ struct gdlm_ls *ls = lp->ls;
+ unsigned int cb = 0;
+
+ switch (gdlm_make_lmstate(bast_mode)) {
+ case LM_ST_EXCLUSIVE:
+ cb = LM_CB_NEED_E;
+ break;
+ case LM_ST_DEFERRED:
+ cb = LM_CB_NEED_D;
+ break;
+ case LM_ST_SHARED:
+ cb = LM_CB_NEED_S;
+ break;
+ default:
+ gdlm_assert(0, "unknown bast mode %u", bast_mode);
+ }
+
+ ls->fscb(ls->sdp, cb, &lp->lockname);
+}
+
+
+static void gdlm_bast(void *astarg, int mode)
+{
+ struct gdlm_lock *lp = astarg;
+
+ if (!mode) {
+ printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n",
+ lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number);
+ return;
+ }
+
+ process_blocking(lp, mode);
+}
+
/* convert gfs lock-state to dlm lock-mode */
static s16 make_mode(s16 lmstate)
@@ -77,24 +321,6 @@ static s16 make_mode(s16 lmstate)
return -1;
}
-/* convert dlm lock-mode to gfs lock-state */
-
-s16 gdlm_make_lmstate(s16 dlmmode)
-{
- switch (dlmmode) {
- case DLM_LOCK_IV:
- case DLM_LOCK_NL:
- return LM_ST_UNLOCKED;
- case DLM_LOCK_EX:
- return LM_ST_EXCLUSIVE;
- case DLM_LOCK_CW:
- return LM_ST_DEFERRED;
- case DLM_LOCK_PR:
- return LM_ST_SHARED;
- }
- gdlm_assert(0, "unknown DLM mode %d", dlmmode);
- return -1;
-}
/* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and
DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. */
@@ -134,14 +360,6 @@ static inline unsigned int make_flags(struct gdlm_lock *lp,
if (lp->lksb.sb_lkid != 0) {
lkf |= DLM_LKF_CONVERT;
-
- /* Conversion deadlock avoidance by DLM */
-
- if (!(lp->ls->fsflags & LM_MFLAG_CONV_NODROP) &&
- !test_bit(LFL_FORCE_PROMOTE, &lp->flags) &&
- !(lkf & DLM_LKF_NOQUEUE) &&
- cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req)
- lkf |= DLM_LKF_CONVDEADLK;
}
if (lp->lvb)
@@ -173,14 +391,9 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
make_strname(name, &lp->strname);
lp->ls = ls;
lp->cur = DLM_LOCK_IV;
- lp->lvb = NULL;
- lp->hold_null = NULL;
- INIT_LIST_HEAD(&lp->clist);
- INIT_LIST_HEAD(&lp->blist);
INIT_LIST_HEAD(&lp->delay_list);
spin_lock(&ls->async_lock);
- list_add(&lp->all_list, &ls->all_locks);
ls->all_locks_count++;
spin_unlock(&ls->async_lock);
@@ -188,26 +401,6 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
return 0;
}
-void gdlm_delete_lp(struct gdlm_lock *lp)
-{
- struct gdlm_ls *ls = lp->ls;
-
- spin_lock(&ls->async_lock);
- if (!list_empty(&lp->clist))
- list_del_init(&lp->clist);
- if (!list_empty(&lp->blist))
- list_del_init(&lp->blist);
- if (!list_empty(&lp->delay_list))
- list_del_init(&lp->delay_list);
- gdlm_assert(!list_empty(&lp->all_list), "%x,%llx", lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number);
- list_del_init(&lp->all_list);
- ls->all_locks_count--;
- spin_unlock(&ls->async_lock);
-
- kfree(lp);
-}
-
int gdlm_get_lock(void *lockspace, struct lm_lockname *name,
void **lockp)
{
@@ -261,7 +454,7 @@ unsigned int gdlm_do_lock(struct gdlm_lock *lp)
if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) {
lp->lksb.sb_status = -EAGAIN;
- queue_complete(lp);
+ gdlm_ast(lp);
error = 0;
}
@@ -308,6 +501,10 @@ unsigned int gdlm_lock(void *lock, unsigned int cur_state,
{
struct gdlm_lock *lp = lock;
+ /* A request for LM_ST_UNLOCKED is delegated to gdlm_unlock(). */
+ if (req_state == LM_ST_UNLOCKED)
+ return gdlm_unlock(lock, cur_state);
+
clear_bit(LFL_DLM_CANCEL, &lp->flags);
if (flags & LM_FLAG_NOEXP)
set_bit(LFL_NOBLOCK, &lp->flags);
@@ -351,7 +548,7 @@ void gdlm_cancel(void *lock)
if (delay_list) {
set_bit(LFL_CANCEL, &lp->flags);
set_bit(LFL_ACTIVE, &lp->flags);
- queue_complete(lp);
+ gdlm_ast(lp);
return;
}
@@ -507,22 +704,3 @@ void gdlm_submit_delayed(struct gdlm_ls *ls)
wake_up(&ls->thread_wait);
}
-int gdlm_release_all_locks(struct gdlm_ls *ls)
-{
- struct gdlm_lock *lp, *safe;
- int count = 0;
-
- spin_lock(&ls->async_lock);
- list_for_each_entry_safe(lp, safe, &ls->all_locks, all_list) {
- list_del_init(&lp->all_list);
-
- if (lp->lvb && lp->lvb != junk_lvb)
- kfree(lp->lvb);
- kfree(lp);
- count++;
- }
- spin_unlock(&ls->async_lock);
-
- return count;
-}
-
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
index a243cf69c54e..3c98e7c6f93b 100644
--- a/fs/gfs2/locking/dlm/lock_dlm.h
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -72,19 +72,12 @@ struct gdlm_ls {
int recover_jid_done;
int recover_jid_status;
spinlock_t async_lock;
- struct list_head complete;
- struct list_head blocking;
struct list_head delayed;
struct list_head submit;
- struct list_head all_locks;
u32 all_locks_count;
wait_queue_head_t wait_control;
- struct task_struct *thread1;
- struct task_struct *thread2;
+ struct task_struct *thread;
wait_queue_head_t thread_wait;
- unsigned long drop_time;
- int drop_locks_count;
- int drop_locks_period;
};
enum {
@@ -117,12 +110,7 @@ struct gdlm_lock {
u32 lkf; /* dlm flags DLM_LKF_ */
unsigned long flags; /* lock_dlm flags LFL_ */
- int bast_mode; /* protected by async_lock */
-
- struct list_head clist; /* complete */
- struct list_head blist; /* blocking */
struct list_head delay_list; /* delayed */
- struct list_head all_list; /* all locks for the fs */
struct gdlm_lock *hold_null; /* NL lock for hold_lvb */
};
@@ -159,11 +147,7 @@ void gdlm_release_threads(struct gdlm_ls *);
/* lock.c */
-s16 gdlm_make_lmstate(s16);
-void gdlm_queue_delayed(struct gdlm_lock *);
void gdlm_submit_delayed(struct gdlm_ls *);
-int gdlm_release_all_locks(struct gdlm_ls *);
-void gdlm_delete_lp(struct gdlm_lock *);
unsigned int gdlm_do_lock(struct gdlm_lock *);
int gdlm_get_lock(void *, struct lm_lockname *, void **);
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index 470bdf650b50..09d78c216f48 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -22,22 +22,14 @@ static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp,
if (!ls)
return NULL;
- ls->drop_locks_count = GDLM_DROP_COUNT;
- ls->drop_locks_period = GDLM_DROP_PERIOD;
ls->fscb = cb;
ls->sdp = sdp;
ls->fsflags = flags;
spin_lock_init(&ls->async_lock);
- INIT_LIST_HEAD(&ls->complete);
- INIT_LIST_HEAD(&ls->blocking);
INIT_LIST_HEAD(&ls->delayed);
INIT_LIST_HEAD(&ls->submit);
- INIT_LIST_HEAD(&ls->all_locks);
init_waitqueue_head(&ls->thread_wait);
init_waitqueue_head(&ls->wait_control);
- ls->thread1 = NULL;
- ls->thread2 = NULL;
- ls->drop_time = jiffies;
ls->jid = -1;
strncpy(buf, table_name, 256);
@@ -180,7 +172,6 @@ out:
static void gdlm_unmount(void *lockspace)
{
struct gdlm_ls *ls = lockspace;
- int rv;
log_debug("unmount flags %lx", ls->flags);
@@ -194,9 +185,7 @@ static void gdlm_unmount(void *lockspace)
gdlm_kobject_release(ls);
dlm_release_lockspace(ls->dlm_lockspace, 2);
gdlm_release_threads(ls);
- rv = gdlm_release_all_locks(ls);
- if (rv)
- log_info("gdlm_unmount: %d stray locks freed", rv);
+ BUG_ON(ls->all_locks_count);
out:
kfree(ls);
}
@@ -232,7 +221,6 @@ static void gdlm_withdraw(void *lockspace)
dlm_release_lockspace(ls->dlm_lockspace, 2);
gdlm_release_threads(ls);
- gdlm_release_all_locks(ls);
gdlm_kobject_release(ls);
}
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index a4ff271df9ee..4ec571c3d8a9 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -114,17 +114,6 @@ static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf)
return sprintf(buf, "%d\n", ls->recover_jid_status);
}
-static ssize_t drop_count_show(struct gdlm_ls *ls, char *buf)
-{
- return sprintf(buf, "%d\n", ls->drop_locks_count);
-}
-
-static ssize_t drop_count_store(struct gdlm_ls *ls, const char *buf, size_t len)
-{
- ls->drop_locks_count = simple_strtol(buf, NULL, 0);
- return len;
-}
-
struct gdlm_attr {
struct attribute attr;
ssize_t (*show)(struct gdlm_ls *, char *);
@@ -144,7 +133,6 @@ GDLM_ATTR(first_done, 0444, first_done_show, NULL);
GDLM_ATTR(recover, 0644, recover_show, recover_store);
GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
-GDLM_ATTR(drop_count, 0644, drop_count_show, drop_count_store);
static struct attribute *gdlm_attrs[] = {
&gdlm_attr_proto_name.attr,
@@ -157,7 +145,6 @@ static struct attribute *gdlm_attrs[] = {
&gdlm_attr_recover.attr,
&gdlm_attr_recover_done.attr,
&gdlm_attr_recover_status.attr,
- &gdlm_attr_drop_count.attr,
NULL,
};
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
index e53db6fd28ab..38823efd698c 100644
--- a/fs/gfs2/locking/dlm/thread.c
+++ b/fs/gfs2/locking/dlm/thread.c
@@ -9,367 +9,60 @@
#include "lock_dlm.h"
-/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm
- thread gets to it. */
-
-static void queue_submit(struct gdlm_lock *lp)
-{
- struct gdlm_ls *ls = lp->ls;
-
- spin_lock(&ls->async_lock);
- list_add_tail(&lp->delay_list, &ls->submit);
- spin_unlock(&ls->async_lock);
- wake_up(&ls->thread_wait);
-}
-
-static void process_blocking(struct gdlm_lock *lp, int bast_mode)
-{
- struct gdlm_ls *ls = lp->ls;
- unsigned int cb = 0;
-
- switch (gdlm_make_lmstate(bast_mode)) {
- case LM_ST_EXCLUSIVE:
- cb = LM_CB_NEED_E;
- break;
- case LM_ST_DEFERRED:
- cb = LM_CB_NEED_D;
- break;
- case LM_ST_SHARED:
- cb = LM_CB_NEED_S;
- break;
- default:
- gdlm_assert(0, "unknown bast mode %u", lp->bast_mode);
- }
-
- ls->fscb(ls->sdp, cb, &lp->lockname);
-}
-
-static void wake_up_ast(struct gdlm_lock *lp)
-{
- clear_bit(LFL_AST_WAIT, &lp->flags);
- smp_mb__after_clear_bit();
- wake_up_bit(&lp->flags, LFL_AST_WAIT);
-}
-
-static void process_complete(struct gdlm_lock *lp)
-{
- struct gdlm_ls *ls = lp->ls;
- struct lm_async_cb acb;
- s16 prev_mode = lp->cur;
-
- memset(&acb, 0, sizeof(acb));
-
- if (lp->lksb.sb_status == -DLM_ECANCEL) {
- log_info("complete dlm cancel %x,%llx flags %lx",
- lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number,
- lp->flags);
-
- lp->req = lp->cur;
- acb.lc_ret |= LM_OUT_CANCELED;
- if (lp->cur == DLM_LOCK_IV)
- lp->lksb.sb_lkid = 0;
- goto out;
- }
-
- if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
- if (lp->lksb.sb_status != -DLM_EUNLOCK) {
- log_info("unlock sb_status %d %x,%llx flags %lx",
- lp->lksb.sb_status, lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number,
- lp->flags);
- return;
- }
-
- lp->cur = DLM_LOCK_IV;
- lp->req = DLM_LOCK_IV;
- lp->lksb.sb_lkid = 0;
-
- if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) {
- gdlm_delete_lp(lp);
- return;
- }
- goto out;
- }
-
- if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID)
- memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
-
- if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) {
- if (lp->req == DLM_LOCK_PR)
- lp->req = DLM_LOCK_CW;
- else if (lp->req == DLM_LOCK_CW)
- lp->req = DLM_LOCK_PR;
- }
-
- /*
- * A canceled lock request. The lock was just taken off the delayed
- * list and was never even submitted to dlm.
- */
-
- if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
- log_info("complete internal cancel %x,%llx",
- lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number);
- lp->req = lp->cur;
- acb.lc_ret |= LM_OUT_CANCELED;
- goto out;
- }
-
- /*
- * An error occured.
- */
-
- if (lp->lksb.sb_status) {
- /* a "normal" error */
- if ((lp->lksb.sb_status == -EAGAIN) &&
- (lp->lkf & DLM_LKF_NOQUEUE)) {
- lp->req = lp->cur;
- if (lp->cur == DLM_LOCK_IV)
- lp->lksb.sb_lkid = 0;
- goto out;
- }
-
- /* this could only happen with cancels I think */
- log_info("ast sb_status %d %x,%llx flags %lx",
- lp->lksb.sb_status, lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number,
- lp->flags);
- if (lp->lksb.sb_status == -EDEADLOCK &&
- lp->ls->fsflags & LM_MFLAG_CONV_NODROP) {
- lp->req = lp->cur;
- acb.lc_ret |= LM_OUT_CONV_DEADLK;
- if (lp->cur == DLM_LOCK_IV)
- lp->lksb.sb_lkid = 0;
- goto out;
- } else
- return;
- }
-
- /*
- * This is an AST for an EX->EX conversion for sync_lvb from GFS.
- */
-
- if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
- wake_up_ast(lp);
- return;
- }
-
- /*
- * A lock has been demoted to NL because it initially completed during
- * BLOCK_LOCKS. Now it must be requested in the originally requested
- * mode.
- */
-
- if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
- gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx",
- lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number);
- gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx",
- lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number);
-
- lp->cur = DLM_LOCK_NL;
- lp->req = lp->prev_req;
- lp->prev_req = DLM_LOCK_IV;
- lp->lkf &= ~DLM_LKF_CONVDEADLK;
-
- set_bit(LFL_NOCACHE, &lp->flags);
-
- if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
- !test_bit(LFL_NOBLOCK, &lp->flags))
- gdlm_queue_delayed(lp);
- else
- queue_submit(lp);
- return;
- }
-
- /*
- * A request is granted during dlm recovery. It may be granted
- * because the locks of a failed node were cleared. In that case,
- * there may be inconsistent data beneath this lock and we must wait
- * for recovery to complete to use it. When gfs recovery is done this
- * granted lock will be converted to NL and then reacquired in this
- * granted state.
- */
-
- if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
- !test_bit(LFL_NOBLOCK, &lp->flags) &&
- lp->req != DLM_LOCK_NL) {
-
- lp->cur = lp->req;
- lp->prev_req = lp->req;
- lp->req = DLM_LOCK_NL;
- lp->lkf |= DLM_LKF_CONVERT;
- lp->lkf &= ~DLM_LKF_CONVDEADLK;
-
- log_debug("rereq %x,%llx id %x %d,%d",
- lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number,
- lp->lksb.sb_lkid, lp->cur, lp->req);
-
- set_bit(LFL_REREQUEST, &lp->flags);
- queue_submit(lp);
- return;
- }
-
- /*
- * DLM demoted the lock to NL before it was granted so GFS must be
- * told it cannot cache data for this lock.
- */
-
- if (lp->lksb.sb_flags & DLM_SBF_DEMOTED)
- set_bit(LFL_NOCACHE, &lp->flags);
-
-out:
- /*
- * This is an internal lock_dlm lock
- */
-
- if (test_bit(LFL_INLOCK, &lp->flags)) {
- clear_bit(LFL_NOBLOCK, &lp->flags);
- lp->cur = lp->req;
- wake_up_ast(lp);
- return;
- }
-
- /*
- * Normal completion of a lock request. Tell GFS it now has the lock.
- */
-
- clear_bit(LFL_NOBLOCK, &lp->flags);
- lp->cur = lp->req;
-
- acb.lc_name = lp->lockname;
- acb.lc_ret |= gdlm_make_lmstate(lp->cur);
-
- if (!test_and_clear_bit(LFL_NOCACHE, &lp->flags) &&
- (lp->cur > DLM_LOCK_NL) && (prev_mode > DLM_LOCK_NL))
- acb.lc_ret |= LM_OUT_CACHEABLE;
-
- ls->fscb(ls->sdp, LM_CB_ASYNC, &acb);
-}
-
-static inline int no_work(struct gdlm_ls *ls, int blocking)
+static inline int no_work(struct gdlm_ls *ls)
{
int ret;
spin_lock(&ls->async_lock);
- ret = list_empty(&ls->complete) && list_empty(&ls->submit);
- if (ret && blocking)
- ret = list_empty(&ls->blocking);
+ ret = list_empty(&ls->submit);
spin_unlock(&ls->async_lock);
return ret;
}
-static inline int check_drop(struct gdlm_ls *ls)
-{
- if (!ls->drop_locks_count)
- return 0;
-
- if (time_after(jiffies, ls->drop_time + ls->drop_locks_period * HZ)) {
- ls->drop_time = jiffies;
- if (ls->all_locks_count >= ls->drop_locks_count)
- return 1;
- }
- return 0;
-}
-
-static int gdlm_thread(void *data, int blist)
+static int gdlm_thread(void *data)
{
struct gdlm_ls *ls = (struct gdlm_ls *) data;
struct gdlm_lock *lp = NULL;
- uint8_t complete, blocking, submit, drop;
-
- /* Only thread1 is allowed to do blocking callbacks since gfs
- may wait for a completion callback within a blocking cb. */
while (!kthread_should_stop()) {
wait_event_interruptible(ls->thread_wait,
- !no_work(ls, blist) || kthread_should_stop());
-
- complete = blocking = submit = drop = 0;
+ !no_work(ls) || kthread_should_stop());
spin_lock(&ls->async_lock);
- if (blist && !list_empty(&ls->blocking)) {
- lp = list_entry(ls->blocking.next, struct gdlm_lock,
- blist);
- list_del_init(&lp->blist);
- blocking = lp->bast_mode;
- lp->bast_mode = 0;
- } else if (!list_empty(&ls->complete)) {
- lp = list_entry(ls->complete.next, struct gdlm_lock,
- clist);
- list_del_init(&lp->clist);
- complete = 1;
- } else if (!list_empty(&ls->submit)) {
+ if (!list_empty(&ls->submit)) {
lp = list_entry(ls->submit.next, struct gdlm_lock,
delay_list);
list_del_init(&lp->delay_list);
- submit = 1;
+ spin_unlock(&ls->async_lock);
+ gdlm_do_lock(lp);
+ spin_lock(&ls->async_lock);
}
-
- drop = check_drop(ls);
spin_unlock(&ls->async_lock);
-
- if (complete)
- process_complete(lp);
-
- else if (blocking)
- process_blocking(lp, blocking);
-
- else if (submit)
- gdlm_do_lock(lp);
-
- if (drop)
- ls->fscb(ls->sdp, LM_CB_DROPLOCKS, NULL);
-
- schedule();
}
return 0;
}
-static int gdlm_thread1(void *data)
-{
- return gdlm_thread(data, 1);
-}
-
-static int gdlm_thread2(void *data)
-{
- return gdlm_thread(data, 0);
-}
-
int gdlm_init_threads(struct gdlm_ls *ls)
{
struct task_struct *p;
int error;
- p = kthread_run(gdlm_thread1, ls, "lock_dlm1");
- error = IS_ERR(p);
- if (error) {
- log_error("can't start lock_dlm1 thread %d", error);
- return error;
- }
- ls->thread1 = p;
-
- p = kthread_run(gdlm_thread2, ls, "lock_dlm2");
+ p = kthread_run(gdlm_thread, ls, "lock_dlm");
error = IS_ERR(p);
if (error) {
- log_error("can't start lock_dlm2 thread %d", error);
- kthread_stop(ls->thread1);
+ log_error("can't start lock_dlm thread %d", error);
return error;
}
- ls->thread2 = p;
+ ls->thread = p;
return 0;
}
void gdlm_release_threads(struct gdlm_ls *ls)
{
- kthread_stop(ls->thread1);
- kthread_stop(ls->thread2);
+ kthread_stop(ls->thread);
}
diff --git a/fs/gfs2/locking/nolock/Makefile b/fs/gfs2/locking/nolock/Makefile
deleted file mode 100644
index 35e9730bc3a8..000000000000
--- a/fs/gfs2/locking/nolock/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
-obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += lock_nolock.o
-lock_nolock-y := main.o
-
diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c
deleted file mode 100644
index 284a5ece8d94..000000000000
--- a/fs/gfs2/locking/nolock/main.c
+++ /dev/null
@@ -1,238 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/fs.h>
-#include <linux/lm_interface.h>
-
-struct nolock_lockspace {
- unsigned int nl_lvb_size;
-};
-
-static const struct lm_lockops nolock_ops;
-
-static int nolock_mount(char *table_name, char *host_data,
- lm_callback_t cb, void *cb_data,
- unsigned int min_lvb_size, int flags,
- struct lm_lockstruct *lockstruct,
- struct kobject *fskobj)
-{
- char *c;
- unsigned int jid;
- struct nolock_lockspace *nl;
-
- c = strstr(host_data, "jid=");
- if (!c)
- jid = 0;
- else {
- c += 4;
- sscanf(c, "%u", &jid);
- }
-
- nl = kzalloc(sizeof(struct nolock_lockspace), GFP_KERNEL);
- if (!nl)
- return -ENOMEM;
-
- nl->nl_lvb_size = min_lvb_size;
-
- lockstruct->ls_jid = jid;
- lockstruct->ls_first = 1;
- lockstruct->ls_lvb_size = min_lvb_size;
- lockstruct->ls_lockspace = nl;
- lockstruct->ls_ops = &nolock_ops;
- lockstruct->ls_flags = LM_LSFLAG_LOCAL;
-
- return 0;
-}
-
-static void nolock_others_may_mount(void *lockspace)
-{
-}
-
-static void nolock_unmount(void *lockspace)
-{
- struct nolock_lockspace *nl = lockspace;
- kfree(nl);
-}
-
-static void nolock_withdraw(void *lockspace)
-{
-}
-
-/**
- * nolock_get_lock - get a lm_lock_t given a descripton of the lock
- * @lockspace: the lockspace the lock lives in
- * @name: the name of the lock
- * @lockp: return the lm_lock_t here
- *
- * Returns: 0 on success, -EXXX on failure
- */
-
-static int nolock_get_lock(void *lockspace, struct lm_lockname *name,
- void **lockp)
-{
- *lockp = lockspace;
- return 0;
-}
-
-/**
- * nolock_put_lock - get rid of a lock structure
- * @lock: the lock to throw away
- *
- */
-
-static void nolock_put_lock(void *lock)
-{
-}
-
-/**
- * nolock_lock - acquire a lock
- * @lock: the lock to manipulate
- * @cur_state: the current state
- * @req_state: the requested state
- * @flags: modifier flags
- *
- * Returns: A bitmap of LM_OUT_*
- */
-
-static unsigned int nolock_lock(void *lock, unsigned int cur_state,
- unsigned int req_state, unsigned int flags)
-{
- return req_state | LM_OUT_CACHEABLE;
-}
-
-/**
- * nolock_unlock - unlock a lock
- * @lock: the lock to manipulate
- * @cur_state: the current state
- *
- * Returns: 0
- */
-
-static unsigned int nolock_unlock(void *lock, unsigned int cur_state)
-{
- return 0;
-}
-
-static void nolock_cancel(void *lock)
-{
-}
-
-/**
- * nolock_hold_lvb - hold on to a lock value block
- * @lock: the lock the LVB is associated with
- * @lvbp: return the lm_lvb_t here
- *
- * Returns: 0 on success, -EXXX on failure
- */
-
-static int nolock_hold_lvb(void *lock, char **lvbp)
-{
- struct nolock_lockspace *nl = lock;
- int error = 0;
-
- *lvbp = kzalloc(nl->nl_lvb_size, GFP_NOFS);
- if (!*lvbp)
- error = -ENOMEM;
-
- return error;
-}
-
-/**
- * nolock_unhold_lvb - release a LVB
- * @lock: the lock the LVB is associated with
- * @lvb: the lock value block
- *
- */
-
-static void nolock_unhold_lvb(void *lock, char *lvb)
-{
- kfree(lvb);
-}
-
-static int nolock_plock_get(void *lockspace, struct lm_lockname *name,
- struct file *file, struct file_lock *fl)
-{
- posix_test_lock(file, fl);
-
- return 0;
-}
-
-static int nolock_plock(void *lockspace, struct lm_lockname *name,
- struct file *file, int cmd, struct file_lock *fl)
-{
- int error;
- error = posix_lock_file_wait(file, fl);
- return error;
-}
-
-static int nolock_punlock(void *lockspace, struct lm_lockname *name,
- struct file *file, struct file_lock *fl)
-{
- int error;
- error = posix_lock_file_wait(file, fl);
- return error;
-}
-
-static void nolock_recovery_done(void *lockspace, unsigned int jid,
- unsigned int message)
-{
-}
-
-static const struct lm_lockops nolock_ops = {
- .lm_proto_name = "lock_nolock",
- .lm_mount = nolock_mount,
- .lm_others_may_mount = nolock_others_may_mount,
- .lm_unmount = nolock_unmount,
- .lm_withdraw = nolock_withdraw,
- .lm_get_lock = nolock_get_lock,
- .lm_put_lock = nolock_put_lock,
- .lm_lock = nolock_lock,
- .lm_unlock = nolock_unlock,
- .lm_cancel = nolock_cancel,
- .lm_hold_lvb = nolock_hold_lvb,
- .lm_unhold_lvb = nolock_unhold_lvb,
- .lm_plock_get = nolock_plock_get,
- .lm_plock = nolock_plock,
- .lm_punlock = nolock_punlock,
- .lm_recovery_done = nolock_recovery_done,
- .lm_owner = THIS_MODULE,
-};
-
-static int __init init_nolock(void)
-{
- int error;
-
- error = gfs2_register_lockproto(&nolock_ops);
- if (error) {
- printk(KERN_WARNING
- "lock_nolock: can't register protocol: %d\n", error);
- return error;
- }
-
- printk(KERN_INFO
- "Lock_Nolock (built %s %s) installed\n", __DATE__, __TIME__);
- return 0;
-}
-
-static void __exit exit_nolock(void)
-{
- gfs2_unregister_lockproto(&nolock_ops);
-}
-
-module_init(init_nolock);
-module_exit(exit_nolock);
-
-MODULE_DESCRIPTION("GFS Nolock Locking Module");
-MODULE_AUTHOR("Red Hat, Inc.");
-MODULE_LICENSE("GPL");
-
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 548264b1836d..6c6af9f5e3ab 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -87,6 +87,8 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
*/
static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+__releases(&sdp->sd_log_lock)
+__acquires(&sdp->sd_log_lock)
{
struct gfs2_bufdata *bd, *s;
struct buffer_head *bh;
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 771152816508..7c64510ccfd2 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -21,6 +21,7 @@
*/
static inline void gfs2_log_lock(struct gfs2_sbd *sdp)
+__acquires(&sdp->sd_log_lock)
{
spin_lock(&sdp->sd_log_lock);
}
@@ -32,6 +33,7 @@ static inline void gfs2_log_lock(struct gfs2_sbd *sdp)
*/
static inline void gfs2_log_unlock(struct gfs2_sbd *sdp)
+__releases(&sdp->sd_log_lock)
{
spin_unlock(&sdp->sd_log_lock);
}
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 053e2ebbbd50..bcc668d0fadd 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -40,8 +40,6 @@ static void gfs2_init_glock_once(struct kmem_cache *cachep, void *foo)
INIT_HLIST_NODE(&gl->gl_list);
spin_lock_init(&gl->gl_spin);
INIT_LIST_HEAD(&gl->gl_holders);
- INIT_LIST_HEAD(&gl->gl_waiters1);
- INIT_LIST_HEAD(&gl->gl_waiters3);
gl->gl_lvb = NULL;
atomic_set(&gl->gl_lvb_count, 0);
INIT_LIST_HEAD(&gl->gl_reclaim);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 78d75f892f82..09853620c951 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -129,7 +129,7 @@ void gfs2_meta_sync(struct gfs2_glock *gl)
}
/**
- * getbuf - Get a buffer with a given address space
+ * gfs2_getbuf - Get a buffer with a given address space
* @gl: the glock
* @blkno: the block number (filesystem scope)
* @create: 1 if the buffer should be created
@@ -137,7 +137,7 @@ void gfs2_meta_sync(struct gfs2_glock *gl)
* Returns: the buffer
*/
-static struct buffer_head *getbuf(struct gfs2_glock *gl, u64 blkno, int create)
+struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
{
struct address_space *mapping = gl->gl_aspace->i_mapping;
struct gfs2_sbd *sdp = gl->gl_sbd;
@@ -205,7 +205,7 @@ static void meta_prep_new(struct buffer_head *bh)
struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
{
struct buffer_head *bh;
- bh = getbuf(gl, blkno, CREATE);
+ bh = gfs2_getbuf(gl, blkno, CREATE);
meta_prep_new(bh);
return bh;
}
@@ -223,7 +223,7 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
struct buffer_head **bhp)
{
- *bhp = getbuf(gl, blkno, CREATE);
+ *bhp = gfs2_getbuf(gl, blkno, CREATE);
if (!buffer_uptodate(*bhp)) {
ll_rw_block(READ_META, 1, bhp);
if (flags & DIO_WAIT) {
@@ -346,7 +346,7 @@ void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
struct buffer_head *bh;
while (blen) {
- bh = getbuf(ip->i_gl, bstart, NO_CREATE);
+ bh = gfs2_getbuf(ip->i_gl, bstart, NO_CREATE);
if (bh) {
lock_buffer(bh);
gfs2_log_lock(sdp);
@@ -421,7 +421,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
if (extlen > max_ra)
extlen = max_ra;
- first_bh = getbuf(gl, dblock, CREATE);
+ first_bh = gfs2_getbuf(gl, dblock, CREATE);
if (buffer_uptodate(first_bh))
goto out;
@@ -432,7 +432,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
extlen--;
while (extlen) {
- bh = getbuf(gl, dblock, CREATE);
+ bh = gfs2_getbuf(gl, dblock, CREATE);
if (!buffer_uptodate(bh) && !buffer_locked(bh))
ll_rw_block(READA, 1, &bh);
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 73e3b1c76fe1..b1a5f3674d43 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -47,6 +47,7 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno,
int flags, struct buffer_head **bhp);
int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
+struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create);
void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
int meta);
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index f55394e57cb2..e64a1b04117a 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -499,34 +499,34 @@ static int __gfs2_readpage(void *file, struct page *page)
* @file: The file to read
* @page: The page of the file
*
- * This deals with the locking required. We use a trylock in order to
- * avoid the page lock / glock ordering problems returning AOP_TRUNCATED_PAGE
- * in the event that we are unable to get the lock.
+ * This deals with the locking required. We have to unlock and
+ * relock the page in order to get the locking in the right
+ * order.
*/
static int gfs2_readpage(struct file *file, struct page *page)
{
- struct gfs2_inode *ip = GFS2_I(page->mapping->host);
- struct gfs2_holder *gh;
+ struct address_space *mapping = page->mapping;
+ struct gfs2_inode *ip = GFS2_I(mapping->host);
+ struct gfs2_holder gh;
int error;
- gh = gfs2_glock_is_locked_by_me(ip->i_gl);
- if (!gh) {
- gh = kmalloc(sizeof(struct gfs2_holder), GFP_NOFS);
- if (!gh)
- return -ENOBUFS;
- gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, gh);
+ unlock_page(page);
+ gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
+ error = gfs2_glock_nq_atime(&gh);
+ if (unlikely(error))
+ goto out;
+ error = AOP_TRUNCATED_PAGE;
+ lock_page(page);
+ if (page->mapping == mapping && !PageUptodate(page))
+ error = __gfs2_readpage(file, page);
+ else
unlock_page(page);
- error = gfs2_glock_nq_atime(gh);
- if (likely(error != 0))
- goto out;
- return AOP_TRUNCATED_PAGE;
- }
- error = __gfs2_readpage(file, page);
- gfs2_glock_dq(gh);
+ gfs2_glock_dq(&gh);
out:
- gfs2_holder_uninit(gh);
- kfree(gh);
+ gfs2_holder_uninit(&gh);
+ if (error && error != AOP_TRUNCATED_PAGE)
+ lock_page(page);
return error;
}
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index e1b7d525a066..e9a366d4411c 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -15,6 +15,7 @@
#include <linux/uio.h>
#include <linux/blkdev.h>
#include <linux/mm.h>
+#include <linux/mount.h>
#include <linux/fs.h>
#include <linux/gfs2_ondisk.h>
#include <linux/ext2_fs.h>
@@ -62,11 +63,11 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin)
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
&i_gh);
if (!error) {
- error = remote_llseek(file, offset, origin);
+ error = generic_file_llseek_unlocked(file, offset, origin);
gfs2_glock_dq_uninit(&i_gh);
}
} else
- error = remote_llseek(file, offset, origin);
+ error = generic_file_llseek_unlocked(file, offset, origin);
return error;
}
@@ -133,7 +134,6 @@ static const u32 fsflags_to_gfs2[32] = {
[7] = GFS2_DIF_NOATIME,
[12] = GFS2_DIF_EXHASH,
[14] = GFS2_DIF_INHERIT_JDATA,
- [20] = GFS2_DIF_INHERIT_DIRECTIO,
};
static const u32 gfs2_to_fsflags[32] = {
@@ -142,7 +142,6 @@ static const u32 gfs2_to_fsflags[32] = {
[gfs2fl_AppendOnly] = FS_APPEND_FL,
[gfs2fl_NoAtime] = FS_NOATIME_FL,
[gfs2fl_ExHash] = FS_INDEX_FL,
- [gfs2fl_InheritDirectio] = FS_DIRECTIO_FL,
[gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL,
};
@@ -160,12 +159,8 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
return error;
fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags);
- if (!S_ISDIR(inode->i_mode)) {
- if (ip->i_di.di_flags & GFS2_DIF_JDATA)
- fsflags |= FS_JOURNAL_DATA_FL;
- if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO)
- fsflags |= FS_DIRECTIO_FL;
- }
+ if (!S_ISDIR(inode->i_mode) && ip->i_di.di_flags & GFS2_DIF_JDATA)
+ fsflags |= FS_JOURNAL_DATA_FL;
if (put_user(fsflags, ptr))
error = -EFAULT;
@@ -194,13 +189,11 @@ void gfs2_set_inode_flags(struct inode *inode)
/* Flags that can be set by user space */
#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \
- GFS2_DIF_DIRECTIO| \
GFS2_DIF_IMMUTABLE| \
GFS2_DIF_APPENDONLY| \
GFS2_DIF_NOATIME| \
GFS2_DIF_SYNC| \
GFS2_DIF_SYSTEM| \
- GFS2_DIF_INHERIT_DIRECTIO| \
GFS2_DIF_INHERIT_JDATA)
/**
@@ -220,10 +213,14 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
int error;
u32 new_flags, flags;
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+ error = mnt_want_write(filp->f_path.mnt);
if (error)
return error;
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+ if (error)
+ goto out_drop_write;
+
flags = ip->i_di.di_flags;
new_flags = (flags & ~mask) | (reqflags & mask);
if ((new_flags ^ flags) == 0)
@@ -242,7 +239,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
!capable(CAP_LINUX_IMMUTABLE))
goto out;
if (!IS_IMMUTABLE(inode)) {
- error = permission(inode, MAY_WRITE, NULL);
+ error = gfs2_permission(inode, MAY_WRITE);
if (error)
goto out;
}
@@ -272,6 +269,8 @@ out_trans_end:
gfs2_trans_end(sdp);
out:
gfs2_glock_dq_uninit(&gh);
+out_drop_write:
+ mnt_drop_write(filp->f_path.mnt);
return error;
}
@@ -285,8 +284,6 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
if (!S_ISDIR(inode->i_mode)) {
if (gfsflags & GFS2_DIF_INHERIT_JDATA)
gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA);
- if (gfsflags & GFS2_DIF_INHERIT_DIRECTIO)
- gfsflags ^= (GFS2_DIF_DIRECTIO | GFS2_DIF_INHERIT_DIRECTIO);
return do_gfs2_set_flags(filp, gfsflags, ~0);
}
return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA);
@@ -487,11 +484,6 @@ static int gfs2_open(struct inode *inode, struct file *file)
goto fail_gunlock;
}
- /* Listen to the Direct I/O flag */
-
- if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO)
- file->f_flags |= O_DIRECT;
-
gfs2_glock_dq_uninit(&i_gh);
}
@@ -669,8 +661,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
int error = 0;
state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
- flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE
- | GL_FLOCK;
+ flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE;
mutex_lock(&fp->f_fl_mutex);
@@ -683,9 +674,8 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
gfs2_glock_dq_wait(fl_gh);
gfs2_holder_reinit(state, flags, fl_gh);
} else {
- error = gfs2_glock_get(GFS2_SB(&ip->i_inode),
- ip->i_no_addr, &gfs2_flock_glops,
- CREATE, &gl);
+ error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr,
+ &gfs2_flock_glops, CREATE, &gl);
if (error)
goto out;
gfs2_holder_init(gl, state, flags, fl_gh);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index b2028c82e8d1..b4d1d6490633 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -64,7 +64,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
mutex_init(&sdp->sd_rindex_mutex);
INIT_LIST_HEAD(&sdp->sd_rindex_list);
INIT_LIST_HEAD(&sdp->sd_rindex_mru_list);
- INIT_LIST_HEAD(&sdp->sd_rindex_recent_list);
INIT_LIST_HEAD(&sdp->sd_jindex_list);
spin_lock_init(&sdp->sd_jindex_spin);
@@ -364,6 +363,8 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
{
+ if (!sdp->sd_lockstruct.ls_ops->lm_others_may_mount)
+ return;
if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
sdp->sd_lockstruct.ls_ops->lm_others_may_mount(
sdp->sd_lockstruct.ls_lockspace);
@@ -741,8 +742,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
goto out;
}
- if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) ||
- gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
+ if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >=
GFS2_MIN_LVB_SIZE)) {
gfs2_unmount_lockproto(&sdp->sd_lockstruct);
@@ -873,7 +873,7 @@ fail_sb:
fail_locking:
init_locking(sdp, &mount_gh, UNDO);
fail_lm:
- gfs2_gl_hash_clear(sdp, WAIT);
+ gfs2_gl_hash_clear(sdp);
gfs2_lm_unmount(sdp);
while (invalidate_inodes(sb))
yield();
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 2686ad4c0029..1e252dfc5294 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -163,7 +163,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
if (error)
goto out;
- error = permission(dir, MAY_WRITE | MAY_EXEC, NULL);
+ error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC);
if (error)
goto out_gunlock;
@@ -669,7 +669,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
}
}
} else {
- error = permission(ndir, MAY_WRITE | MAY_EXEC, NULL);
+ error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC);
if (error)
goto out_gunlock;
@@ -704,7 +704,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
/* Check out the dir to be renamed */
if (dir_rename) {
- error = permission(odentry->d_inode, MAY_WRITE, NULL);
+ error = gfs2_permission(odentry->d_inode, MAY_WRITE);
if (error)
goto out_gunlock;
}
@@ -891,7 +891,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
* Returns: errno
*/
-static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
+int gfs2_permission(struct inode *inode, int mask)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder i_gh;
@@ -905,13 +905,22 @@ static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
unlock = 1;
}
- error = generic_permission(inode, mask, gfs2_check_acl);
+ if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
+ error = -EACCES;
+ else
+ error = generic_permission(inode, mask, gfs2_check_acl);
if (unlock)
gfs2_glock_dq_uninit(&i_gh);
return error;
}
+static int gfs2_iop_permission(struct inode *inode, int mask,
+ struct nameidata *nd)
+{
+ return gfs2_permission(inode, mask);
+}
+
static int setattr_size(struct inode *inode, struct iattr *attr)
{
struct gfs2_inode *ip = GFS2_I(inode);
@@ -1141,7 +1150,7 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
}
const struct inode_operations gfs2_file_iops = {
- .permission = gfs2_permission,
+ .permission = gfs2_iop_permission,
.setattr = gfs2_setattr,
.getattr = gfs2_getattr,
.setxattr = gfs2_setxattr,
@@ -1160,7 +1169,7 @@ const struct inode_operations gfs2_dir_iops = {
.rmdir = gfs2_rmdir,
.mknod = gfs2_mknod,
.rename = gfs2_rename,
- .permission = gfs2_permission,
+ .permission = gfs2_iop_permission,
.setattr = gfs2_setattr,
.getattr = gfs2_getattr,
.setxattr = gfs2_setxattr,
@@ -1172,7 +1181,7 @@ const struct inode_operations gfs2_dir_iops = {
const struct inode_operations gfs2_symlink_iops = {
.readlink = gfs2_readlink,
.follow_link = gfs2_follow_link,
- .permission = gfs2_permission,
+ .permission = gfs2_iop_permission,
.setattr = gfs2_setattr,
.getattr = gfs2_getattr,
.setxattr = gfs2_setxattr,
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 0b7cc920eb89..f66ea0f7a356 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -126,7 +126,7 @@ static void gfs2_put_super(struct super_block *sb)
gfs2_clear_rgrpd(sdp);
gfs2_jindex_free(sdp);
/* Take apart glock structures and buffer lists */
- gfs2_gl_hash_clear(sdp, WAIT);
+ gfs2_gl_hash_clear(sdp);
/* Unmount the locking protocol */
gfs2_lm_unmount(sdp);
@@ -155,7 +155,7 @@ static void gfs2_write_super(struct super_block *sb)
static int gfs2_sync_fs(struct super_block *sb, int wait)
{
sb->s_dirt = 0;
- if (wait)
+ if (wait && sb->s_fs_info)
gfs2_log_flush(sb->s_fs_info, NULL);
return 0;
}
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 56aaf915c59a..3e073f5144fa 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -904,7 +904,7 @@ static int need_sync(struct gfs2_quota_data *qd)
do_sync = 0;
else {
value *= gfs2_jindex_size(sdp) * num;
- do_div(value, den);
+ value = div_s64(value, den);
value += (s64)be64_to_cpu(qd->qd_qb.qb_value);
if (value < (s64)be64_to_cpu(qd->qd_qb.qb_limit))
do_sync = 0;
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 2888e4b4b1c5..d5e91f4f6a0b 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -428,6 +428,9 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea
static void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
unsigned int message)
{
+ if (!sdp->sd_lockstruct.ls_ops->lm_recovery_done)
+ return;
+
if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
sdp->sd_lockstruct.ls_ops->lm_recovery_done(
sdp->sd_lockstruct.ls_lockspace, jid, message);
@@ -505,7 +508,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd)
error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
LM_FLAG_NOEXP | LM_FLAG_PRIORITY |
- GL_NOCANCEL | GL_NOCACHE, &t_gh);
+ GL_NOCACHE, &t_gh);
if (error)
goto fail_gunlock_ji;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 3401628d742b..2d90fb253505 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -371,11 +371,6 @@ static void clear_rgrpdi(struct gfs2_sbd *sdp)
spin_lock(&sdp->sd_rindex_spin);
sdp->sd_rindex_forward = NULL;
- head = &sdp->sd_rindex_recent_list;
- while (!list_empty(head)) {
- rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
- list_del(&rgd->rd_recent);
- }
spin_unlock(&sdp->sd_rindex_spin);
head = &sdp->sd_rindex_list;
@@ -945,107 +940,30 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
}
/**
- * recent_rgrp_first - get first RG from "recent" list
- * @sdp: The GFS2 superblock
- * @rglast: address of the rgrp used last
- *
- * Returns: The first rgrp in the recent list
- */
-
-static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp,
- u64 rglast)
-{
- struct gfs2_rgrpd *rgd;
-
- spin_lock(&sdp->sd_rindex_spin);
-
- if (rglast) {
- list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
- if (rgrp_contains_block(rgd, rglast))
- goto out;
- }
- }
- rgd = NULL;
- if (!list_empty(&sdp->sd_rindex_recent_list))
- rgd = list_entry(sdp->sd_rindex_recent_list.next,
- struct gfs2_rgrpd, rd_recent);
-out:
- spin_unlock(&sdp->sd_rindex_spin);
- return rgd;
-}
-
-/**
* recent_rgrp_next - get next RG from "recent" list
* @cur_rgd: current rgrp
- * @remove:
*
* Returns: The next rgrp in the recent list
*/
-static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd,
- int remove)
+static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd)
{
struct gfs2_sbd *sdp = cur_rgd->rd_sbd;
struct list_head *head;
struct gfs2_rgrpd *rgd;
spin_lock(&sdp->sd_rindex_spin);
-
- head = &sdp->sd_rindex_recent_list;
-
- list_for_each_entry(rgd, head, rd_recent) {
- if (rgd == cur_rgd) {
- if (cur_rgd->rd_recent.next != head)
- rgd = list_entry(cur_rgd->rd_recent.next,
- struct gfs2_rgrpd, rd_recent);
- else
- rgd = NULL;
-
- if (remove)
- list_del(&cur_rgd->rd_recent);
-
- goto out;
- }
+ head = &sdp->sd_rindex_mru_list;
+ if (unlikely(cur_rgd->rd_list_mru.next == head)) {
+ spin_unlock(&sdp->sd_rindex_spin);
+ return NULL;
}
-
- rgd = NULL;
- if (!list_empty(head))
- rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
-
-out:
+ rgd = list_entry(cur_rgd->rd_list_mru.next, struct gfs2_rgrpd, rd_list_mru);
spin_unlock(&sdp->sd_rindex_spin);
return rgd;
}
/**
- * recent_rgrp_add - add an RG to tail of "recent" list
- * @new_rgd: The rgrp to add
- *
- */
-
-static void recent_rgrp_add(struct gfs2_rgrpd *new_rgd)
-{
- struct gfs2_sbd *sdp = new_rgd->rd_sbd;
- struct gfs2_rgrpd *rgd;
- unsigned int count = 0;
- unsigned int max = sdp->sd_rgrps / gfs2_jindex_size(sdp);
-
- spin_lock(&sdp->sd_rindex_spin);
-
- list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
- if (rgd == new_rgd)
- goto out;
-
- if (++count >= max)
- goto out;
- }
- list_add_tail(&new_rgd->rd_recent, &sdp->sd_rindex_recent_list);
-
-out:
- spin_unlock(&sdp->sd_rindex_spin);
-}
-
-/**
* forward_rgrp_get - get an rgrp to try next from full list
* @sdp: The GFS2 superblock
*
@@ -1112,9 +1030,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
int loops = 0;
int error, rg_locked;
- /* Try recently successful rgrps */
-
- rgd = recent_rgrp_first(sdp, ip->i_goal);
+ rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
while (rgd) {
rg_locked = 0;
@@ -1136,11 +1052,9 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
gfs2_glock_dq_uninit(&al->al_rgd_gh);
if (inode)
return inode;
- rgd = recent_rgrp_next(rgd, 1);
- break;
-
+ /* fall through */
case GLR_TRYFAILED:
- rgd = recent_rgrp_next(rgd, 0);
+ rgd = recent_rgrp_next(rgd);
break;
default:
@@ -1199,7 +1113,9 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
out:
if (begin) {
- recent_rgrp_add(rgd);
+ spin_lock(&sdp->sd_rindex_spin);
+ list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
+ spin_unlock(&sdp->sd_rindex_spin);
rgd = gfs2_rgrpd_get_next(rgd);
if (!rgd)
rgd = gfs2_rgrpd_get_first(sdp);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 7aeacbc65f35..63a8a902d9db 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -65,7 +65,6 @@ void gfs2_tune_init(struct gfs2_tune *gt)
gt->gt_quota_quantum = 60;
gt->gt_atime_quantum = 3600;
gt->gt_new_files_jdata = 0;
- gt->gt_new_files_directio = 0;
gt->gt_max_readahead = 1 << 18;
gt->gt_stall_secs = 600;
gt->gt_complain_secs = 10;
@@ -941,8 +940,7 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
}
error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED,
- LM_FLAG_PRIORITY | GL_NOCACHE,
- t_gh);
+ GL_NOCACHE, t_gh);
list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
error = gfs2_jdesc_check(jd);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 9ab9fc85ecd0..74846559fc3f 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -110,18 +110,6 @@ static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
return len;
}
-static ssize_t shrink_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
-{
- if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
-
- if (simple_strtol(buf, NULL, 0) != 1)
- return -EINVAL;
-
- gfs2_gl_hash_clear(sdp, NO_WAIT);
- return len;
-}
-
static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
size_t len)
{
@@ -175,7 +163,6 @@ static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store)
GFS2_ATTR(id, 0444, id_show, NULL);
GFS2_ATTR(fsname, 0444, fsname_show, NULL);
GFS2_ATTR(freeze, 0644, freeze_show, freeze_store);
-GFS2_ATTR(shrink, 0200, NULL, shrink_store);
GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store);
GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store);
@@ -186,7 +173,6 @@ static struct attribute *gfs2_attrs[] = {
&gfs2_attr_id.attr,
&gfs2_attr_fsname.attr,
&gfs2_attr_freeze.attr,
- &gfs2_attr_shrink.attr,
&gfs2_attr_withdraw.attr,
&gfs2_attr_statfs_sync.attr,
&gfs2_attr_quota_sync.attr,
@@ -426,7 +412,6 @@ TUNE_ATTR(max_readahead, 0);
TUNE_ATTR(complain_secs, 0);
TUNE_ATTR(statfs_slow, 0);
TUNE_ATTR(new_files_jdata, 0);
-TUNE_ATTR(new_files_directio, 0);
TUNE_ATTR(quota_simul_sync, 1);
TUNE_ATTR(quota_cache_secs, 1);
TUNE_ATTR(stall_secs, 1);
@@ -455,7 +440,6 @@ static struct attribute *tune_attrs[] = {
&tune_attr_quotad_secs.attr,
&tune_attr_quota_scale.attr,
&tune_attr_new_files_jdata.attr,
- &tune_attr_new_files_directio.attr,
NULL,
};
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 6914598022ce..91389c8aee8a 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -688,7 +688,6 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
J_ASSERT(transaction->t_state == T_FINISHED);
J_ASSERT(transaction->t_buffers == NULL);
- J_ASSERT(transaction->t_sync_datalist == NULL);
J_ASSERT(transaction->t_forget == NULL);
J_ASSERT(transaction->t_iobuf_list == NULL);
J_ASSERT(transaction->t_shadow_list == NULL);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index a2ed72f7ceee..f8b3be873226 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -22,6 +22,8 @@
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
/*
* Default IO end handler for temporary BJ_IO buffer_heads.
@@ -37,8 +39,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
}
/*
- * When an ext3-ordered file is truncated, it is possible that many pages are
- * not sucessfully freed, because they are attached to a committing transaction.
+ * When an ext4 file is truncated, it is possible that some pages are not
+ * successfully freed, because they are attached to a committing transaction.
* After the transaction commits, these pages are left on the LRU, with no
* ->mapping, and with attached buffers. These pages are trivially reclaimable
* by the VM, but their apparent absence upsets the VM accounting, and it makes
@@ -80,21 +82,6 @@ nope:
}
/*
- * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
- * held. For ranking reasons we must trylock. If we lose, schedule away and
- * return 0. j_list_lock is dropped in this case.
- */
-static int inverted_lock(journal_t *journal, struct buffer_head *bh)
-{
- if (!jbd_trylock_bh_state(bh)) {
- spin_unlock(&journal->j_list_lock);
- schedule();
- return 0;
- }
- return 1;
-}
-
-/*
* Done it all: now submit the commit record. We should have
* cleaned up our previous buffers by now, so if we are in abort
* mode we can now just skip the rest of the journal write
@@ -112,6 +99,7 @@ static int journal_submit_commit_record(journal_t *journal,
struct buffer_head *bh;
int ret;
int barrier_done = 0;
+ struct timespec now = current_kernel_time();
if (is_journal_aborted(journal))
return 0;
@@ -126,6 +114,8 @@ static int journal_submit_commit_record(journal_t *journal,
tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
+ tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
+ tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
if (JBD2_HAS_COMPAT_FEATURE(journal,
JBD2_FEATURE_COMPAT_CHECKSUM)) {
@@ -197,159 +187,104 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
}
/*
- * Wait for all submitted IO to complete.
+ * Write the filemap data using writepage() address_space_operations.
+ * We don't do block allocation here even for delalloc. We don't
+ * use writepages() because with delayed allocation we may be doing
+ * block allocation in writepages().
*/
-static int journal_wait_on_locked_list(journal_t *journal,
- transaction_t *commit_transaction)
+static int journal_submit_inode_data_buffers(struct address_space *mapping)
{
- int ret = 0;
- struct journal_head *jh;
-
- while (commit_transaction->t_locked_list) {
- struct buffer_head *bh;
-
- jh = commit_transaction->t_locked_list->b_tprev;
- bh = jh2bh(jh);
- get_bh(bh);
- if (buffer_locked(bh)) {
- spin_unlock(&journal->j_list_lock);
- wait_on_buffer(bh);
- if (unlikely(!buffer_uptodate(bh)))
- ret = -EIO;
- spin_lock(&journal->j_list_lock);
- }
- if (!inverted_lock(journal, bh)) {
- put_bh(bh);
- spin_lock(&journal->j_list_lock);
- continue;
- }
- if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
- __jbd2_journal_unfile_buffer(jh);
- jbd_unlock_bh_state(bh);
- jbd2_journal_remove_journal_head(bh);
- put_bh(bh);
- } else {
- jbd_unlock_bh_state(bh);
- }
- put_bh(bh);
- cond_resched_lock(&journal->j_list_lock);
- }
+ int ret;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = mapping->nrpages * 2,
+ .range_start = 0,
+ .range_end = i_size_read(mapping->host),
+ .for_writepages = 1,
+ };
+
+ ret = generic_writepages(mapping, &wbc);
return ret;
- }
+}
-static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
+/*
+ * Submit all the data buffers of inode associated with the transaction to
+ * disk.
+ *
+ * We are in a committing transaction. Therefore no new inode can be added to
+ * our inode list. We use the JI_COMMIT_RUNNING flag to protect the inode we
+ * currently operate on from being released while we write out pages.
+ */
+static int journal_submit_data_buffers(journal_t *journal,
+ transaction_t *commit_transaction)
{
- int i;
+ struct jbd2_inode *jinode;
+ int err, ret = 0;
+ struct address_space *mapping;
- for (i = 0; i < bufs; i++) {
- wbuf[i]->b_end_io = end_buffer_write_sync;
- /* We use-up our safety reference in submit_bh() */
- submit_bh(WRITE, wbuf[i]);
+ spin_lock(&journal->j_list_lock);
+ list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+ mapping = jinode->i_vfs_inode->i_mapping;
+ jinode->i_flags |= JI_COMMIT_RUNNING;
+ spin_unlock(&journal->j_list_lock);
+ /*
+ * Submit the inode data buffers. We use writepage
+ * instead of writepages because writepages can do
+ * block allocation with delalloc; we need to write
+ * only allocated blocks here.
+ */
+ err = journal_submit_inode_data_buffers(mapping);
+ if (!ret)
+ ret = err;
+ spin_lock(&journal->j_list_lock);
+ J_ASSERT(jinode->i_transaction == commit_transaction);
+ jinode->i_flags &= ~JI_COMMIT_RUNNING;
+ wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
}
+ spin_unlock(&journal->j_list_lock);
+ return ret;
}
/*
- * Submit all the data buffers to disk
+ * Wait for data submitted for writeout, refile inodes to proper
+ * transaction if needed.
+ *
*/
-static void journal_submit_data_buffers(journal_t *journal,
- transaction_t *commit_transaction)
+static int journal_finish_inode_data_buffers(journal_t *journal,
+ transaction_t *commit_transaction)
{
- struct journal_head *jh;
- struct buffer_head *bh;
- int locked;
- int bufs = 0;
- struct buffer_head **wbuf = journal->j_wbuf;
+ struct jbd2_inode *jinode, *next_i;
+ int err, ret = 0;
- /*
- * Whenever we unlock the journal and sleep, things can get added
- * onto ->t_sync_datalist, so we have to keep looping back to
- * write_out_data until we *know* that the list is empty.
- *
- * Cleanup any flushed data buffers from the data list. Even in
- * abort mode, we want to flush this out as soon as possible.
- */
-write_out_data:
- cond_resched();
+ /* For locking, see the comment in journal_submit_data_buffers() */
spin_lock(&journal->j_list_lock);
+ list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+ jinode->i_flags |= JI_COMMIT_RUNNING;
+ spin_unlock(&journal->j_list_lock);
+ err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
+ if (!ret)
+ ret = err;
+ spin_lock(&journal->j_list_lock);
+ jinode->i_flags &= ~JI_COMMIT_RUNNING;
+ wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
+ }
- while (commit_transaction->t_sync_datalist) {
- jh = commit_transaction->t_sync_datalist;
- bh = jh2bh(jh);
- locked = 0;
-
- /* Get reference just to make sure buffer does not disappear
- * when we are forced to drop various locks */
- get_bh(bh);
- /* If the buffer is dirty, we need to submit IO and hence
- * we need the buffer lock. We try to lock the buffer without
- * blocking. If we fail, we need to drop j_list_lock and do
- * blocking lock_buffer().
- */
- if (buffer_dirty(bh)) {
- if (test_set_buffer_locked(bh)) {
- BUFFER_TRACE(bh, "needs blocking lock");
- spin_unlock(&journal->j_list_lock);
- /* Write out all data to prevent deadlocks */
- journal_do_submit_data(wbuf, bufs);
- bufs = 0;
- lock_buffer(bh);
- spin_lock(&journal->j_list_lock);
- }
- locked = 1;
- }
- /* We have to get bh_state lock. Again out of order, sigh. */
- if (!inverted_lock(journal, bh)) {
- jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
- }
- /* Someone already cleaned up the buffer? */
- if (!buffer_jbd(bh)
- || jh->b_transaction != commit_transaction
- || jh->b_jlist != BJ_SyncData) {
- jbd_unlock_bh_state(bh);
- if (locked)
- unlock_buffer(bh);
- BUFFER_TRACE(bh, "already cleaned up");
- put_bh(bh);
- continue;
- }
- if (locked && test_clear_buffer_dirty(bh)) {
- BUFFER_TRACE(bh, "needs writeout, adding to array");
- wbuf[bufs++] = bh;
- __jbd2_journal_file_buffer(jh, commit_transaction,
- BJ_Locked);
- jbd_unlock_bh_state(bh);
- if (bufs == journal->j_wbufsize) {
- spin_unlock(&journal->j_list_lock);
- journal_do_submit_data(wbuf, bufs);
- bufs = 0;
- goto write_out_data;
- }
- } else if (!locked && buffer_locked(bh)) {
- __jbd2_journal_file_buffer(jh, commit_transaction,
- BJ_Locked);
- jbd_unlock_bh_state(bh);
- put_bh(bh);
+ /* Now refile inode to proper lists */
+ list_for_each_entry_safe(jinode, next_i,
+ &commit_transaction->t_inode_list, i_list) {
+ list_del(&jinode->i_list);
+ if (jinode->i_next_transaction) {
+ jinode->i_transaction = jinode->i_next_transaction;
+ jinode->i_next_transaction = NULL;
+ list_add(&jinode->i_list,
+ &jinode->i_transaction->t_inode_list);
} else {
- BUFFER_TRACE(bh, "writeout complete: unfile");
- __jbd2_journal_unfile_buffer(jh);
- jbd_unlock_bh_state(bh);
- if (locked)
- unlock_buffer(bh);
- jbd2_journal_remove_journal_head(bh);
- /* Once for our safety reference, once for
- * jbd2_journal_remove_journal_head() */
- put_bh(bh);
- put_bh(bh);
- }
-
- if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
- spin_unlock(&journal->j_list_lock);
- goto write_out_data;
+ jinode->i_transaction = NULL;
}
}
spin_unlock(&journal->j_list_lock);
- journal_do_submit_data(wbuf, bufs);
+
+ return ret;
}
static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
@@ -524,21 +459,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
* Now start flushing things to disk, in the order they appear
* on the transaction lists. Data blocks go first.
*/
- err = 0;
- journal_submit_data_buffers(journal, commit_transaction);
-
- /*
- * Wait for all previously submitted IO to complete if commit
- * record is to be written synchronously.
- */
- spin_lock(&journal->j_list_lock);
- if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
- err = journal_wait_on_locked_list(journal,
- commit_transaction);
-
- spin_unlock(&journal->j_list_lock);
-
+ err = journal_submit_data_buffers(journal, commit_transaction);
if (err)
jbd2_journal_abort(journal, err);
@@ -547,16 +468,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
jbd_debug(3, "JBD: commit phase 2\n");
/*
- * If we found any dirty or locked buffers, then we should have
- * looped back up to the write_out_data label. If there weren't
- * any then journal_clean_data_list should have wiped the list
- * clean by now, so check that it is in fact empty.
- */
- J_ASSERT (commit_transaction->t_sync_datalist == NULL);
-
- jbd_debug (3, "JBD: commit phase 3\n");
-
- /*
* Way to go: we have now written out all of the data for a
* transaction! Now comes the tricky part: we need to write out
* metadata. Loop over the transaction's entire buffer list:
@@ -574,6 +485,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
J_ASSERT(commit_transaction->t_nr_buffers <=
commit_transaction->t_outstanding_credits);
+ err = 0;
descriptor = NULL;
bufs = 0;
while (commit_transaction->t_buffers) {
@@ -748,15 +660,19 @@ start_journal_io:
&cbh, crc32_sum);
if (err)
__jbd2_journal_abort_hard(journal);
-
- spin_lock(&journal->j_list_lock);
- err = journal_wait_on_locked_list(journal,
- commit_transaction);
- spin_unlock(&journal->j_list_lock);
- if (err)
- __jbd2_journal_abort_hard(journal);
}
+ /*
+ * This is the right place to wait for data buffers both for ASYNC
+ * and !ASYNC commit. If commit is ASYNC, we need to wait only after
+ * the commit block went to disk (which happens above). If commit is
+ * SYNC, we need to wait for data buffers before we start writing
+ * commit block, which happens below in such setting.
+ */
+ err = journal_finish_inode_data_buffers(journal, commit_transaction);
+ if (err)
+ jbd2_journal_abort(journal, err);
+
/* Lo and behold: we have just managed to send a transaction to
the log. Before we can commit it, wait for the IO so far to
complete. Control buffers being written are on the
@@ -768,7 +684,7 @@ start_journal_io:
so we incur less scheduling load.
*/
- jbd_debug(3, "JBD: commit phase 4\n");
+ jbd_debug(3, "JBD: commit phase 3\n");
/*
* akpm: these are BJ_IO, and j_list_lock is not needed.
@@ -827,7 +743,7 @@ wait_for_iobuf:
J_ASSERT (commit_transaction->t_shadow_list == NULL);
- jbd_debug(3, "JBD: commit phase 5\n");
+ jbd_debug(3, "JBD: commit phase 4\n");
/* Here we wait for the revoke record and descriptor record buffers */
wait_for_ctlbuf:
@@ -854,7 +770,7 @@ wait_for_iobuf:
/* AKPM: bforget here */
}
- jbd_debug(3, "JBD: commit phase 6\n");
+ jbd_debug(3, "JBD: commit phase 5\n");
if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
@@ -874,9 +790,9 @@ wait_for_iobuf:
transaction can be removed from any checkpoint list it was on
before. */
- jbd_debug(3, "JBD: commit phase 7\n");
+ jbd_debug(3, "JBD: commit phase 6\n");
- J_ASSERT(commit_transaction->t_sync_datalist == NULL);
+ J_ASSERT(list_empty(&commit_transaction->t_inode_list));
J_ASSERT(commit_transaction->t_buffers == NULL);
J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
J_ASSERT(commit_transaction->t_iobuf_list == NULL);
@@ -997,7 +913,7 @@ restart_loop:
/* Done with this transaction! */
- jbd_debug(3, "JBD: commit phase 8\n");
+ jbd_debug(3, "JBD: commit phase 7\n");
J_ASSERT(commit_transaction->t_state == T_COMMIT);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 2e24567c4a79..b26c6d9fe6ae 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -50,7 +50,6 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates);
EXPORT_SYMBOL(jbd2_journal_get_write_access);
EXPORT_SYMBOL(jbd2_journal_get_create_access);
EXPORT_SYMBOL(jbd2_journal_get_undo_access);
-EXPORT_SYMBOL(jbd2_journal_dirty_data);
EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
EXPORT_SYMBOL(jbd2_journal_release_buffer);
EXPORT_SYMBOL(jbd2_journal_forget);
@@ -82,6 +81,10 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
EXPORT_SYMBOL(jbd2_journal_invalidatepage);
EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
EXPORT_SYMBOL(jbd2_journal_force_commit);
+EXPORT_SYMBOL(jbd2_journal_file_inode);
+EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
+EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
+EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
static void __journal_abort_soft (journal_t *journal, int errno);
@@ -2195,6 +2198,54 @@ void jbd2_journal_put_journal_head(struct journal_head *jh)
}
/*
+ * Initialize jbd inode head
+ */
+void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
+{
+ jinode->i_transaction = NULL;
+ jinode->i_next_transaction = NULL;
+ jinode->i_vfs_inode = inode;
+ jinode->i_flags = 0;
+ INIT_LIST_HEAD(&jinode->i_list);
+}
+
+/*
+ * Function to be called before we start removing inode from memory (i.e.,
+ * clear_inode() is a fine place to be called from). It removes inode from
+ * transaction's lists.
+ */
+void jbd2_journal_release_jbd_inode(journal_t *journal,
+ struct jbd2_inode *jinode)
+{
+ int writeout = 0;
+
+ if (!journal)
+ return;
+restart:
+ spin_lock(&journal->j_list_lock);
+ /* Is commit writing out inode - we have to wait */
+ if (jinode->i_flags & JI_COMMIT_RUNNING) {
+ wait_queue_head_t *wq;
+ DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
+ wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
+ prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&journal->j_list_lock);
+ schedule();
+ finish_wait(wq, &wait.wait);
+ goto restart;
+ }
+
+ /* Do we need to wait for data writeback? */
+ if (journal->j_committing_transaction == jinode->i_transaction)
+ writeout = 1;
+ if (jinode->i_transaction) {
+ list_del(&jinode->i_list);
+ jinode->i_transaction = NULL;
+ }
+ spin_unlock(&journal->j_list_lock);
+}
+
+/*
* debugfs tunables
*/
#ifdef CONFIG_JBD2_DEBUG
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index d6e006e67804..4f7cadbb19fa 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -41,7 +41,6 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
* new transaction and we can't block without protecting against other
* processes trying to touch the journal while it is in transition.
*
- * Called under j_state_lock
*/
static transaction_t *
@@ -52,6 +51,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
transaction->t_tid = journal->j_transaction_sequence++;
transaction->t_expires = jiffies + journal->j_commit_interval;
spin_lock_init(&transaction->t_handle_lock);
+ INIT_LIST_HEAD(&transaction->t_inode_list);
/* Set up the commit timer for the new transaction. */
journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
@@ -943,183 +943,6 @@ out:
}
/**
- * int jbd2_journal_dirty_data() - mark a buffer as containing dirty data which
- * needs to be flushed before we can commit the
- * current transaction.
- * @handle: transaction
- * @bh: bufferhead to mark
- *
- * The buffer is placed on the transaction's data list and is marked as
- * belonging to the transaction.
- *
- * Returns error number or 0 on success.
- *
- * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage
- * by kswapd.
- */
-int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
-{
- journal_t *journal = handle->h_transaction->t_journal;
- int need_brelse = 0;
- struct journal_head *jh;
-
- if (is_handle_aborted(handle))
- return 0;
-
- jh = jbd2_journal_add_journal_head(bh);
- JBUFFER_TRACE(jh, "entry");
-
- /*
- * The buffer could *already* be dirty. Writeout can start
- * at any time.
- */
- jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
-
- /*
- * What if the buffer is already part of a running transaction?
- *
- * There are two cases:
- * 1) It is part of the current running transaction. Refile it,
- * just in case we have allocated it as metadata, deallocated
- * it, then reallocated it as data.
- * 2) It is part of the previous, still-committing transaction.
- * If all we want to do is to guarantee that the buffer will be
- * written to disk before this new transaction commits, then
- * being sure that the *previous* transaction has this same
- * property is sufficient for us! Just leave it on its old
- * transaction.
- *
- * In case (2), the buffer must not already exist as metadata
- * --- that would violate write ordering (a transaction is free
- * to write its data at any point, even before the previous
- * committing transaction has committed). The caller must
- * never, ever allow this to happen: there's nothing we can do
- * about it in this layer.
- */
- jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
-
- /* Now that we have bh_state locked, are we really still mapped? */
- if (!buffer_mapped(bh)) {
- JBUFFER_TRACE(jh, "unmapped buffer, bailing out");
- goto no_journal;
- }
-
- if (jh->b_transaction) {
- JBUFFER_TRACE(jh, "has transaction");
- if (jh->b_transaction != handle->h_transaction) {
- JBUFFER_TRACE(jh, "belongs to older transaction");
- J_ASSERT_JH(jh, jh->b_transaction ==
- journal->j_committing_transaction);
-
- /* @@@ IS THIS TRUE ? */
- /*
- * Not any more. Scenario: someone does a write()
- * in data=journal mode. The buffer's transaction has
- * moved into commit. Then someone does another
- * write() to the file. We do the frozen data copyout
- * and set b_next_transaction to point to j_running_t.
- * And while we're in that state, someone does a
- * writepage() in an attempt to pageout the same area
- * of the file via a shared mapping. At present that
- * calls jbd2_journal_dirty_data(), and we get right here.
- * It may be too late to journal the data. Simply
- * falling through to the next test will suffice: the
- * data will be dirty and wil be checkpointed. The
- * ordering comments in the next comment block still
- * apply.
- */
- //J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-
- /*
- * If we're journalling data, and this buffer was
- * subject to a write(), it could be metadata, forget
- * or shadow against the committing transaction. Now,
- * someone has dirtied the same darn page via a mapping
- * and it is being writepage()'d.
- * We *could* just steal the page from commit, with some
- * fancy locking there. Instead, we just skip it -
- * don't tie the page's buffers to the new transaction
- * at all.
- * Implication: if we crash before the writepage() data
- * is written into the filesystem, recovery will replay
- * the write() data.
- */
- if (jh->b_jlist != BJ_None &&
- jh->b_jlist != BJ_SyncData &&
- jh->b_jlist != BJ_Locked) {
- JBUFFER_TRACE(jh, "Not stealing");
- goto no_journal;
- }
-
- /*
- * This buffer may be undergoing writeout in commit. We
- * can't return from here and let the caller dirty it
- * again because that can cause the write-out loop in
- * commit to never terminate.
- */
- if (buffer_dirty(bh)) {
- get_bh(bh);
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- need_brelse = 1;
- sync_dirty_buffer(bh);
- jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
- /* Since we dropped the lock... */
- if (!buffer_mapped(bh)) {
- JBUFFER_TRACE(jh, "buffer got unmapped");
- goto no_journal;
- }
- /* The buffer may become locked again at any
- time if it is redirtied */
- }
-
- /* journal_clean_data_list() may have got there first */
- if (jh->b_transaction != NULL) {
- JBUFFER_TRACE(jh, "unfile from commit");
- __jbd2_journal_temp_unlink_buffer(jh);
- /* It still points to the committing
- * transaction; move it to this one so
- * that the refile assert checks are
- * happy. */
- jh->b_transaction = handle->h_transaction;
- }
- /* The buffer will be refiled below */
-
- }
- /*
- * Special case --- the buffer might actually have been
- * allocated and then immediately deallocated in the previous,
- * committing transaction, so might still be left on that
- * transaction's metadata lists.
- */
- if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
- JBUFFER_TRACE(jh, "not on correct data list: unfile");
- J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
- __jbd2_journal_temp_unlink_buffer(jh);
- jh->b_transaction = handle->h_transaction;
- JBUFFER_TRACE(jh, "file as data");
- __jbd2_journal_file_buffer(jh, handle->h_transaction,
- BJ_SyncData);
- }
- } else {
- JBUFFER_TRACE(jh, "not on a transaction");
- __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
- }
-no_journal:
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- if (need_brelse) {
- BUFFER_TRACE(bh, "brelse");
- __brelse(bh);
- }
- JBUFFER_TRACE(jh, "exit");
- jbd2_journal_put_journal_head(jh);
- return 0;
-}
-
-/**
* int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
* @handle: transaction to add buffer to.
* @bh: buffer to mark
@@ -1541,10 +1364,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
* Remove a buffer from the appropriate transaction list.
*
* Note that this function can *change* the value of
- * bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
- * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller
- * is holding onto a copy of one of thee pointers, it could go bad.
- * Generally the caller needs to re-read the pointer from the transaction_t.
+ * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list,
+ * t_log_list or t_reserved_list. If the caller is holding onto a copy of one
+ * of these pointers, it could go bad. Generally the caller needs to re-read
+ * the pointer from the transaction_t.
*
* Called under j_list_lock. The journal may not be locked.
*/
@@ -1566,9 +1389,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
switch (jh->b_jlist) {
case BJ_None:
return;
- case BJ_SyncData:
- list = &transaction->t_sync_datalist;
- break;
case BJ_Metadata:
transaction->t_nr_buffers--;
J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
@@ -1589,9 +1409,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
case BJ_Reserved:
list = &transaction->t_reserved_list;
break;
- case BJ_Locked:
- list = &transaction->t_locked_list;
- break;
}
__blist_del_buffer(list, jh);
@@ -1634,15 +1451,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
goto out;
spin_lock(&journal->j_list_lock);
- if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) {
- if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
- /* A written-back ordered data buffer */
- JBUFFER_TRACE(jh, "release data");
- __jbd2_journal_unfile_buffer(jh);
- jbd2_journal_remove_journal_head(bh);
- __brelse(bh);
- }
- } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
+ if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
/* written-back checkpointed metadata buffer */
if (jh->b_jlist == BJ_None) {
JBUFFER_TRACE(jh, "remove from checkpoint list");
@@ -1656,12 +1465,43 @@ out:
return;
}
+/*
+ * jbd2_journal_try_to_free_buffers() could race with
+ * jbd2_journal_commit_transaction(). The latter might still hold the
+ * reference count to the buffers when inspecting them on
+ * t_syncdata_list or t_locked_list.
+ *
+ * jbd2_journal_try_to_free_buffers() will call this function to
+ * wait for the current transaction to finish syncing data buffers, before
+ * trying to free that buffer.
+ *
+ * Takes journal->j_state_lock itself, so the caller must not hold it.
+ */
+static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal)
+{
+ transaction_t *transaction;
+ tid_t tid;
+
+ spin_lock(&journal->j_state_lock);
+ transaction = journal->j_committing_transaction;
+
+ if (!transaction) {
+ spin_unlock(&journal->j_state_lock);
+ return;
+ }
+
+ tid = transaction->t_tid;
+ spin_unlock(&journal->j_state_lock);
+ jbd2_log_wait_commit(journal, tid);
+}
/**
* int jbd2_journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
* @page: to try and free
- * @unused_gfp_mask: unused
+ * @gfp_mask: we use the mask to detect how hard we should try to release
+ * buffers. If both __GFP_WAIT and __GFP_FS are set, we wait for commit code
+ * to release the buffers.
*
*
* For all the buffers on this page,
@@ -1690,9 +1530,11 @@ out:
* journal_try_to_free_buffer() is changing its state. But that
* cannot happen because we never reallocate freed data as metadata
* while the data is part of a transaction. Yes?
+ *
+ * Return 0 on failure, 1 on success
*/
int jbd2_journal_try_to_free_buffers(journal_t *journal,
- struct page *page, gfp_t unused_gfp_mask)
+ struct page *page, gfp_t gfp_mask)
{
struct buffer_head *head;
struct buffer_head *bh;
@@ -1708,7 +1550,8 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
/*
* We take our own ref against the journal_head here to avoid
* having to add tons of locking around each instance of
- * jbd2_journal_remove_journal_head() and jbd2_journal_put_journal_head().
+ * jbd2_journal_remove_journal_head() and
+ * jbd2_journal_put_journal_head().
*/
jh = jbd2_journal_grab_journal_head(bh);
if (!jh)
@@ -1721,7 +1564,28 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
if (buffer_jbd(bh))
goto busy;
} while ((bh = bh->b_this_page) != head);
+
ret = try_to_free_buffers(page);
+
+ /*
+ * There are a number of places where jbd2_journal_try_to_free_buffers()
+ * could race with jbd2_journal_commit_transaction(); the latter still
+ * holds a reference to the buffers to free while processing them, so
+ * try_to_free_buffers() fails to free those buffers. Some callers of
+ * releasepage() need the page buffers to be dropped and otherwise treat
+ * the failure to free as an error (such as generic_file_direct_IO()).
+ *
+ * So, if the caller of try_to_release_page() wants the synchronous
+ * behaviour (i.e. make sure buffers are dropped upon return),
+ * let's wait for the current transaction to finish flush of
+ * dirty data buffers, then try to free those buffers again,
+ * with the journal locked.
+ */
+ if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
+ jbd2_journal_wait_for_transaction_sync_data(journal);
+ ret = try_to_free_buffers(page);
+ }
+
busy:
return ret;
}
@@ -1823,6 +1687,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
if (!buffer_jbd(bh))
goto zap_buffer_unlocked;
+ /* OK, we have data buffer in journaled mode */
spin_lock(&journal->j_state_lock);
jbd_lock_bh_state(bh);
spin_lock(&journal->j_list_lock);
@@ -1886,15 +1751,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
}
} else if (transaction == journal->j_committing_transaction) {
JBUFFER_TRACE(jh, "on committing transaction");
- if (jh->b_jlist == BJ_Locked) {
- /*
- * The buffer is on the committing transaction's locked
- * list. We have the buffer locked, so I/O has
- * completed. So we can nail the buffer now.
- */
- may_free = __dispose_buffer(jh, transaction);
- goto zap_buffer;
- }
/*
* If it is committing, we simply cannot touch it. We
* can remove it's next_transaction pointer from the
@@ -2027,9 +1883,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
J_ASSERT_JH(jh, !jh->b_committed_data);
J_ASSERT_JH(jh, !jh->b_frozen_data);
return;
- case BJ_SyncData:
- list = &transaction->t_sync_datalist;
- break;
case BJ_Metadata:
transaction->t_nr_buffers++;
list = &transaction->t_buffers;
@@ -2049,9 +1902,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
case BJ_Reserved:
list = &transaction->t_reserved_list;
break;
- case BJ_Locked:
- list = &transaction->t_locked_list;
- break;
}
__blist_add_buffer(list, jh);
@@ -2141,3 +1991,88 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
spin_unlock(&journal->j_list_lock);
__brelse(bh);
}
+
+/*
+ * File inode in the inode list of the handle's transaction
+ */
+int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
+{
+ transaction_t *transaction = handle->h_transaction;
+ journal_t *journal = transaction->t_journal;
+
+ if (is_handle_aborted(handle))
+ return -EIO;
+
+ jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
+ transaction->t_tid);
+
+ /*
+ * First check whether inode isn't already on the transaction's
+ * lists without taking the lock. Note that this check is safe
+ * without the lock as we cannot race with somebody removing inode
+ * from the transaction. The reason is that we remove inode from the
+ * transaction only in jbd2_journal_release_jbd_inode() and when we commit
+ * the transaction. We are guarded from the first case by holding
+ * a reference to the inode. We are safe against the second case
+ * because if jinode->i_transaction == transaction, commit code
+ * cannot touch the transaction because we hold reference to it,
+ * and if jinode->i_next_transaction == transaction, commit code
+ * will only file the inode where we want it.
+ */
+ if (jinode->i_transaction == transaction ||
+ jinode->i_next_transaction == transaction)
+ return 0;
+
+ spin_lock(&journal->j_list_lock);
+
+ if (jinode->i_transaction == transaction ||
+ jinode->i_next_transaction == transaction)
+ goto done;
+
+ /* On some different transaction's list - should be
+ * the committing one */
+ if (jinode->i_transaction) {
+ J_ASSERT(jinode->i_next_transaction == NULL);
+ J_ASSERT(jinode->i_transaction ==
+ journal->j_committing_transaction);
+ jinode->i_next_transaction = transaction;
+ goto done;
+ }
+ /* Not on any transaction list... */
+ J_ASSERT(!jinode->i_next_transaction);
+ jinode->i_transaction = transaction;
+ list_add(&jinode->i_list, &transaction->t_inode_list);
+done:
+ spin_unlock(&journal->j_list_lock);
+
+ return 0;
+}
+
+/*
+ * This function must be called when inode is journaled in ordered mode
+ * before truncation happens. It starts writeout of truncated part in
+ * case it is in the committing transaction so that we uphold the ordered
+ * mode consistency guarantees.
+ */
+int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
+ loff_t new_size)
+{
+ journal_t *journal;
+ transaction_t *commit_trans;
+ int ret = 0;
+
+ if (!inode->i_transaction && !inode->i_next_transaction)
+ goto out;
+ journal = inode->i_transaction->t_journal;
+ spin_lock(&journal->j_state_lock);
+ commit_trans = journal->j_committing_transaction;
+ spin_unlock(&journal->j_state_lock);
+ if (inode->i_transaction == commit_trans) {
+ ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
+ new_size, LLONG_MAX);
+ if (ret)
+ jbd2_journal_abort(journal, ret);
+ }
+out:
+ return ret;
+}
diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c
index bf6ab19b86ee..6a73de84bcef 100644
--- a/fs/jfs/jfs_debug.c
+++ b/fs/jfs/jfs_debug.c
@@ -21,6 +21,7 @@
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
#include <asm/uaccess.h>
#include "jfs_incore.h"
#include "jfs_filsys.h"
@@ -30,29 +31,19 @@
static struct proc_dir_entry *base;
#ifdef CONFIG_JFS_DEBUG
-static int loglevel_read(char *page, char **start, off_t off,
- int count, int *eof, void *data)
+static int jfs_loglevel_proc_show(struct seq_file *m, void *v)
{
- int len;
-
- len = sprintf(page, "%d\n", jfsloglevel);
-
- len -= off;
- *start = page + off;
-
- if (len > count)
- len = count;
- else
- *eof = 1;
-
- if (len < 0)
- len = 0;
+ seq_printf(m, "%d\n", jfsloglevel);
+ return 0;
+}
- return len;
+static int jfs_loglevel_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, jfs_loglevel_proc_show, NULL);
}
-static int loglevel_write(struct file *file, const char __user *buffer,
- unsigned long count, void *data)
+static ssize_t jfs_loglevel_proc_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *ppos)
{
char c;
@@ -65,22 +56,30 @@ static int loglevel_write(struct file *file, const char __user *buffer,
jfsloglevel = c - '0';
return count;
}
+
+static const struct file_operations jfs_loglevel_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = jfs_loglevel_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = jfs_loglevel_proc_write,
+};
#endif
static struct {
const char *name;
- read_proc_t *read_fn;
- write_proc_t *write_fn;
+ const struct file_operations *proc_fops;
} Entries[] = {
#ifdef CONFIG_JFS_STATISTICS
- { "lmstats", jfs_lmstats_read, },
- { "txstats", jfs_txstats_read, },
- { "xtstat", jfs_xtstat_read, },
- { "mpstat", jfs_mpstat_read, },
+ { "lmstats", &jfs_lmstats_proc_fops, },
+ { "txstats", &jfs_txstats_proc_fops, },
+ { "xtstat", &jfs_xtstat_proc_fops, },
+ { "mpstat", &jfs_mpstat_proc_fops, },
#endif
#ifdef CONFIG_JFS_DEBUG
- { "TxAnchor", jfs_txanchor_read, },
- { "loglevel", loglevel_read, loglevel_write }
+ { "TxAnchor", &jfs_txanchor_proc_fops, },
+ { "loglevel", &jfs_loglevel_proc_fops }
#endif
};
#define NPROCENT ARRAY_SIZE(Entries)
@@ -93,13 +92,8 @@ void jfs_proc_init(void)
return;
base->owner = THIS_MODULE;
- for (i = 0; i < NPROCENT; i++) {
- struct proc_dir_entry *p;
- if ((p = create_proc_entry(Entries[i].name, 0, base))) {
- p->read_proc = Entries[i].read_fn;
- p->write_proc = Entries[i].write_fn;
- }
- }
+ for (i = 0; i < NPROCENT; i++)
+ proc_create(Entries[i].name, 0, base, Entries[i].proc_fops);
}
void jfs_proc_clean(void)
diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h
index 044c1e654cc0..eafd1300a00b 100644
--- a/fs/jfs/jfs_debug.h
+++ b/fs/jfs/jfs_debug.h
@@ -62,7 +62,7 @@ extern void jfs_proc_clean(void);
extern int jfsloglevel;
-extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *);
+extern const struct file_operations jfs_txanchor_proc_fops;
/* information message: e.g., configuration, major event */
#define jfs_info(fmt, arg...) do { \
@@ -105,10 +105,10 @@ extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *);
* ----------
*/
#ifdef CONFIG_JFS_STATISTICS
-extern int jfs_lmstats_read(char *, char **, off_t, int, int *, void *);
-extern int jfs_txstats_read(char *, char **, off_t, int, int *, void *);
-extern int jfs_mpstat_read(char *, char **, off_t, int, int *, void *);
-extern int jfs_xtstat_read(char *, char **, off_t, int, int *, void *);
+extern const struct file_operations jfs_lmstats_proc_fops;
+extern const struct file_operations jfs_txstats_proc_fops;
+extern const struct file_operations jfs_mpstat_proc_fops;
+extern const struct file_operations jfs_xtstat_proc_fops;
#define INCREMENT(x) ((x)++)
#define DECREMENT(x) ((x)--)
diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h
index cdac2d5bafeb..2545bb317235 100644
--- a/fs/jfs/jfs_dtree.h
+++ b/fs/jfs/jfs_dtree.h
@@ -243,9 +243,6 @@ typedef union {
#define JFS_REMOVE 3
#define JFS_RENAME 4
-#define DIRENTSIZ(namlen) \
- ( (sizeof(struct dirent) - 2*(JFS_NAME_MAX+1) + 2*((namlen)+1) + 3) &~ 3 )
-
/*
* Maximum file offset for directories.
*/
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 734ec916beaf..d6363d8309d0 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -1520,7 +1520,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
jfs_error(ip->i_sb,
"diAlloc: can't find free bit "
"in wmap");
- return EIO;
+ return -EIO;
}
/* determine the inode number within the
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 325a9679b95a..cd2ec2988b59 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -69,6 +69,7 @@
#include <linux/freezer.h>
#include <linux/delay.h>
#include <linux/mutex.h>
+#include <linux/seq_file.h>
#include "jfs_incore.h"
#include "jfs_filsys.h"
#include "jfs_metapage.h"
@@ -2503,13 +2504,9 @@ exit:
}
#ifdef CONFIG_JFS_STATISTICS
-int jfs_lmstats_read(char *buffer, char **start, off_t offset, int length,
- int *eof, void *data)
+static int jfs_lmstats_proc_show(struct seq_file *m, void *v)
{
- int len = 0;
- off_t begin;
-
- len += sprintf(buffer,
+ seq_printf(m,
"JFS Logmgr stats\n"
"================\n"
"commits = %d\n"
@@ -2522,19 +2519,19 @@ int jfs_lmstats_read(char *buffer, char **start, off_t offset, int length,
lmStat.pagedone,
lmStat.full_page,
lmStat.partial_page);
+ return 0;
+}
- begin = offset;
- *start = buffer + begin;
- len -= begin;
-
- if (len > length)
- len = length;
- else
- *eof = 1;
-
- if (len < 0)
- len = 0;
-
- return len;
+static int jfs_lmstats_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, jfs_lmstats_proc_show, NULL);
}
+
+const struct file_operations jfs_lmstats_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = jfs_lmstats_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
#endif /* CONFIG_JFS_STATISTICS */
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index d1e64f2f2fcd..854ff0ec574f 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -19,10 +19,12 @@
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/module.h>
#include <linux/bio.h>
#include <linux/init.h>
#include <linux/buffer_head.h>
#include <linux/mempool.h>
+#include <linux/seq_file.h>
#include "jfs_incore.h"
#include "jfs_superblock.h"
#include "jfs_filsys.h"
@@ -804,13 +806,9 @@ void __invalidate_metapages(struct inode *ip, s64 addr, int len)
}
#ifdef CONFIG_JFS_STATISTICS
-int jfs_mpstat_read(char *buffer, char **start, off_t offset, int length,
- int *eof, void *data)
+static int jfs_mpstat_proc_show(struct seq_file *m, void *v)
{
- int len = 0;
- off_t begin;
-
- len += sprintf(buffer,
+ seq_printf(m,
"JFS Metapage statistics\n"
"=======================\n"
"page allocations = %d\n"
@@ -819,19 +817,19 @@ int jfs_mpstat_read(char *buffer, char **start, off_t offset, int length,
mpStat.pagealloc,
mpStat.pagefree,
mpStat.lockwait);
+ return 0;
+}
- begin = offset;
- *start = buffer + begin;
- len -= begin;
-
- if (len > length)
- len = length;
- else
- *eof = 1;
-
- if (len < 0)
- len = 0;
-
- return len;
+static int jfs_mpstat_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, jfs_mpstat_proc_show, NULL);
}
+
+const struct file_operations jfs_mpstat_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = jfs_mpstat_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
#endif
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index e7c60ae6b5b2..f26e4d03ada5 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -49,6 +49,7 @@
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/kthread.h>
+#include <linux/seq_file.h>
#include "jfs_incore.h"
#include "jfs_inode.h"
#include "jfs_filsys.h"
@@ -3009,11 +3010,8 @@ int jfs_sync(void *arg)
}
#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_DEBUG)
-int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length,
- int *eof, void *data)
+static int jfs_txanchor_proc_show(struct seq_file *m, void *v)
{
- int len = 0;
- off_t begin;
char *freewait;
char *freelockwait;
char *lowlockwait;
@@ -3025,7 +3023,7 @@ int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length,
lowlockwait =
waitqueue_active(&TxAnchor.lowlockwait) ? "active" : "empty";
- len += sprintf(buffer,
+ seq_printf(m,
"JFS TxAnchor\n"
"============\n"
"freetid = %d\n"
@@ -3044,31 +3042,27 @@ int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length,
TxAnchor.tlocksInUse,
jfs_tlocks_low,
list_empty(&TxAnchor.unlock_queue) ? "" : "not ");
+ return 0;
+}
- begin = offset;
- *start = buffer + begin;
- len -= begin;
-
- if (len > length)
- len = length;
- else
- *eof = 1;
-
- if (len < 0)
- len = 0;
-
- return len;
+static int jfs_txanchor_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, jfs_txanchor_proc_show, NULL);
}
+
+const struct file_operations jfs_txanchor_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = jfs_txanchor_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
#endif
#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_STATISTICS)
-int jfs_txstats_read(char *buffer, char **start, off_t offset, int length,
- int *eof, void *data)
+static int jfs_txstats_proc_show(struct seq_file *m, void *v)
{
- int len = 0;
- off_t begin;
-
- len += sprintf(buffer,
+ seq_printf(m,
"JFS TxStats\n"
"===========\n"
"calls to txBegin = %d\n"
@@ -3089,19 +3083,19 @@ int jfs_txstats_read(char *buffer, char **start, off_t offset, int length,
TxStat.txBeginAnon_lockslow,
TxStat.txLockAlloc,
TxStat.txLockAlloc_freelock);
+ return 0;
+}
- begin = offset;
- *start = buffer + begin;
- len -= begin;
-
- if (len > length)
- len = length;
- else
- *eof = 1;
-
- if (len < 0)
- len = 0;
-
- return len;
+static int jfs_txstats_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, jfs_txstats_proc_show, NULL);
}
+
+const struct file_operations jfs_txstats_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = jfs_txstats_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
#endif
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index 5a61ebf2cbcc..ae3acafb447b 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -20,7 +20,9 @@
*/
#include <linux/fs.h>
+#include <linux/module.h>
#include <linux/quotaops.h>
+#include <linux/seq_file.h>
#include "jfs_incore.h"
#include "jfs_filsys.h"
#include "jfs_metapage.h"
@@ -4134,13 +4136,9 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
}
#ifdef CONFIG_JFS_STATISTICS
-int jfs_xtstat_read(char *buffer, char **start, off_t offset, int length,
- int *eof, void *data)
+static int jfs_xtstat_proc_show(struct seq_file *m, void *v)
{
- int len = 0;
- off_t begin;
-
- len += sprintf(buffer,
+ seq_printf(m,
"JFS Xtree statistics\n"
"====================\n"
"searches = %d\n"
@@ -4149,19 +4147,19 @@ int jfs_xtstat_read(char *buffer, char **start, off_t offset, int length,
xtStat.search,
xtStat.fastSearch,
xtStat.split);
+ return 0;
+}
- begin = offset;
- *start = buffer + begin;
- len -= begin;
-
- if (len > length)
- len = length;
- else
- *eof = 1;
-
- if (len < 0)
- len = 0;
-
- return len;
+static int jfs_xtstat_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, jfs_xtstat_proc_show, NULL);
}
+
+const struct file_operations jfs_xtstat_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = jfs_xtstat_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
#endif
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 0ba6778edaa2..2aba82386810 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1455,7 +1455,7 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
free_UCSname(&key);
if (rc == -ENOENT) {
d_add(dentry, NULL);
- return ERR_PTR(0);
+ return NULL;
} else if (rc) {
jfs_err("jfs_lookup: dtSearch returned %d", rc);
return ERR_PTR(rc);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 50ea65451732..0288e6d7936a 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -499,7 +499,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
inode = jfs_iget(sb, ROOT_I);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
- goto out_no_root;
+ goto out_no_rw;
}
sb->s_root = d_alloc_root(inode);
if (!sb->s_root)
@@ -521,9 +521,8 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
return 0;
out_no_root:
- jfs_err("jfs_read_super: get root inode failed");
- if (inode)
- iput(inode);
+ jfs_err("jfs_read_super: get root dentry failed");
+ iput(inode);
out_no_rw:
rc = jfs_umount(sb);
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 5df517b81f3f..1f6dc518505c 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -224,7 +224,9 @@ void nlm_release_call(struct nlm_rqst *call)
static void nlmclnt_rpc_release(void *data)
{
+ lock_kernel();
nlm_release_call(data);
+ unlock_kernel();
}
static int nlm_wait_on_grace(wait_queue_head_t *queue)
@@ -430,7 +432,7 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl)
* Report the conflicting lock back to the application.
*/
fl->fl_start = req->a_res.lock.fl.fl_start;
- fl->fl_end = req->a_res.lock.fl.fl_start;
+ fl->fl_end = req->a_res.lock.fl.fl_end;
fl->fl_type = req->a_res.lock.fl.fl_type;
fl->fl_pid = 0;
break;
@@ -710,7 +712,9 @@ static void nlmclnt_unlock_callback(struct rpc_task *task, void *data)
die:
return;
retry_rebind:
+ lock_kernel();
nlm_rebind_host(req->a_host);
+ unlock_kernel();
retry_unlock:
rpc_restart_call(task);
}
@@ -788,7 +792,9 @@ retry_cancel:
/* Don't ever retry more than 3 times */
if (req->a_retries++ >= NLMCLNT_MAX_RETRIES)
goto die;
+ lock_kernel();
nlm_rebind_host(req->a_host);
+ unlock_kernel();
rpc_restart_call(task);
rpc_delay(task, 30 * HZ);
}
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 385437e3387d..2e27176ff42f 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -248,7 +248,9 @@ static void nlm4svc_callback_exit(struct rpc_task *task, void *data)
static void nlm4svc_callback_release(void *data)
{
+ lock_kernel();
nlm_release_call(data);
+ unlock_kernel();
}
static const struct rpc_call_ops nlm4svc_callback_ops = {
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 81aca859bfde..56a08ab9a4cb 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -795,6 +795,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
dprintk("lockd: GRANT_MSG RPC callback\n");
+ lock_kernel();
/* if the block is not on a list at this point then it has
* been invalidated. Don't try to requeue it.
*
@@ -804,7 +805,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
* for nlm_blocked?
*/
if (list_empty(&block->b_list))
- return;
+ goto out;
/* Technically, we should down the file semaphore here. Since we
* move the block towards the head of the queue only, no harm
@@ -818,13 +819,17 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
}
nlmsvc_insert_block(block, timeout);
svc_wake_up(block->b_daemon);
+out:
+ unlock_kernel();
}
static void nlmsvc_grant_release(void *data)
{
struct nlm_rqst *call = data;
+ lock_kernel();
nlmsvc_release_block(call->a_block);
+ unlock_kernel();
}
static const struct rpc_call_ops nlmsvc_grant_ops = {
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 88379cc6e0b1..ce6952b50a75 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -278,7 +278,9 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void *data)
static void nlmsvc_callback_release(void *data)
{
+ lock_kernel();
nlm_release_call(data);
+ unlock_kernel();
}
static const struct rpc_call_ops nlmsvc_callback_ops = {
diff --git a/fs/mpage.c b/fs/mpage.c
index 235e4d3873a8..dbcc7af76a15 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err)
bio_put(bio);
}
-static struct bio *mpage_bio_submit(int rw, struct bio *bio)
+struct bio *mpage_bio_submit(int rw, struct bio *bio)
{
bio->bi_end_io = mpage_end_io_read;
if (rw == WRITE)
@@ -90,6 +90,7 @@ static struct bio *mpage_bio_submit(int rw, struct bio *bio)
submit_bio(rw, bio);
return NULL;
}
+EXPORT_SYMBOL(mpage_bio_submit);
static struct bio *
mpage_alloc(struct block_device *bdev,
@@ -435,15 +436,9 @@ EXPORT_SYMBOL(mpage_readpage);
* written, so it can intelligently allocate a suitably-sized BIO. For now,
* just allocate full-size (16-page) BIOs.
*/
-struct mpage_data {
- struct bio *bio;
- sector_t last_block_in_bio;
- get_block_t *get_block;
- unsigned use_writepage;
-};
-static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
- void *data)
+int __mpage_writepage(struct page *page, struct writeback_control *wbc,
+ void *data)
{
struct mpage_data *mpd = data;
struct bio *bio = mpd->bio;
@@ -651,6 +646,7 @@ out:
mpd->bio = bio;
return ret;
}
+EXPORT_SYMBOL(__mpage_writepage);
/**
* mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c
index 05ff4f1d7026..1f7f2956412a 100644
--- a/fs/msdos/namei.c
+++ b/fs/msdos/namei.c
@@ -214,7 +214,7 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
dentry->d_op = &msdos_dentry_operations;
- lock_kernel();
+ lock_super(sb);
res = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
if (res == -ENOENT)
goto add;
@@ -232,7 +232,7 @@ add:
if (dentry)
dentry->d_op = &msdos_dentry_operations;
out:
- unlock_kernel();
+ unlock_super(sb);
if (!res)
return dentry;
return ERR_PTR(res);
@@ -286,7 +286,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode,
unsigned char msdos_name[MSDOS_NAME];
int err, is_hid;
- lock_kernel();
+ lock_super(sb);
err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
msdos_name, &MSDOS_SB(sb)->options);
@@ -315,7 +315,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode,
d_instantiate(dentry, inode);
out:
- unlock_kernel();
+ unlock_super(sb);
if (!err)
err = fat_flush_inodes(sb, dir, inode);
return err;
@@ -324,11 +324,12 @@ out:
/***** Remove a directory */
static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
{
+ struct super_block *sb = dir->i_sb;
struct inode *inode = dentry->d_inode;
struct fat_slot_info sinfo;
int err;
- lock_kernel();
+ lock_super(sb);
/*
* Check whether the directory is not in use, then check
* whether it is empty.
@@ -349,9 +350,9 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
inode->i_ctime = CURRENT_TIME_SEC;
fat_detach(inode);
out:
- unlock_kernel();
+ unlock_super(sb);
if (!err)
- err = fat_flush_inodes(inode->i_sb, dir, inode);
+ err = fat_flush_inodes(sb, dir, inode);
return err;
}
@@ -366,7 +367,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
struct timespec ts;
int err, is_hid, cluster;
- lock_kernel();
+ lock_super(sb);
err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
msdos_name, &MSDOS_SB(sb)->options);
@@ -404,14 +405,14 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
d_instantiate(dentry, inode);
- unlock_kernel();
+ unlock_super(sb);
fat_flush_inodes(sb, dir, inode);
return 0;
out_free:
fat_free_clusters(dir, cluster);
out:
- unlock_kernel();
+ unlock_super(sb);
return err;
}
@@ -419,10 +420,11 @@ out:
static int msdos_unlink(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
+ struct super_block *sb= inode->i_sb;
struct fat_slot_info sinfo;
int err;
- lock_kernel();
+ lock_super(sb);
err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
if (err)
goto out;
@@ -434,9 +436,9 @@ static int msdos_unlink(struct inode *dir, struct dentry *dentry)
inode->i_ctime = CURRENT_TIME_SEC;
fat_detach(inode);
out:
- unlock_kernel();
+ unlock_super(sb);
if (!err)
- err = fat_flush_inodes(inode->i_sb, dir, inode);
+ err = fat_flush_inodes(sb, dir, inode);
return err;
}
@@ -618,10 +620,11 @@ error_inode:
static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
+ struct super_block *sb = old_dir->i_sb;
unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME];
int err, is_hid;
- lock_kernel();
+ lock_super(sb);
err = msdos_format_name(old_dentry->d_name.name,
old_dentry->d_name.len, old_msdos_name,
@@ -640,9 +643,9 @@ static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
err = do_msdos_rename(old_dir, old_msdos_name, old_dentry,
new_dir, new_msdos_name, new_dentry, is_hid);
out:
- unlock_kernel();
+ unlock_super(sb);
if (!err)
- err = fat_flush_inodes(old_dir->i_sb, old_dir, new_dir);
+ err = fat_flush_inodes(sb, old_dir, new_dir);
return err;
}
diff --git a/fs/namespace.c b/fs/namespace.c
index 4fc302c2a0e0..4f6f7635b59c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -750,7 +750,7 @@ struct proc_fs_info {
const char *str;
};
-static void show_sb_opts(struct seq_file *m, struct super_block *sb)
+static int show_sb_opts(struct seq_file *m, struct super_block *sb)
{
static const struct proc_fs_info fs_info[] = {
{ MS_SYNCHRONOUS, ",sync" },
@@ -764,6 +764,8 @@ static void show_sb_opts(struct seq_file *m, struct super_block *sb)
if (sb->s_flags & fs_infop->flag)
seq_puts(m, fs_infop->str);
}
+
+ return security_sb_show_options(m, sb);
}
static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
@@ -806,11 +808,14 @@ static int show_vfsmnt(struct seq_file *m, void *v)
seq_putc(m, ' ');
show_type(m, mnt->mnt_sb);
seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");
- show_sb_opts(m, mnt->mnt_sb);
+ err = show_sb_opts(m, mnt->mnt_sb);
+ if (err)
+ goto out;
show_mnt_opts(m, mnt);
if (mnt->mnt_sb->s_op->show_options)
err = mnt->mnt_sb->s_op->show_options(m, mnt);
seq_puts(m, " 0 0\n");
+out:
return err;
}
@@ -865,10 +870,13 @@ static int show_mountinfo(struct seq_file *m, void *v)
seq_putc(m, ' ');
mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw");
- show_sb_opts(m, sb);
+ err = show_sb_opts(m, sb);
+ if (err)
+ goto out;
if (sb->s_op->show_options)
err = sb->s_op->show_options(m, mnt);
seq_putc(m, '\n');
+out:
return err;
}
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 2b145de45b39..6a7d901f1936 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -18,6 +18,7 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/sched.h>
+#include <linux/smp_lock.h>
#include <linux/ncp_fs.h>
#include "ncplib_kernel.h"
@@ -281,9 +282,18 @@ static int ncp_release(struct inode *inode, struct file *file) {
return 0;
}
+static loff_t ncp_remote_llseek(struct file *file, loff_t offset, int origin)
+{
+ loff_t ret;
+ lock_kernel();
+ ret = generic_file_llseek_unlocked(file, offset, origin);
+ unlock_kernel();
+ return ret;
+}
+
const struct file_operations ncp_file_operations =
{
- .llseek = remote_llseek,
+ .llseek = ncp_remote_llseek,
.read = ncp_file_read,
.write = ncp_file_write,
.ioctl = ncp_ioctl,
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index c1e7c8300629..f447f4b4476c 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -27,7 +27,7 @@
struct nfs_callback_data {
unsigned int users;
- struct svc_serv *serv;
+ struct svc_rqst *rqst;
struct task_struct *task;
};
@@ -91,21 +91,17 @@ nfs_callback_svc(void *vrqstp)
svc_process(rqstp);
}
unlock_kernel();
- nfs_callback_info.task = NULL;
- svc_exit_thread(rqstp);
return 0;
}
/*
- * Bring up the server process if it is not already up.
+ * Bring up the callback thread if it is not already up.
*/
int nfs_callback_up(void)
{
struct svc_serv *serv = NULL;
- struct svc_rqst *rqstp;
int ret = 0;
- lock_kernel();
mutex_lock(&nfs_callback_mutex);
if (nfs_callback_info.users++ || nfs_callback_info.task != NULL)
goto out;
@@ -121,22 +117,23 @@ int nfs_callback_up(void)
nfs_callback_tcpport = ret;
dprintk("Callback port = 0x%x\n", nfs_callback_tcpport);
- rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]);
- if (IS_ERR(rqstp)) {
- ret = PTR_ERR(rqstp);
+ nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]);
+ if (IS_ERR(nfs_callback_info.rqst)) {
+ ret = PTR_ERR(nfs_callback_info.rqst);
+ nfs_callback_info.rqst = NULL;
goto out_err;
}
svc_sock_update_bufs(serv);
- nfs_callback_info.serv = serv;
- nfs_callback_info.task = kthread_run(nfs_callback_svc, rqstp,
+ nfs_callback_info.task = kthread_run(nfs_callback_svc,
+ nfs_callback_info.rqst,
"nfsv4-svc");
if (IS_ERR(nfs_callback_info.task)) {
ret = PTR_ERR(nfs_callback_info.task);
- nfs_callback_info.serv = NULL;
+ svc_exit_thread(nfs_callback_info.rqst);
+ nfs_callback_info.rqst = NULL;
nfs_callback_info.task = NULL;
- svc_exit_thread(rqstp);
goto out_err;
}
out:
@@ -149,7 +146,6 @@ out:
if (serv)
svc_destroy(serv);
mutex_unlock(&nfs_callback_mutex);
- unlock_kernel();
return ret;
out_err:
dprintk("Couldn't create callback socket or server thread; err = %d\n",
@@ -159,17 +155,19 @@ out_err:
}
/*
- * Kill the server process if it is not already down.
+ * Kill the callback thread if it's no longer being used.
*/
void nfs_callback_down(void)
{
- lock_kernel();
mutex_lock(&nfs_callback_mutex);
nfs_callback_info.users--;
- if (nfs_callback_info.users == 0 && nfs_callback_info.task != NULL)
+ if (nfs_callback_info.users == 0 && nfs_callback_info.task != NULL) {
kthread_stop(nfs_callback_info.task);
+ svc_exit_thread(nfs_callback_info.rqst);
+ nfs_callback_info.rqst = NULL;
+ nfs_callback_info.task = NULL;
+ }
mutex_unlock(&nfs_callback_mutex);
- unlock_kernel();
}
static int nfs_callback_authenticate(struct svc_rqst *rqstp)
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index f2a092ca69b5..5ee23e7058b3 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -431,14 +431,14 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
{
to->to_initval = timeo * HZ / 10;
to->to_retries = retrans;
- if (!to->to_retries)
- to->to_retries = 2;
switch (proto) {
case XPRT_TRANSPORT_TCP:
case XPRT_TRANSPORT_RDMA:
+ if (to->to_retries == 0)
+ to->to_retries = NFS_DEF_TCP_RETRANS;
if (to->to_initval == 0)
- to->to_initval = 60 * HZ;
+ to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10;
if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
to->to_initval = NFS_MAX_TCP_TIMEOUT;
to->to_increment = to->to_initval;
@@ -450,14 +450,17 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
to->to_exponential = 0;
break;
case XPRT_TRANSPORT_UDP:
- default:
+ if (to->to_retries == 0)
+ to->to_retries = NFS_DEF_UDP_RETRANS;
if (!to->to_initval)
- to->to_initval = 11 * HZ / 10;
+ to->to_initval = NFS_DEF_UDP_TIMEO * HZ / 10;
if (to->to_initval > NFS_MAX_UDP_TIMEOUT)
to->to_initval = NFS_MAX_UDP_TIMEOUT;
to->to_maxval = NFS_MAX_UDP_TIMEOUT;
to->to_exponential = 1;
break;
+ default:
+ BUG();
}
}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 982a2064fe4c..28a238dab23a 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -133,13 +133,14 @@ nfs_opendir(struct inode *inode, struct file *filp)
{
int res;
- dfprintk(VFS, "NFS: opendir(%s/%ld)\n",
- inode->i_sb->s_id, inode->i_ino);
+ dfprintk(FILE, "NFS: open dir(%s/%s)\n",
+ filp->f_path.dentry->d_parent->d_name.name,
+ filp->f_path.dentry->d_name.name);
+
+ nfs_inc_stats(inode, NFSIOS_VFSOPEN);
- lock_kernel();
/* Call generic open code in order to cache credentials */
res = nfs_open(inode, filp);
- unlock_kernel();
return res;
}
@@ -528,13 +529,11 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
struct nfs_fattr fattr;
long res;
- dfprintk(VFS, "NFS: readdir(%s/%s) starting at cookie %Lu\n",
+ dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
(long long)filp->f_pos);
nfs_inc_stats(inode, NFSIOS_VFSGETDENTS);
- lock_kernel();
-
/*
* filp->f_pos points to the dirent entry number.
* *desc->dir_cookie has the cookie for the next entry. We have
@@ -592,10 +591,9 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
}
out:
nfs_unblock_sillyrename(dentry);
- unlock_kernel();
if (res > 0)
res = 0;
- dfprintk(VFS, "NFS: readdir(%s/%s) returns %ld\n",
+ dfprintk(FILE, "NFS: readdir(%s/%s) returns %ld\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
res);
return res;
@@ -603,7 +601,15 @@ out:
static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin)
{
- mutex_lock(&filp->f_path.dentry->d_inode->i_mutex);
+ struct dentry *dentry = filp->f_path.dentry;
+ struct inode *inode = dentry->d_inode;
+
+ dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n",
+ dentry->d_parent->d_name.name,
+ dentry->d_name.name,
+ offset, origin);
+
+ mutex_lock(&inode->i_mutex);
switch (origin) {
case 1:
offset += filp->f_pos;
@@ -619,7 +625,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin)
nfs_file_open_context(filp)->dir_cookie = 0;
}
out:
- mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex);
+ mutex_unlock(&inode->i_mutex);
return offset;
}
@@ -629,10 +635,11 @@ out:
*/
static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync)
{
- dfprintk(VFS, "NFS: fsync_dir(%s/%s) datasync %d\n",
+ dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
datasync);
+ nfs_inc_stats(dentry->d_inode, NFSIOS_VFSFSYNC);
return 0;
}
@@ -767,7 +774,6 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
struct nfs_fattr fattr;
parent = dget_parent(dentry);
- lock_kernel();
dir = parent->d_inode;
nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
inode = dentry->d_inode;
@@ -805,7 +811,6 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
out_valid:
- unlock_kernel();
dput(parent);
dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is valid\n",
__func__, dentry->d_parent->d_name.name,
@@ -824,7 +829,6 @@ out_zap_parent:
shrink_dcache_parent(dentry);
}
d_drop(dentry);
- unlock_kernel();
dput(parent);
dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n",
__func__, dentry->d_parent->d_name.name,
@@ -858,6 +862,14 @@ static int nfs_dentry_delete(struct dentry *dentry)
}
+static void nfs_drop_nlink(struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ if (inode->i_nlink > 0)
+ drop_nlink(inode);
+ spin_unlock(&inode->i_lock);
+}
+
/*
* Called when the dentry loses inode.
* We use it to clean up silly-renamed files.
@@ -869,10 +881,8 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
- lock_kernel();
drop_nlink(inode);
nfs_complete_unlink(dentry, inode);
- unlock_kernel();
}
iput(inode);
}
@@ -903,8 +913,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
res = ERR_PTR(-ENOMEM);
dentry->d_op = NFS_PROTO(dir)->dentry_ops;
- lock_kernel();
-
/*
* If we're doing an exclusive create, optimize away the lookup
* but don't hash the dentry.
@@ -912,7 +920,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
if (nfs_is_exclusive_create(dir, nd)) {
d_instantiate(dentry, NULL);
res = NULL;
- goto out_unlock;
+ goto out;
}
parent = dentry->d_parent;
@@ -940,8 +948,6 @@ no_entry:
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
out_unblock_sillyrename:
nfs_unblock_sillyrename(parent);
-out_unlock:
- unlock_kernel();
out:
return res;
}
@@ -999,9 +1005,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
}
/* Open the file on the server */
- lock_kernel();
res = nfs4_atomic_open(dir, dentry, nd);
- unlock_kernel();
if (IS_ERR(res)) {
error = PTR_ERR(res);
switch (error) {
@@ -1063,9 +1067,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
* operations that change the directory. We therefore save the
* change attribute *before* we do the RPC call.
*/
- lock_kernel();
ret = nfs4_open_revalidate(dir, dentry, openflags, nd);
- unlock_kernel();
out:
dput(parent);
if (!ret)
@@ -1218,14 +1220,11 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
if ((nd->flags & LOOKUP_CREATE) != 0)
open_flags = nd->intent.open.flags;
- lock_kernel();
error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd);
if (error != 0)
goto out_err;
- unlock_kernel();
return 0;
out_err:
- unlock_kernel();
d_drop(dentry);
return error;
}
@@ -1248,14 +1247,11 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
attr.ia_mode = mode;
attr.ia_valid = ATTR_MODE;
- lock_kernel();
status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev);
if (status != 0)
goto out_err;
- unlock_kernel();
return 0;
out_err:
- unlock_kernel();
d_drop(dentry);
return status;
}
@@ -1274,15 +1270,12 @@ static int nfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
attr.ia_valid = ATTR_MODE;
attr.ia_mode = mode | S_IFDIR;
- lock_kernel();
error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr);
if (error != 0)
goto out_err;
- unlock_kernel();
return 0;
out_err:
d_drop(dentry);
- unlock_kernel();
return error;
}
@@ -1299,14 +1292,12 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry)
dfprintk(VFS, "NFS: rmdir(%s/%ld), %s\n",
dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
- lock_kernel();
error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
/* Ensure the VFS deletes this inode */
if (error == 0 && dentry->d_inode != NULL)
clear_nlink(dentry->d_inode);
else if (error == -ENOENT)
nfs_dentry_handle_enoent(dentry);
- unlock_kernel();
return error;
}
@@ -1408,7 +1399,7 @@ static int nfs_safe_remove(struct dentry *dentry)
error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
/* The VFS may want to delete this inode */
if (error == 0)
- drop_nlink(inode);
+ nfs_drop_nlink(inode);
nfs_mark_for_revalidate(inode);
} else
error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
@@ -1431,7 +1422,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id,
dir->i_ino, dentry->d_name.name);
- lock_kernel();
spin_lock(&dcache_lock);
spin_lock(&dentry->d_lock);
if (atomic_read(&dentry->d_count) > 1) {
@@ -1440,7 +1430,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
/* Start asynchronous writeout of the inode */
write_inode_now(dentry->d_inode, 0);
error = nfs_sillyrename(dir, dentry);
- unlock_kernel();
return error;
}
if (!d_unhashed(dentry)) {
@@ -1454,7 +1443,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
} else if (need_rehash)
d_rehash(dentry);
- unlock_kernel();
return error;
}
@@ -1491,13 +1479,9 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
attr.ia_mode = S_IFLNK | S_IRWXUGO;
attr.ia_valid = ATTR_MODE;
- lock_kernel();
-
page = alloc_page(GFP_HIGHUSER);
- if (!page) {
- unlock_kernel();
+ if (!page)
return -ENOMEM;
- }
kaddr = kmap_atomic(page, KM_USER0);
memcpy(kaddr, symname, pathlen);
@@ -1512,7 +1496,6 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
dentry->d_name.name, symname, error);
d_drop(dentry);
__free_page(page);
- unlock_kernel();
return error;
}
@@ -1530,7 +1513,6 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
} else
__free_page(page);
- unlock_kernel();
return 0;
}
@@ -1544,14 +1526,12 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
dentry->d_parent->d_name.name, dentry->d_name.name);
- lock_kernel();
d_drop(dentry);
error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
if (error == 0) {
atomic_inc(&inode->i_count);
d_add(dentry, inode);
}
- unlock_kernel();
return error;
}
@@ -1591,7 +1571,6 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
* To prevent any new references to the target during the rename,
* we unhash the dentry and free the inode in advance.
*/
- lock_kernel();
if (!d_unhashed(new_dentry)) {
d_drop(new_dentry);
rehash = new_dentry;
@@ -1635,7 +1614,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
/* dentry still busy? */
goto out;
} else
- drop_nlink(new_inode);
+ nfs_drop_nlink(new_inode);
go_ahead:
/*
@@ -1669,7 +1648,6 @@ out:
/* new dentry created? */
if (dentry)
dput(dentry);
- unlock_kernel();
return error;
}
@@ -1962,8 +1940,6 @@ int nfs_permission(struct inode *inode, int mask, struct nameidata *nd)
}
force_lookup:
- lock_kernel();
-
if (!NFS_PROTO(inode)->access)
goto out_notsup;
@@ -1973,7 +1949,6 @@ force_lookup:
put_rpccred(cred);
} else
res = PTR_ERR(cred);
- unlock_kernel();
out:
dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n",
inode->i_sb->s_id, inode->i_ino, mask, res);
@@ -1982,7 +1957,6 @@ out_notsup:
res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
if (res == 0)
res = generic_permission(inode, mask, NULL);
- unlock_kernel();
goto out;
}
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 4757a2b326a1..08f6b040d289 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -890,7 +890,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
count = iov_length(iov, nr_segs);
nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
- dprintk("nfs: direct read(%s/%s, %zd@%Ld)\n",
+ dfprintk(FILE, "NFS: direct read(%s/%s, %zd@%Ld)\n",
file->f_path.dentry->d_parent->d_name.name,
file->f_path.dentry->d_name.name,
count, (long long) pos);
@@ -947,7 +947,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
count = iov_length(iov, nr_segs);
nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
- dfprintk(VFS, "nfs: direct write(%s/%s, %zd@%Ld)\n",
+ dfprintk(FILE, "NFS: direct write(%s/%s, %zd@%Ld)\n",
file->f_path.dentry->d_parent->d_name.name,
file->f_path.dentry->d_name.name,
count, (long long) pos);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index d84a3d8f32af..78460657f5cb 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -50,7 +50,7 @@ static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov,
static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov,
unsigned long nr_segs, loff_t pos);
static int nfs_file_flush(struct file *, fl_owner_t id);
-static int nfs_fsync(struct file *, struct dentry *dentry, int datasync);
+static int nfs_file_fsync(struct file *, struct dentry *dentry, int datasync);
static int nfs_check_flags(int flags);
static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
@@ -72,7 +72,7 @@ const struct file_operations nfs_file_operations = {
.open = nfs_file_open,
.flush = nfs_file_flush,
.release = nfs_file_release,
- .fsync = nfs_fsync,
+ .fsync = nfs_file_fsync,
.lock = nfs_lock,
.flock = nfs_flock,
.splice_read = nfs_file_splice_read,
@@ -119,25 +119,33 @@ nfs_file_open(struct inode *inode, struct file *filp)
{
int res;
+ dprintk("NFS: open file(%s/%s)\n",
+ filp->f_path.dentry->d_parent->d_name.name,
+ filp->f_path.dentry->d_name.name);
+
res = nfs_check_flags(filp->f_flags);
if (res)
return res;
nfs_inc_stats(inode, NFSIOS_VFSOPEN);
- lock_kernel();
- res = NFS_PROTO(inode)->file_open(inode, filp);
- unlock_kernel();
+ res = nfs_open(inode, filp);
return res;
}
static int
nfs_file_release(struct inode *inode, struct file *filp)
{
+ struct dentry *dentry = filp->f_path.dentry;
+
+ dprintk("NFS: release(%s/%s)\n",
+ dentry->d_parent->d_name.name,
+ dentry->d_name.name);
+
/* Ensure that dirty pages are flushed out with the right creds */
if (filp->f_mode & FMODE_WRITE)
- nfs_wb_all(filp->f_path.dentry->d_inode);
+ nfs_wb_all(dentry->d_inode);
nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
- return NFS_PROTO(inode)->file_release(inode, filp);
+ return nfs_release(inode, filp);
}
/**
@@ -170,6 +178,13 @@ force_reval:
static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
{
+ loff_t loff;
+
+ dprintk("NFS: llseek file(%s/%s, %lld, %d)\n",
+ filp->f_path.dentry->d_parent->d_name.name,
+ filp->f_path.dentry->d_name.name,
+ offset, origin);
+
/* origin == SEEK_END => we must revalidate the cached file length */
if (origin == SEEK_END) {
struct inode *inode = filp->f_mapping->host;
@@ -177,11 +192,14 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
if (retval < 0)
return (loff_t)retval;
}
- return remote_llseek(filp, offset, origin);
+ lock_kernel(); /* BKL needed? */
+ loff = generic_file_llseek_unlocked(filp, offset, origin);
+ unlock_kernel();
+ return loff;
}
/*
- * Helper for nfs_file_flush() and nfs_fsync()
+ * Helper for nfs_file_flush() and nfs_file_fsync()
*
* Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to
* disk, but it retrieves and clears ctx->error after synching, despite
@@ -207,16 +225,18 @@ static int nfs_do_fsync(struct nfs_open_context *ctx, struct inode *inode)
/*
* Flush all dirty pages, and check for write errors.
- *
*/
static int
nfs_file_flush(struct file *file, fl_owner_t id)
{
struct nfs_open_context *ctx = nfs_file_open_context(file);
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct dentry *dentry = file->f_path.dentry;
+ struct inode *inode = dentry->d_inode;
int status;
- dfprintk(VFS, "nfs: flush(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino);
+ dprintk("NFS: flush(%s/%s)\n",
+ dentry->d_parent->d_name.name,
+ dentry->d_name.name);
if ((file->f_mode & FMODE_WRITE) == 0)
return 0;
@@ -241,7 +261,7 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
if (iocb->ki_filp->f_flags & O_DIRECT)
return nfs_file_direct_read(iocb, iov, nr_segs, pos);
- dfprintk(VFS, "nfs: read(%s/%s, %lu@%lu)\n",
+ dprintk("NFS: read(%s/%s, %lu@%lu)\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
(unsigned long) count, (unsigned long) pos);
@@ -261,7 +281,7 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos,
struct inode *inode = dentry->d_inode;
ssize_t res;
- dfprintk(VFS, "nfs: splice_read(%s/%s, %lu@%Lu)\n",
+ dprintk("NFS: splice_read(%s/%s, %lu@%Lu)\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
(unsigned long) count, (unsigned long long) *ppos);
@@ -278,7 +298,7 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
struct inode *inode = dentry->d_inode;
int status;
- dfprintk(VFS, "nfs: mmap(%s/%s)\n",
+ dprintk("NFS: mmap(%s/%s)\n",
dentry->d_parent->d_name.name, dentry->d_name.name);
status = nfs_revalidate_mapping(inode, file->f_mapping);
@@ -296,12 +316,14 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
* whether any write errors occurred for this process.
*/
static int
-nfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync)
{
struct nfs_open_context *ctx = nfs_file_open_context(file);
struct inode *inode = dentry->d_inode;
- dfprintk(VFS, "nfs: fsync(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino);
+ dprintk("NFS: fsync file(%s/%s) datasync %d\n",
+ dentry->d_parent->d_name.name, dentry->d_name.name,
+ datasync);
nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
return nfs_do_fsync(ctx, inode);
@@ -324,6 +346,11 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
struct page *page;
index = pos >> PAGE_CACHE_SHIFT;
+ dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n",
+ file->f_path.dentry->d_parent->d_name.name,
+ file->f_path.dentry->d_name.name,
+ mapping->host->i_ino, len, (long long) pos);
+
page = __grab_cache_page(mapping, index);
if (!page)
return -ENOMEM;
@@ -344,9 +371,32 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
int status;
- lock_kernel();
+ dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n",
+ file->f_path.dentry->d_parent->d_name.name,
+ file->f_path.dentry->d_name.name,
+ mapping->host->i_ino, len, (long long) pos);
+
+ /*
+ * Zero any uninitialised parts of the page, and then mark the page
+ * as up to date if it turns out that we're extending the file.
+ */
+ if (!PageUptodate(page)) {
+ unsigned pglen = nfs_page_length(page);
+ unsigned end = offset + len;
+
+ if (pglen == 0) {
+ zero_user_segments(page, 0, offset,
+ end, PAGE_CACHE_SIZE);
+ SetPageUptodate(page);
+ } else if (end >= pglen) {
+ zero_user_segment(page, end, PAGE_CACHE_SIZE);
+ if (offset == 0)
+ SetPageUptodate(page);
+ } else
+ zero_user_segment(page, pglen, PAGE_CACHE_SIZE);
+ }
+
status = nfs_updatepage(file, page, offset, copied);
- unlock_kernel();
unlock_page(page);
page_cache_release(page);
@@ -358,6 +408,8 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
static void nfs_invalidate_page(struct page *page, unsigned long offset)
{
+ dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset);
+
if (offset != 0)
return;
/* Cancel any unstarted writes on this page */
@@ -366,13 +418,20 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset)
static int nfs_release_page(struct page *page, gfp_t gfp)
{
+ dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); /* trace only; the return value below is always 0 */
+
/* If PagePrivate() is set, then the page is not freeable */
return 0;
}
static int nfs_launder_page(struct page *page)
{
- return nfs_wb_page(page->mapping->host, page);
+ struct inode *inode = page->mapping->host; /* inode owning the page's mapping; used for the trace and the writeback */
+
+ dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n",
+ inode->i_ino, (unsigned long long)page_offset(page)); /* cast matches the %llu conversion */
+
+ return nfs_wb_page(inode, page); /* nfs_wb_page()'s result is returned unchanged */
}
const struct address_space_operations nfs_file_aops = {
@@ -392,13 +451,19 @@ const struct address_space_operations nfs_file_aops = {
static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
{
struct file *filp = vma->vm_file;
+ struct dentry *dentry = filp->f_path.dentry;
unsigned pagelen;
int ret = -EINVAL;
struct address_space *mapping;
+ dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n",
+ dentry->d_parent->d_name.name, dentry->d_name.name,
+ filp->f_mapping->host->i_ino,
+ (long long)page_offset(page));
+
lock_page(page);
mapping = page->mapping;
- if (mapping != vma->vm_file->f_path.dentry->d_inode->i_mapping)
+ if (mapping != dentry->d_inode->i_mapping)
goto out_unlock;
ret = 0;
@@ -446,9 +511,9 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
if (iocb->ki_filp->f_flags & O_DIRECT)
return nfs_file_direct_write(iocb, iov, nr_segs, pos);
- dfprintk(VFS, "nfs: write(%s/%s(%ld), %lu@%Ld)\n",
+ dprintk("NFS: write(%s/%s, %lu@%Ld)\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
- inode->i_ino, (unsigned long) count, (long long) pos);
+ (unsigned long) count, (long long) pos);
result = -EBUSY;
if (IS_SWAPFILE(inode))
@@ -582,7 +647,8 @@ static int do_setlk(struct file *filp, int cmd, struct file_lock *fl)
* This makes locking act as a cache coherency point.
*/
nfs_sync_mapping(filp->f_mapping);
- nfs_zap_caches(inode);
+ if (!nfs_have_delegation(inode, FMODE_READ))
+ nfs_zap_caches(inode);
out:
return status;
}
@@ -592,23 +658,35 @@ out:
*/
static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
{
- struct inode * inode = filp->f_mapping->host;
+ struct inode *inode = filp->f_mapping->host;
+ int ret = -ENOLCK; /* result used when a mandatory lock is refused below */
- dprintk("NFS: nfs_lock(f=%s/%ld, t=%x, fl=%x, r=%Ld:%Ld)\n",
- inode->i_sb->s_id, inode->i_ino,
+ dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n",
+ filp->f_path.dentry->d_parent->d_name.name,
+ filp->f_path.dentry->d_name.name,
fl->fl_type, fl->fl_flags,
(long long)fl->fl_start, (long long)fl->fl_end);
+
nfs_inc_stats(inode, NFSIOS_VFSLOCK);
/* No mandatory locks over NFS */
if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
- return -ENOLCK;
+ goto out_err; /* ret still holds its initial -ENOLCK here */
+
+ if (NFS_PROTO(inode)->lock_check_bounds != NULL) { /* hook is optional: only called when set */
+ ret = NFS_PROTO(inode)->lock_check_bounds(fl);
+ if (ret < 0)
+ goto out_err; /* return the hook's negative result as-is */
+ }
if (IS_GETLK(cmd))
- return do_getlk(filp, cmd, fl);
- if (fl->fl_type == F_UNLCK)
- return do_unlk(filp, cmd, fl);
- return do_setlk(filp, cmd, fl);
+ ret = do_getlk(filp, cmd, fl);
+ else if (fl->fl_type == F_UNLCK)
+ ret = do_unlk(filp, cmd, fl);
+ else
+ ret = do_setlk(filp, cmd, fl);
+out_err: /* single exit: both error and success paths return ret */
+ return ret;
}
/*
@@ -616,9 +694,9 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
*/
static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
{
- dprintk("NFS: nfs_flock(f=%s/%ld, t=%x, fl=%x)\n",
- filp->f_path.dentry->d_inode->i_sb->s_id,
- filp->f_path.dentry->d_inode->i_ino,
+ dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n",
+ filp->f_path.dentry->d_parent->d_name.name,
+ filp->f_path.dentry->d_name.name,
fl->fl_type, fl->fl_flags);
/*
@@ -641,12 +719,15 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
return do_setlk(filp, cmd, fl);
}
+/*
+ * There is no protocol support for leases, so they cannot be implemented
+ * correctly in the face of opens by other clients: always return -EINVAL.
+ */
static int nfs_setlease(struct file *file, long arg, struct file_lock **fl)
{
- /*
- * There is no protocol support for leases, so we have no way
- * to implement them correctly in the face of opens by other
- * clients.
- */
+ dprintk("NFS: setlease(%s/%s, arg=%ld)\n",
+ file->f_path.dentry->d_parent->d_name.name,
+ file->f_path.dentry->d_name.name, arg);
+
return -EINVAL;
}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 596c5d8e86f4..df23f987da6b 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -57,8 +57,6 @@ static int enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED;
static void nfs_invalidate_inode(struct inode *);
static int nfs_update_inode(struct inode *, struct nfs_fattr *);
-static void nfs_zap_acl_cache(struct inode *);
-
static struct kmem_cache * nfs_inode_cachep;
static inline unsigned long
@@ -167,7 +165,7 @@ void nfs_zap_mapping(struct inode *inode, struct address_space *mapping)
}
}
-static void nfs_zap_acl_cache(struct inode *inode)
+void nfs_zap_acl_cache(struct inode *inode)
{
void (*clear_acl_cache)(struct inode *);
@@ -347,7 +345,7 @@ out_no_inode:
goto out;
}
-#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET)
+#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE)
int
nfs_setattr(struct dentry *dentry, struct iattr *attr)
@@ -369,10 +367,9 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
/* Optimization: if the end result is no change, don't RPC */
attr->ia_valid &= NFS_VALID_ATTRS;
- if (attr->ia_valid == 0)
+ if ((attr->ia_valid & ~ATTR_FILE) == 0)
return 0;
- lock_kernel();
/* Write all dirty data */
if (S_ISREG(inode->i_mode)) {
filemap_write_and_wait(inode->i_mapping);
@@ -386,11 +383,66 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr);
if (error == 0)
nfs_refresh_inode(inode, &fattr);
- unlock_kernel();
return error;
}
/**
+ * nfs_vmtruncate - unmap mappings "freed" by truncate() syscall
+ * @inode: inode of the file used
+ * @offset: file offset to start truncating
+ *
+ * This is a copy of the common vmtruncate, but with the locking
+ * corrected to take into account the fact that NFS requires
+ * inode->i_size to be updated under the inode->i_lock.
+ */
+static int nfs_vmtruncate(struct inode * inode, loff_t offset)
+{
+ if (i_size_read(inode) < offset) { /* new size is larger than the current i_size */
+ unsigned long limit;
+
+ limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; /* calling task's current RLIMIT_FSIZE value */
+ if (limit != RLIM_INFINITY && offset > limit)
+ goto out_sig; /* over the limit: SIGXFSZ, then -EFBIG */
+ if (offset > inode->i_sb->s_maxbytes)
+ goto out_big; /* beyond s_maxbytes: -EFBIG, no signal */
+ spin_lock(&inode->i_lock); /* i_size is written only while holding i_lock */
+ i_size_write(inode, offset);
+ spin_unlock(&inode->i_lock);
+ } else { /* new size is smaller than or equal to the current i_size */
+ struct address_space *mapping = inode->i_mapping;
+
+ /*
+ * truncation of in-use swapfiles is disallowed - it would
+ * cause subsequent swapout to scribble on the now-freed
+ * blocks.
+ */
+ if (IS_SWAPFILE(inode))
+ return -ETXTBSY; /* i_size is left untouched on this path */
+ spin_lock(&inode->i_lock); /* same i_lock rule as the extending branch */
+ i_size_write(inode, offset);
+ spin_unlock(&inode->i_lock);
+
+ /*
+ * unmap_mapping_range is called twice, first simply for
+ * efficiency so that truncate_inode_pages does fewer
+ * single-page unmaps. However after this first call, and
+ * before truncate_inode_pages finishes, it is possible for
+ * private pages to be COWed, which remain after
+ * truncate_inode_pages finishes, hence the second
+ * unmap_mapping_range call must be made for correctness.
+ */
+ unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
+ truncate_inode_pages(mapping, offset);
+ unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
+ }
+ return 0; /* success: i_size now equals offset */
+out_sig: /* falls through into out_big after raising the signal */
+ send_sig(SIGXFSZ, current, 0);
+out_big:
+ return -EFBIG; /* i_size was not modified on either error path */
+}
+
+/**
* nfs_setattr_update_inode - Update inode metadata after a setattr call.
* @inode: pointer to struct inode
* @attr: pointer to struct iattr
@@ -416,8 +468,7 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
}
if ((attr->ia_valid & ATTR_SIZE) != 0) {
nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC);
- inode->i_size = attr->ia_size;
- vmtruncate(inode, attr->ia_size);
+ nfs_vmtruncate(inode, attr->ia_size);
}
}
@@ -647,7 +698,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
inode->i_sb->s_id, (long long)NFS_FILEID(inode));
nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
- lock_kernel();
if (is_bad_inode(inode))
goto out_nowait;
if (NFS_STALE(inode))
@@ -696,7 +746,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
nfs_wake_up_inode(inode);
out_nowait:
- unlock_kernel();
return status;
}
@@ -831,9 +880,9 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
if (S_ISDIR(inode->i_mode))
nfsi->cache_validity |= NFS_INO_INVALID_DATA;
}
- if (inode->i_size == nfs_size_to_loff_t(fattr->pre_size) &&
+ if (i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) &&
nfsi->npages == 0)
- inode->i_size = nfs_size_to_loff_t(fattr->size);
+ i_size_write(inode, nfs_size_to_loff_t(fattr->size));
}
}
@@ -974,7 +1023,7 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa
(fattr->valid & NFS_ATTR_WCC) == 0) {
memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime));
memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime));
- fattr->pre_size = inode->i_size;
+ fattr->pre_size = i_size_read(inode);
fattr->valid |= NFS_ATTR_WCC;
}
return nfs_post_op_update_inode(inode, fattr);
@@ -1059,7 +1108,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
/* Do we perhaps have any outstanding writes, or has
* the file grown beyond our last write? */
if (nfsi->npages == 0 || new_isize > cur_isize) {
- inode->i_size = new_isize;
+ i_size_write(inode, new_isize);
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
}
dprintk("NFS: isize change on server for file %s/%ld\n",
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 04ae867dddba..24241fcbb98d 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -150,6 +150,7 @@ extern void nfs_clear_inode(struct inode *);
#ifdef CONFIG_NFS_V4
extern void nfs4_clear_inode(struct inode *);
#endif
+void nfs_zap_acl_cache(struct inode *inode);
/* super.c */
extern struct file_system_type nfs_xdev_fs_type;
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index 6350ecbde589..a36952810032 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -5,135 +5,41 @@
*
* Copyright (C) 2005, 2006 Chuck Lever <cel@netapp.com>
*
- * NFS client per-mount statistics provide information about the health of
- * the NFS client and the health of each NFS mount point. Generally these
- * are not for detailed problem diagnosis, but simply to indicate that there
- * is a problem.
- *
- * These counters are not meant to be human-readable, but are meant to be
- * integrated into system monitoring tools such as "sar" and "iostat". As
- * such, the counters are sampled by the tools over time, and are never
- * zeroed after a file system is mounted. Moving averages can be computed
- * by the tools by taking the difference between two instantaneous samples
- * and dividing that by the time between the samples.
*/
#ifndef _NFS_IOSTAT
#define _NFS_IOSTAT
-#define NFS_IOSTAT_VERS "1.0"
-
-/*
- * NFS byte counters
- *
- * 1. SERVER - the number of payload bytes read from or written to the
- * server by the NFS client via an NFS READ or WRITE request.
- *
- * 2. NORMAL - the number of bytes read or written by applications via
- * the read(2) and write(2) system call interfaces.
- *
- * 3. DIRECT - the number of bytes read or written from files opened
- * with the O_DIRECT flag.
- *
- * These counters give a view of the data throughput into and out of the NFS
- * client. Comparing the number of bytes requested by an application with the
- * number of bytes the client requests from the server can provide an
- * indication of client efficiency (per-op, cache hits, etc).
- *
- * These counters can also help characterize which access methods are in
- * use. DIRECT by itself shows whether there is any O_DIRECT traffic.
- * NORMAL + DIRECT shows how much data is going through the system call
- * interface. A large amount of SERVER traffic without much NORMAL or
- * DIRECT traffic shows that applications are using mapped files.
- *
- * NFS page counters
- *
- * These count the number of pages read or written via nfs_readpage(),
- * nfs_readpages(), or their write equivalents.
- */
-enum nfs_stat_bytecounters {
- NFSIOS_NORMALREADBYTES = 0,
- NFSIOS_NORMALWRITTENBYTES,
- NFSIOS_DIRECTREADBYTES,
- NFSIOS_DIRECTWRITTENBYTES,
- NFSIOS_SERVERREADBYTES,
- NFSIOS_SERVERWRITTENBYTES,
- NFSIOS_READPAGES,
- NFSIOS_WRITEPAGES,
- __NFSIOS_BYTESMAX,
-};
-
-/*
- * NFS event counters
- *
- * These counters provide a low-overhead way of monitoring client activity
- * without enabling NFS trace debugging. The counters show the rate at
- * which VFS requests are made, and how often the client invalidates its
- * data and attribute caches. This allows system administrators to monitor
- * such things as how close-to-open is working, and answer questions such
- * as "why are there so many GETATTR requests on the wire?"
- *
- * They also count anamolous events such as short reads and writes, silly
- * renames due to close-after-delete, and operations that change the size
- * of a file (such operations can often be the source of data corruption
- * if applications aren't using file locking properly).
- */
-enum nfs_stat_eventcounters {
- NFSIOS_INODEREVALIDATE = 0,
- NFSIOS_DENTRYREVALIDATE,
- NFSIOS_DATAINVALIDATE,
- NFSIOS_ATTRINVALIDATE,
- NFSIOS_VFSOPEN,
- NFSIOS_VFSLOOKUP,
- NFSIOS_VFSACCESS,
- NFSIOS_VFSUPDATEPAGE,
- NFSIOS_VFSREADPAGE,
- NFSIOS_VFSREADPAGES,
- NFSIOS_VFSWRITEPAGE,
- NFSIOS_VFSWRITEPAGES,
- NFSIOS_VFSGETDENTS,
- NFSIOS_VFSSETATTR,
- NFSIOS_VFSFLUSH,
- NFSIOS_VFSFSYNC,
- NFSIOS_VFSLOCK,
- NFSIOS_VFSRELEASE,
- NFSIOS_CONGESTIONWAIT,
- NFSIOS_SETATTRTRUNC,
- NFSIOS_EXTENDWRITE,
- NFSIOS_SILLYRENAME,
- NFSIOS_SHORTREAD,
- NFSIOS_SHORTWRITE,
- NFSIOS_DELAY,
- __NFSIOS_COUNTSMAX,
-};
-
-#ifdef __KERNEL__
-
#include <linux/percpu.h>
#include <linux/cache.h>
+#include <linux/nfs_iostat.h>
struct nfs_iostats {
unsigned long long bytes[__NFSIOS_BYTESMAX];
unsigned long events[__NFSIOS_COUNTSMAX];
} ____cacheline_aligned;
-static inline void nfs_inc_server_stats(struct nfs_server *server, enum nfs_stat_eventcounters stat)
+static inline void nfs_inc_server_stats(const struct nfs_server *server,
+ enum nfs_stat_eventcounters stat)
{
struct nfs_iostats *iostats;
int cpu;
cpu = get_cpu();
iostats = per_cpu_ptr(server->io_stats, cpu);
- iostats->events[stat] ++;
+ iostats->events[stat]++; /* bump the event counter in this CPU's copy of server->io_stats */
put_cpu_no_resched();
}
-static inline void nfs_inc_stats(struct inode *inode, enum nfs_stat_eventcounters stat)
+static inline void nfs_inc_stats(const struct inode *inode,
+ enum nfs_stat_eventcounters stat)
{
nfs_inc_server_stats(NFS_SERVER(inode), stat);
}
-static inline void nfs_add_server_stats(struct nfs_server *server, enum nfs_stat_bytecounters stat, unsigned long addend)
+static inline void nfs_add_server_stats(const struct nfs_server *server,
+ enum nfs_stat_bytecounters stat,
+ unsigned long addend)
{
struct nfs_iostats *iostats;
int cpu;
@@ -144,7 +50,9 @@ static inline void nfs_add_server_stats(struct nfs_server *server, enum nfs_stat
put_cpu_no_resched();
}
-static inline void nfs_add_stats(struct inode *inode, enum nfs_stat_bytecounters stat, unsigned long addend)
+static inline void nfs_add_stats(const struct inode *inode,
+ enum nfs_stat_bytecounters stat,
+ unsigned long addend)
{
nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
}
@@ -160,5 +68,4 @@ static inline void nfs_free_iostats(struct nfs_iostats *stats)
free_percpu(stats);
}
-#endif
-#endif
+#endif /* _NFS_IOSTAT */
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 9b7362565c0c..423842f51ac9 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -5,6 +5,8 @@
#include <linux/posix_acl_xattr.h>
#include <linux/nfsacl.h>
+#include "internal.h"
+
#define NFSDBG_FACILITY NFSDBG_PROC
ssize_t nfs3_listxattr(struct dentry *dentry, char *buffer, size_t size)
@@ -205,6 +207,8 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
status = nfs_revalidate_inode(server, inode);
if (status < 0)
return ERR_PTR(status);
+ if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
+ nfs_zap_acl_cache(inode);
acl = nfs3_get_cached_acl(inode, type);
if (acl != ERR_PTR(-EAGAIN))
return acl;
@@ -319,9 +323,8 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
dprintk("NFS call setacl\n");
msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL];
status = rpc_call_sync(server->client_acl, &msg, 0);
- spin_lock(&inode->i_lock);
- NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS;
- spin_unlock(&inode->i_lock);
+ nfs_access_zap_cache(inode);
+ nfs_zap_acl_cache(inode);
dprintk("NFS reply setacl: %d\n", status);
/* pages may have been allocated at the xdr layer. */
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index c3523ad03ed1..1e750e4574a9 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -129,6 +129,8 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
int status;
dprintk("NFS call setattr\n");
+ if (sattr->ia_valid & ATTR_FILE)
+ msg.rpc_cred = nfs_file_cred(sattr->ia_file);
nfs_fattr_init(fattr);
status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
if (status == 0)
@@ -248,6 +250,53 @@ static int nfs3_proc_readlink(struct inode *inode, struct page *page,
return status;
}
+struct nfs3_createdata { /* per-call state shared by the CREATE, MKDIR, SYMLINK and MKNOD paths */
+ struct rpc_message msg; /* nfs3_alloc_createdata() points rpc_argp at arg and rpc_resp at res */
+ union { /* request arguments; callers fill the member for their procedure */
+ struct nfs3_createargs create;
+ struct nfs3_mkdirargs mkdir;
+ struct nfs3_symlinkargs symlink;
+ struct nfs3_mknodargs mknod;
+ } arg;
+ struct nfs3_diropres res; /* reply; its fh/fattr/dir_attr pointers refer to the fields below */
+ struct nfs_fh fh; /* storage behind res.fh */
+ struct nfs_fattr fattr; /* storage behind res.fattr */
+ struct nfs_fattr dir_attr; /* storage behind res.dir_attr */
+};
+
+static struct nfs3_createdata *nfs3_alloc_createdata(void) /* returns NULL if kzalloc() fails */
+{
+ struct nfs3_createdata *data;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL); /* zero-filled, so msg.rpc_proc and arg start out cleared */
+ if (data != NULL) {
+ data->msg.rpc_argp = &data->arg; /* wire the message to the embedded args ... */
+ data->msg.rpc_resp = &data->res; /* ... and to the embedded result */
+ data->res.fh = &data->fh; /* result pointers refer to storage inside *data */
+ data->res.fattr = &data->fattr;
+ data->res.dir_attr = &data->dir_attr;
+ nfs_fattr_init(data->res.fattr); /* both attribute structs are initialised before first use */
+ nfs_fattr_init(data->res.dir_attr);
+ }
+ return data; /* caller still sets msg.rpc_proc, fills arg, and frees with nfs3_free_createdata() */
+}
+
+static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data)
+{
+ int status; /* RPC status, replaced by nfs_instantiate()'s result when the RPC succeeds */
+
+ status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0); /* data->msg must already carry rpc_proc and args */
+ nfs_post_op_update_inode(dir, data->res.dir_attr); /* called whether or not the RPC succeeded */
+ if (status == 0)
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); /* bind dentry using the returned fh/attrs */
+ return status;
+}
+
+static void nfs3_free_createdata(struct nfs3_createdata *data)
+{
+ kfree(data); /* callers in this file also reach here with data == NULL after a failed allocation */
+}
+
/*
* Create a regular file.
* For now, we don't implement O_EXCL.
@@ -256,70 +305,60 @@ static int
nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
int flags, struct nameidata *nd)
{
- struct nfs_fh fhandle;
- struct nfs_fattr fattr;
- struct nfs_fattr dir_attr;
- struct nfs3_createargs arg = {
- .fh = NFS_FH(dir),
- .name = dentry->d_name.name,
- .len = dentry->d_name.len,
- .sattr = sattr,
- };
- struct nfs3_diropres res = {
- .dir_attr = &dir_attr,
- .fh = &fhandle,
- .fattr = &fattr
- };
- struct rpc_message msg = {
- .rpc_proc = &nfs3_procedures[NFS3PROC_CREATE],
- .rpc_argp = &arg,
- .rpc_resp = &res,
- };
+ struct nfs3_createdata *data;
mode_t mode = sattr->ia_mode;
- int status;
+ int status = -ENOMEM;
dprintk("NFS call create %s\n", dentry->d_name.name);
- arg.createmode = NFS3_CREATE_UNCHECKED;
+
+ data = nfs3_alloc_createdata();
+ if (data == NULL)
+ goto out;
+
+ data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_CREATE];
+ data->arg.create.fh = NFS_FH(dir);
+ data->arg.create.name = dentry->d_name.name;
+ data->arg.create.len = dentry->d_name.len;
+ data->arg.create.sattr = sattr;
+
+ data->arg.create.createmode = NFS3_CREATE_UNCHECKED;
if (flags & O_EXCL) {
- arg.createmode = NFS3_CREATE_EXCLUSIVE;
- arg.verifier[0] = jiffies;
- arg.verifier[1] = current->pid;
+ data->arg.create.createmode = NFS3_CREATE_EXCLUSIVE;
+ data->arg.create.verifier[0] = jiffies;
+ data->arg.create.verifier[1] = current->pid;
}
sattr->ia_mode &= ~current->fs->umask;
-again:
- nfs_fattr_init(&dir_attr);
- nfs_fattr_init(&fattr);
- status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
- nfs_refresh_inode(dir, &dir_attr);
+ for (;;) {
+ status = nfs3_do_create(dir, dentry, data);
- /* If the server doesn't support the exclusive creation semantics,
- * try again with simple 'guarded' mode. */
- if (status == -ENOTSUPP) {
- switch (arg.createmode) {
+ if (status != -ENOTSUPP)
+ break;
+ /* If the server doesn't support the exclusive creation
+ * semantics, try again with simple 'guarded' mode. */
+ switch (data->arg.create.createmode) {
case NFS3_CREATE_EXCLUSIVE:
- arg.createmode = NFS3_CREATE_GUARDED;
+ data->arg.create.createmode = NFS3_CREATE_GUARDED;
break;
case NFS3_CREATE_GUARDED:
- arg.createmode = NFS3_CREATE_UNCHECKED;
+ data->arg.create.createmode = NFS3_CREATE_UNCHECKED;
break;
case NFS3_CREATE_UNCHECKED:
goto out;
}
- goto again;
+ nfs_fattr_init(data->res.dir_attr);
+ nfs_fattr_init(data->res.fattr);
}
- if (status == 0)
- status = nfs_instantiate(dentry, &fhandle, &fattr);
if (status != 0)
goto out;
/* When we created the file with exclusive semantics, make
* sure we set the attributes afterwards. */
- if (arg.createmode == NFS3_CREATE_EXCLUSIVE) {
+ if (data->arg.create.createmode == NFS3_CREATE_EXCLUSIVE) {
dprintk("NFS call setattr (post-create)\n");
if (!(sattr->ia_valid & ATTR_ATIME_SET))
@@ -330,14 +369,15 @@ again:
/* Note: we could use a guarded setattr here, but I'm
* not sure this buys us anything (and I'd have
* to revamp the NFSv3 XDR code) */
- status = nfs3_proc_setattr(dentry, &fattr, sattr);
- nfs_post_op_update_inode(dentry->d_inode, &fattr);
+ status = nfs3_proc_setattr(dentry, data->res.fattr, sattr);
+ nfs_post_op_update_inode(dentry->d_inode, data->res.fattr);
dprintk("NFS reply setattr (post-create): %d\n", status);
+ if (status != 0)
+ goto out;
}
- if (status != 0)
- goto out;
status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode);
out:
+ nfs3_free_createdata(data);
dprintk("NFS reply create: %d\n", status);
return status;
}
@@ -452,40 +492,28 @@ static int
nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
unsigned int len, struct iattr *sattr)
{
- struct nfs_fh fhandle;
- struct nfs_fattr fattr, dir_attr;
- struct nfs3_symlinkargs arg = {
- .fromfh = NFS_FH(dir),
- .fromname = dentry->d_name.name,
- .fromlen = dentry->d_name.len,
- .pages = &page,
- .pathlen = len,
- .sattr = sattr
- };
- struct nfs3_diropres res = {
- .dir_attr = &dir_attr,
- .fh = &fhandle,
- .fattr = &fattr
- };
- struct rpc_message msg = {
- .rpc_proc = &nfs3_procedures[NFS3PROC_SYMLINK],
- .rpc_argp = &arg,
- .rpc_resp = &res,
- };
- int status;
+ struct nfs3_createdata *data;
+ int status = -ENOMEM;
if (len > NFS3_MAXPATHLEN)
return -ENAMETOOLONG;
dprintk("NFS call symlink %s\n", dentry->d_name.name);
- nfs_fattr_init(&dir_attr);
- nfs_fattr_init(&fattr);
- status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
- nfs_post_op_update_inode(dir, &dir_attr);
- if (status != 0)
+ data = nfs3_alloc_createdata();
+ if (data == NULL)
goto out;
- status = nfs_instantiate(dentry, &fhandle, &fattr);
+ data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_SYMLINK];
+ data->arg.symlink.fromfh = NFS_FH(dir);
+ data->arg.symlink.fromname = dentry->d_name.name;
+ data->arg.symlink.fromlen = dentry->d_name.len;
+ data->arg.symlink.pages = &page;
+ data->arg.symlink.pathlen = len;
+ data->arg.symlink.sattr = sattr;
+
+ status = nfs3_do_create(dir, dentry, data);
+
+ nfs3_free_createdata(data);
out:
dprintk("NFS reply symlink: %d\n", status);
return status;
@@ -494,42 +522,31 @@ out:
static int
nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
{
- struct nfs_fh fhandle;
- struct nfs_fattr fattr, dir_attr;
- struct nfs3_mkdirargs arg = {
- .fh = NFS_FH(dir),
- .name = dentry->d_name.name,
- .len = dentry->d_name.len,
- .sattr = sattr
- };
- struct nfs3_diropres res = {
- .dir_attr = &dir_attr,
- .fh = &fhandle,
- .fattr = &fattr
- };
- struct rpc_message msg = {
- .rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR],
- .rpc_argp = &arg,
- .rpc_resp = &res,
- };
+ struct nfs3_createdata *data; /* allocated below; replaces the removed on-stack arg/res/msg */
int mode = sattr->ia_mode;
- int status;
+ int status = -ENOMEM; /* returned if the allocation below fails */
dprintk("NFS call mkdir %s\n", dentry->d_name.name);
sattr->ia_mode &= ~current->fs->umask;
- nfs_fattr_init(&dir_attr);
- nfs_fattr_init(&fattr);
- status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
- nfs_post_op_update_inode(dir, &dir_attr);
- if (status != 0)
+ data = nfs3_alloc_createdata();
+ if (data == NULL)
goto out;
- status = nfs_instantiate(dentry, &fhandle, &fattr);
+
+ data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR]; /* same procedure and arguments as the removed initialisers */
+ data->arg.mkdir.fh = NFS_FH(dir);
+ data->arg.mkdir.name = dentry->d_name.name;
+ data->arg.mkdir.len = dentry->d_name.len;
+ data->arg.mkdir.sattr = sattr;
+
+ status = nfs3_do_create(dir, dentry, data); /* RPC, directory attribute update, then nfs_instantiate() */
if (status != 0)
goto out;
+
status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode);
out:
+ nfs3_free_createdata(data); /* also reached with data == NULL from the allocation check */
dprintk("NFS reply mkdir: %d\n", status);
return status;
}
@@ -615,52 +632,50 @@ static int
nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
dev_t rdev)
{
- struct nfs_fh fh;
- struct nfs_fattr fattr, dir_attr;
- struct nfs3_mknodargs arg = {
- .fh = NFS_FH(dir),
- .name = dentry->d_name.name,
- .len = dentry->d_name.len,
- .sattr = sattr,
- .rdev = rdev
- };
- struct nfs3_diropres res = {
- .dir_attr = &dir_attr,
- .fh = &fh,
- .fattr = &fattr
- };
- struct rpc_message msg = {
- .rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD],
- .rpc_argp = &arg,
- .rpc_resp = &res,
- };
+ struct nfs3_createdata *data;
mode_t mode = sattr->ia_mode;
- int status;
-
- switch (sattr->ia_mode & S_IFMT) {
- case S_IFBLK: arg.type = NF3BLK; break;
- case S_IFCHR: arg.type = NF3CHR; break;
- case S_IFIFO: arg.type = NF3FIFO; break;
- case S_IFSOCK: arg.type = NF3SOCK; break;
- default: return -EINVAL;
- }
+ int status = -ENOMEM;
dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name,
MAJOR(rdev), MINOR(rdev));
sattr->ia_mode &= ~current->fs->umask;
- nfs_fattr_init(&dir_attr);
- nfs_fattr_init(&fattr);
- status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
- nfs_post_op_update_inode(dir, &dir_attr);
- if (status != 0)
+ data = nfs3_alloc_createdata();
+ if (data == NULL)
goto out;
- status = nfs_instantiate(dentry, &fh, &fattr);
+
+ data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD];
+ data->arg.mknod.fh = NFS_FH(dir);
+ data->arg.mknod.name = dentry->d_name.name;
+ data->arg.mknod.len = dentry->d_name.len;
+ data->arg.mknod.sattr = sattr;
+ data->arg.mknod.rdev = rdev;
+
+ switch (sattr->ia_mode & S_IFMT) {
+ case S_IFBLK:
+ data->arg.mknod.type = NF3BLK;
+ break;
+ case S_IFCHR:
+ data->arg.mknod.type = NF3CHR;
+ break;
+ case S_IFIFO:
+ data->arg.mknod.type = NF3FIFO;
+ break;
+ case S_IFSOCK:
+ data->arg.mknod.type = NF3SOCK;
+ break;
+ default:
+ status = -EINVAL;
+ goto out;
+ }
+
+ status = nfs3_do_create(dir, dentry, data);
if (status != 0)
goto out;
status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode);
out:
+ nfs3_free_createdata(data);
dprintk("NFS reply mknod: %d\n", status);
return status;
}
@@ -801,8 +816,6 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
.write_done = nfs3_write_done,
.commit_setup = nfs3_proc_commit_setup,
.commit_done = nfs3_commit_done,
- .file_open = nfs_open,
- .file_release = nfs_release,
.lock = nfs3_proc_lock,
.clear_acl_cache = nfs3_forget_cached_acls,
};
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 1293e0acd82b..c910413eaeca 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -451,9 +451,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
/* Save the delegation */
memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data));
rcu_read_unlock();
- lock_kernel();
ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode);
- unlock_kernel();
if (ret != 0)
goto out;
ret = -EAGAIN;
@@ -1139,8 +1137,9 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, int
return res;
}
-static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr,
- struct iattr *sattr, struct nfs4_state *state)
+static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
+ struct nfs_fattr *fattr, struct iattr *sattr,
+ struct nfs4_state *state)
{
struct nfs_server *server = NFS_SERVER(inode);
struct nfs_setattrargs arg = {
@@ -1154,9 +1153,10 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr,
.server = server,
};
struct rpc_message msg = {
- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
- .rpc_argp = &arg,
- .rpc_resp = &res,
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
+ .rpc_argp = &arg,
+ .rpc_resp = &res,
+ .rpc_cred = cred,
};
unsigned long timestamp = jiffies;
int status;
@@ -1166,7 +1166,6 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr,
if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) {
/* Use that stateid */
} else if (state != NULL) {
- msg.rpc_cred = state->owner->so_cred;
nfs4_copy_stateid(&arg.stateid, state, current->files);
} else
memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid));
@@ -1177,15 +1176,16 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr,
return status;
}
-static int nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr,
- struct iattr *sattr, struct nfs4_state *state)
+static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
+ struct nfs_fattr *fattr, struct iattr *sattr,
+ struct nfs4_state *state)
{
struct nfs_server *server = NFS_SERVER(inode);
struct nfs4_exception exception = { };
int err;
do {
err = nfs4_handle_exception(server,
- _nfs4_do_setattr(inode, fattr, sattr, state),
+ _nfs4_do_setattr(inode, cred, fattr, sattr, state),
&exception);
} while (exception.retry);
return err;
@@ -1647,29 +1647,25 @@ static int
nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
struct iattr *sattr)
{
- struct rpc_cred *cred;
struct inode *inode = dentry->d_inode;
- struct nfs_open_context *ctx;
+ struct rpc_cred *cred = NULL;
struct nfs4_state *state = NULL;
int status;
nfs_fattr_init(fattr);
- cred = rpc_lookup_cred();
- if (IS_ERR(cred))
- return PTR_ERR(cred);
-
/* Search for an existing open(O_WRITE) file */
- ctx = nfs_find_open_context(inode, cred, FMODE_WRITE);
- if (ctx != NULL)
+ if (sattr->ia_valid & ATTR_FILE) {
+ struct nfs_open_context *ctx;
+
+ ctx = nfs_file_open_context(sattr->ia_file);
+ cred = ctx->cred;
state = ctx->state;
+ }
- status = nfs4_do_setattr(inode, fattr, sattr, state);
+ status = nfs4_do_setattr(inode, cred, fattr, sattr, state);
if (status == 0)
nfs_setattr_update_inode(inode, sattr);
- if (ctx != NULL)
- put_nfs_open_context(ctx);
- put_rpccred(cred);
return status;
}
@@ -1897,17 +1893,16 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
goto out;
}
state = nfs4_do_open(dir, &path, flags, sattr, cred);
- put_rpccred(cred);
d_drop(dentry);
if (IS_ERR(state)) {
status = PTR_ERR(state);
- goto out;
+ goto out_putcred;
}
d_add(dentry, igrab(state->inode));
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
if (flags & O_EXCL) {
struct nfs_fattr fattr;
- status = nfs4_do_setattr(state->inode, &fattr, sattr, state);
+ status = nfs4_do_setattr(state->inode, cred, &fattr, sattr, state);
if (status == 0)
nfs_setattr_update_inode(state->inode, sattr);
nfs_post_op_update_inode(state->inode, &fattr);
@@ -1916,6 +1911,8 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
status = nfs4_intent_set_file(nd, &path, state);
else
nfs4_close_sync(&path, state, flags);
+out_putcred:
+ put_rpccred(cred);
out:
return status;
}
@@ -2079,47 +2076,81 @@ static int nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *n
return err;
}
+struct nfs4_createdata {
+ struct rpc_message msg;
+ struct nfs4_create_arg arg;
+ struct nfs4_create_res res;
+ struct nfs_fh fh;
+ struct nfs_fattr fattr;
+ struct nfs_fattr dir_fattr;
+};
+
+static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
+ struct qstr *name, struct iattr *sattr, u32 ftype)
+{
+ struct nfs4_createdata *data;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (data != NULL) {
+ struct nfs_server *server = NFS_SERVER(dir);
+
+ data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE];
+ data->msg.rpc_argp = &data->arg;
+ data->msg.rpc_resp = &data->res;
+ data->arg.dir_fh = NFS_FH(dir);
+ data->arg.server = server;
+ data->arg.name = name;
+ data->arg.attrs = sattr;
+ data->arg.ftype = ftype;
+ data->arg.bitmask = server->attr_bitmask;
+ data->res.server = server;
+ data->res.fh = &data->fh;
+ data->res.fattr = &data->fattr;
+ data->res.dir_fattr = &data->dir_fattr;
+ nfs_fattr_init(data->res.fattr);
+ nfs_fattr_init(data->res.dir_fattr);
+ }
+ return data;
+}
+
+static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data)
+{
+ int status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0);
+ if (status == 0) {
+ update_changeattr(dir, &data->res.dir_cinfo);
+ nfs_post_op_update_inode(dir, data->res.dir_fattr);
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+ }
+ return status;
+}
+
+static void nfs4_free_createdata(struct nfs4_createdata *data)
+{
+ kfree(data);
+}
+
static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
struct page *page, unsigned int len, struct iattr *sattr)
{
- struct nfs_server *server = NFS_SERVER(dir);
- struct nfs_fh fhandle;
- struct nfs_fattr fattr, dir_fattr;
- struct nfs4_create_arg arg = {
- .dir_fh = NFS_FH(dir),
- .server = server,
- .name = &dentry->d_name,
- .attrs = sattr,
- .ftype = NF4LNK,
- .bitmask = server->attr_bitmask,
- };
- struct nfs4_create_res res = {
- .server = server,
- .fh = &fhandle,
- .fattr = &fattr,
- .dir_fattr = &dir_fattr,
- };
- struct rpc_message msg = {
- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK],
- .rpc_argp = &arg,
- .rpc_resp = &res,
- };
- int status;
+ struct nfs4_createdata *data;
+ int status = -ENAMETOOLONG;
if (len > NFS4_MAXPATHLEN)
- return -ENAMETOOLONG;
+ goto out;
- arg.u.symlink.pages = &page;
- arg.u.symlink.len = len;
- nfs_fattr_init(&fattr);
- nfs_fattr_init(&dir_fattr);
+ status = -ENOMEM;
+ data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4LNK);
+ if (data == NULL)
+ goto out;
+
+ data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK];
+ data->arg.u.symlink.pages = &page;
+ data->arg.u.symlink.len = len;
- status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
- if (!status) {
- update_changeattr(dir, &res.dir_cinfo);
- nfs_post_op_update_inode(dir, res.dir_fattr);
- status = nfs_instantiate(dentry, &fhandle, &fattr);
- }
+ status = nfs4_do_create(dir, dentry, data);
+
+ nfs4_free_createdata(data);
+out:
return status;
}
@@ -2140,39 +2171,17 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
struct iattr *sattr)
{
- struct nfs_server *server = NFS_SERVER(dir);
- struct nfs_fh fhandle;
- struct nfs_fattr fattr, dir_fattr;
- struct nfs4_create_arg arg = {
- .dir_fh = NFS_FH(dir),
- .server = server,
- .name = &dentry->d_name,
- .attrs = sattr,
- .ftype = NF4DIR,
- .bitmask = server->attr_bitmask,
- };
- struct nfs4_create_res res = {
- .server = server,
- .fh = &fhandle,
- .fattr = &fattr,
- .dir_fattr = &dir_fattr,
- };
- struct rpc_message msg = {
- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE],
- .rpc_argp = &arg,
- .rpc_resp = &res,
- };
- int status;
+ struct nfs4_createdata *data;
+ int status = -ENOMEM;
- nfs_fattr_init(&fattr);
- nfs_fattr_init(&dir_fattr);
-
- status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
- if (!status) {
- update_changeattr(dir, &res.dir_cinfo);
- nfs_post_op_update_inode(dir, res.dir_fattr);
- status = nfs_instantiate(dentry, &fhandle, &fattr);
- }
+ data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4DIR);
+ if (data == NULL)
+ goto out;
+
+ status = nfs4_do_create(dir, dentry, data);
+
+ nfs4_free_createdata(data);
+out:
return status;
}
@@ -2242,56 +2251,34 @@ static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
struct iattr *sattr, dev_t rdev)
{
- struct nfs_server *server = NFS_SERVER(dir);
- struct nfs_fh fh;
- struct nfs_fattr fattr, dir_fattr;
- struct nfs4_create_arg arg = {
- .dir_fh = NFS_FH(dir),
- .server = server,
- .name = &dentry->d_name,
- .attrs = sattr,
- .bitmask = server->attr_bitmask,
- };
- struct nfs4_create_res res = {
- .server = server,
- .fh = &fh,
- .fattr = &fattr,
- .dir_fattr = &dir_fattr,
- };
- struct rpc_message msg = {
- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE],
- .rpc_argp = &arg,
- .rpc_resp = &res,
- };
- int status;
- int mode = sattr->ia_mode;
-
- nfs_fattr_init(&fattr);
- nfs_fattr_init(&dir_fattr);
+ struct nfs4_createdata *data;
+ int mode = sattr->ia_mode;
+ int status = -ENOMEM;
BUG_ON(!(sattr->ia_valid & ATTR_MODE));
BUG_ON(!S_ISFIFO(mode) && !S_ISBLK(mode) && !S_ISCHR(mode) && !S_ISSOCK(mode));
+
+ data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4SOCK);
+ if (data == NULL)
+ goto out;
+
if (S_ISFIFO(mode))
- arg.ftype = NF4FIFO;
+ data->arg.ftype = NF4FIFO;
else if (S_ISBLK(mode)) {
- arg.ftype = NF4BLK;
- arg.u.device.specdata1 = MAJOR(rdev);
- arg.u.device.specdata2 = MINOR(rdev);
+ data->arg.ftype = NF4BLK;
+ data->arg.u.device.specdata1 = MAJOR(rdev);
+ data->arg.u.device.specdata2 = MINOR(rdev);
}
else if (S_ISCHR(mode)) {
- arg.ftype = NF4CHR;
- arg.u.device.specdata1 = MAJOR(rdev);
- arg.u.device.specdata2 = MINOR(rdev);
+ data->arg.ftype = NF4CHR;
+ data->arg.u.device.specdata1 = MAJOR(rdev);
+ data->arg.u.device.specdata2 = MINOR(rdev);
}
- else
- arg.ftype = NF4SOCK;
- status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
- if (status == 0) {
- update_changeattr(dir, &res.dir_cinfo);
- nfs_post_op_update_inode(dir, res.dir_fattr);
- status = nfs_instantiate(dentry, &fh, &fattr);
- }
+ status = nfs4_do_create(dir, dentry, data);
+
+ nfs4_free_createdata(data);
+out:
return status;
}
@@ -2706,6 +2693,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
ret = nfs_revalidate_inode(server, inode);
if (ret < 0)
return ret;
+ if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
+ nfs_zap_acl_cache(inode);
ret = nfs4_read_cached_acl(inode, buf, buflen);
if (ret != -ENOENT)
return ret;
@@ -2733,7 +2722,8 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
nfs_inode_return_delegation(inode);
buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
- nfs_zap_caches(inode);
+ nfs_access_zap_cache(inode);
+ nfs_zap_acl_cache(inode);
return ret;
}
@@ -2767,8 +2757,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server)
task->tk_status = 0;
return -EAGAIN;
case -NFS4ERR_DELAY:
- nfs_inc_server_stats((struct nfs_server *) server,
- NFSIOS_DELAY);
+ nfs_inc_server_stats(server, NFSIOS_DELAY);
case -NFS4ERR_GRACE:
rpc_delay(task, NFS4_POLL_RETRY_MAX);
task->tk_status = 0;
@@ -2933,7 +2922,7 @@ static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cre
int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred)
{
- long timeout;
+ long timeout = 0;
int err;
do {
err = _nfs4_proc_setclientid_confirm(clp, cred);
@@ -3725,8 +3714,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
.write_done = nfs4_write_done,
.commit_setup = nfs4_proc_commit_setup,
.commit_done = nfs4_commit_done,
- .file_open = nfs_open,
- .file_release = nfs_release,
.lock = nfs4_proc_lock,
.clear_acl_cache = nfs4_zap_acl_attr,
};
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 856a8934f610..401ef8b28f97 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -940,7 +940,6 @@ static int reclaimer(void *ptr)
allow_signal(SIGKILL);
/* Ensure exclusive access to NFSv4 state */
- lock_kernel();
down_write(&clp->cl_sem);
/* Are there any NFS mounts out there? */
if (list_empty(&clp->cl_superblocks))
@@ -1000,7 +999,6 @@ restart_loop:
nfs_delegation_reap_unclaimed(clp);
out:
up_write(&clp->cl_sem);
- unlock_kernel();
if (status == -NFS4ERR_CB_PATH_DOWN)
nfs_handle_cb_pathdown(clp);
nfs4_clear_recover_bit(clp);
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 531379d36823..46763d1cd397 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -1,6 +1,4 @@
/*
- * $Id: nfsroot.c,v 1.45 1998/03/07 10:44:46 mj Exp $
- *
* Copyright (C) 1995, 1996 Gero Kuhlmann <gero@gkminix.han.de>
*
* Allow an NFS filesystem to be mounted as root. The way this works is:
@@ -297,10 +295,10 @@ static int __init root_nfs_name(char *name)
nfs_data.flags = NFS_MOUNT_NONLM; /* No lockd in nfs root yet */
nfs_data.rsize = NFS_DEF_FILE_IO_SIZE;
nfs_data.wsize = NFS_DEF_FILE_IO_SIZE;
- nfs_data.acregmin = 3;
- nfs_data.acregmax = 60;
- nfs_data.acdirmin = 30;
- nfs_data.acdirmax = 60;
+ nfs_data.acregmin = NFS_DEF_ACREGMIN;
+ nfs_data.acregmax = NFS_DEF_ACREGMAX;
+ nfs_data.acdirmin = NFS_DEF_ACDIRMIN;
+ nfs_data.acdirmax = NFS_DEF_ACDIRMAX;
strcpy(buf, NFS_ROOT);
/* Process options received from the remote server */
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 03599bfe81cf..4dbb84df1b68 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -129,6 +129,8 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
sattr->ia_mode &= S_IALLUGO;
dprintk("NFS call setattr\n");
+ if (sattr->ia_valid & ATTR_FILE)
+ msg.rpc_cred = nfs_file_cred(sattr->ia_file);
nfs_fattr_init(fattr);
status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
if (status == 0)
@@ -598,6 +600,29 @@ nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl);
}
+/* Helper functions for NFS lock bounds checking */
+#define NFS_LOCK32_OFFSET_MAX ((__s32)0x7fffffffUL)
+static int nfs_lock_check_bounds(const struct file_lock *fl)
+{
+ __s32 start, end;
+
+ start = (__s32)fl->fl_start;
+ if ((loff_t)start != fl->fl_start)
+ goto out_einval;
+
+ if (fl->fl_end != OFFSET_MAX) {
+ end = (__s32)fl->fl_end;
+ if ((loff_t)end != fl->fl_end)
+ goto out_einval;
+ } else
+ end = NFS_LOCK32_OFFSET_MAX;
+
+ if (start < 0 || start > end)
+ goto out_einval;
+ return 0;
+out_einval:
+ return -EINVAL;
+}
const struct nfs_rpc_ops nfs_v2_clientops = {
.version = 2, /* protocol version */
@@ -630,7 +655,6 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
.write_setup = nfs_proc_write_setup,
.write_done = nfs_write_done,
.commit_setup = nfs_proc_commit_setup,
- .file_open = nfs_open,
- .file_release = nfs_release,
.lock = nfs_proc_lock,
+ .lock_check_bounds = nfs_lock_check_bounds,
};
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 614efeed5437..1b94e3650f5c 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -47,6 +47,7 @@
#include <linux/inet.h>
#include <linux/in6.h>
#include <net/ipv6.h>
+#include <linux/netdevice.h>
#include <linux/nfs_xdr.h>
#include <linux/magic.h>
#include <linux/parser.h>
@@ -65,7 +66,6 @@
enum {
/* Mount options that take no arguments */
Opt_soft, Opt_hard,
- Opt_intr, Opt_nointr,
Opt_posix, Opt_noposix,
Opt_cto, Opt_nocto,
Opt_ac, Opt_noac,
@@ -92,8 +92,8 @@ enum {
Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
Opt_addr, Opt_mountaddr, Opt_clientaddr,
- /* Mount options that are ignored */
- Opt_userspace, Opt_deprecated,
+ /* Special mount options */
+ Opt_userspace, Opt_deprecated, Opt_sloppy,
Opt_err
};
@@ -101,10 +101,14 @@ enum {
static match_table_t nfs_mount_option_tokens = {
{ Opt_userspace, "bg" },
{ Opt_userspace, "fg" },
+ { Opt_userspace, "retry=%s" },
+
+ { Opt_sloppy, "sloppy" },
+
{ Opt_soft, "soft" },
{ Opt_hard, "hard" },
- { Opt_intr, "intr" },
- { Opt_nointr, "nointr" },
+ { Opt_deprecated, "intr" },
+ { Opt_deprecated, "nointr" },
{ Opt_posix, "posix" },
{ Opt_noposix, "noposix" },
{ Opt_cto, "cto" },
@@ -136,7 +140,6 @@ static match_table_t nfs_mount_option_tokens = {
{ Opt_acdirmin, "acdirmin=%u" },
{ Opt_acdirmax, "acdirmax=%u" },
{ Opt_actimeo, "actimeo=%u" },
- { Opt_userspace, "retry=%u" },
{ Opt_namelen, "namlen=%u" },
{ Opt_mountport, "mountport=%u" },
{ Opt_mountvers, "mountvers=%u" },
@@ -207,6 +210,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type,
int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
static void nfs_kill_super(struct super_block *);
static void nfs_put_super(struct super_block *);
+static int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
static struct file_system_type nfs_fs_type = {
.owner = THIS_MODULE,
@@ -234,6 +238,7 @@ static const struct super_operations nfs_sops = {
.umount_begin = nfs_umount_begin,
.show_options = nfs_show_options,
.show_stats = nfs_show_stats,
+ .remount_fs = nfs_remount,
};
#ifdef CONFIG_NFS_V4
@@ -278,6 +283,7 @@ static const struct super_operations nfs4_sops = {
.umount_begin = nfs_umount_begin,
.show_options = nfs_show_options,
.show_stats = nfs_show_stats,
+ .remount_fs = nfs_remount,
};
#endif
@@ -368,8 +374,6 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
};
int error;
- lock_kernel();
-
error = server->nfs_client->rpc_ops->statfs(server, fh, &res);
if (error < 0)
goto out_err;
@@ -401,12 +405,10 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_namelen = server->namelen;
- unlock_kernel();
return 0;
out_err:
dprintk("%s: statfs error = %d\n", __func__, -error);
- unlock_kernel();
return error;
}
@@ -514,13 +516,13 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
if (nfss->bsize != 0)
seq_printf(m, ",bsize=%u", nfss->bsize);
seq_printf(m, ",namlen=%u", nfss->namelen);
- if (nfss->acregmin != 3*HZ || showdefaults)
+ if (nfss->acregmin != NFS_DEF_ACREGMIN*HZ || showdefaults)
seq_printf(m, ",acregmin=%u", nfss->acregmin/HZ);
- if (nfss->acregmax != 60*HZ || showdefaults)
+ if (nfss->acregmax != NFS_DEF_ACREGMAX*HZ || showdefaults)
seq_printf(m, ",acregmax=%u", nfss->acregmax/HZ);
- if (nfss->acdirmin != 30*HZ || showdefaults)
+ if (nfss->acdirmin != NFS_DEF_ACDIRMIN*HZ || showdefaults)
seq_printf(m, ",acdirmin=%u", nfss->acdirmin/HZ);
- if (nfss->acdirmax != 60*HZ || showdefaults)
+ if (nfss->acdirmax != NFS_DEF_ACDIRMAX*HZ || showdefaults)
seq_printf(m, ",acdirmax=%u", nfss->acdirmax/HZ);
for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) {
if (nfss->flags & nfs_infop->flag)
@@ -702,49 +704,233 @@ static int nfs_verify_server_address(struct sockaddr *addr)
return 0;
}
+static void nfs_parse_ipv4_address(char *string, size_t str_len,
+ struct sockaddr *sap, size_t *addr_len)
+{
+ struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+ u8 *addr = (u8 *)&sin->sin_addr.s_addr;
+
+ if (str_len <= INET_ADDRSTRLEN) {
+ dfprintk(MOUNT, "NFS: parsing IPv4 address %*s\n",
+ (int)str_len, string);
+
+ sin->sin_family = AF_INET;
+ *addr_len = sizeof(*sin);
+ if (in4_pton(string, str_len, addr, '\0', NULL))
+ return;
+ }
+
+ sap->sa_family = AF_UNSPEC;
+ *addr_len = 0;
+}
+
+#define IPV6_SCOPE_DELIMITER '%'
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static void nfs_parse_ipv6_scope_id(const char *string, const size_t str_len,
+ const char *delim,
+ struct sockaddr_in6 *sin6)
+{
+ char *p;
+ size_t len;
+
+ if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
+ return ;
+ if (*delim != IPV6_SCOPE_DELIMITER)
+ return;
+
+ len = (string + str_len) - delim - 1;
+ p = kstrndup(delim + 1, len, GFP_KERNEL);
+ if (p) {
+ unsigned long scope_id = 0;
+ struct net_device *dev;
+
+ dev = dev_get_by_name(&init_net, p);
+ if (dev != NULL) {
+ scope_id = dev->ifindex;
+ dev_put(dev);
+ } else {
+ /* scope_id is set to zero on error */
+ strict_strtoul(p, 10, &scope_id);
+ }
+
+ kfree(p);
+ sin6->sin6_scope_id = scope_id;
+ dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id);
+ }
+}
+
+static void nfs_parse_ipv6_address(char *string, size_t str_len,
+ struct sockaddr *sap, size_t *addr_len)
+{
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+ u8 *addr = (u8 *)&sin6->sin6_addr.in6_u;
+ const char *delim;
+
+ if (str_len <= INET6_ADDRSTRLEN) {
+ dfprintk(MOUNT, "NFS: parsing IPv6 address %*s\n",
+ (int)str_len, string);
+
+ sin6->sin6_family = AF_INET6;
+ *addr_len = sizeof(*sin6);
+ if (in6_pton(string, str_len, addr, IPV6_SCOPE_DELIMITER, &delim)) {
+ nfs_parse_ipv6_scope_id(string, str_len, delim, sin6);
+ return;
+ }
+ }
+
+ sap->sa_family = AF_UNSPEC;
+ *addr_len = 0;
+}
+#else
+static void nfs_parse_ipv6_address(char *string, size_t str_len,
+ struct sockaddr *sap, size_t *addr_len)
+{
+ sap->sa_family = AF_UNSPEC;
+ *addr_len = 0;
+}
+#endif
+
/*
- * Parse string addresses passed in via a mount option,
- * and construct a sockaddr based on the result.
+ * Construct a sockaddr based on the contents of a string that contains
+ * an IP address in presentation format.
*
- * If address parsing fails, set the sockaddr's address
- * family to AF_UNSPEC to force nfs_verify_server_address()
- * to punt the mount.
+ * If there is a problem constructing the new sockaddr, set the address
+ * family to AF_UNSPEC.
*/
-static void nfs_parse_server_address(char *value,
- struct sockaddr *sap,
- size_t *len)
+static void nfs_parse_ip_address(char *string, size_t str_len,
+ struct sockaddr *sap, size_t *addr_len)
{
- if (strchr(value, ':')) {
- struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap;
- u8 *addr = (u8 *)&ap->sin6_addr.in6_u;
+ unsigned int i, colons;
- ap->sin6_family = AF_INET6;
- *len = sizeof(*ap);
- if (in6_pton(value, -1, addr, '\0', NULL))
- return;
- } else {
- struct sockaddr_in *ap = (struct sockaddr_in *)sap;
- u8 *addr = (u8 *)&ap->sin_addr.s_addr;
+ colons = 0;
+ for (i = 0; i < str_len; i++)
+ if (string[i] == ':')
+ colons++;
+
+ if (colons >= 2)
+ nfs_parse_ipv6_address(string, str_len, sap, addr_len);
+ else
+ nfs_parse_ipv4_address(string, str_len, sap, addr_len);
+}
+
+/*
+ * Sanity check the NFS transport protocol.
+ *
+ */
+static void nfs_validate_transport_protocol(struct nfs_parsed_mount_data *mnt)
+{
+ switch (mnt->nfs_server.protocol) {
+ case XPRT_TRANSPORT_UDP:
+ case XPRT_TRANSPORT_TCP:
+ case XPRT_TRANSPORT_RDMA:
+ break;
+ default:
+ mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+ }
+}
+
+/*
+ * For text based NFSv2/v3 mounts, the mount protocol transport default
+ * settings should depend upon the specified NFS transport.
+ */
+static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt)
+{
+ nfs_validate_transport_protocol(mnt);
- ap->sin_family = AF_INET;
- *len = sizeof(*ap);
- if (in4_pton(value, -1, addr, '\0', NULL))
+ if (mnt->mount_server.protocol == XPRT_TRANSPORT_UDP ||
+ mnt->mount_server.protocol == XPRT_TRANSPORT_TCP)
return;
+ switch (mnt->nfs_server.protocol) {
+ case XPRT_TRANSPORT_UDP:
+ mnt->mount_server.protocol = XPRT_TRANSPORT_UDP;
+ break;
+ case XPRT_TRANSPORT_TCP:
+ case XPRT_TRANSPORT_RDMA:
+ mnt->mount_server.protocol = XPRT_TRANSPORT_TCP;
}
+}
- sap->sa_family = AF_UNSPEC;
- *len = 0;
+/*
+ * Parse the value of the 'sec=' option.
+ *
+ * The flavor_len setting is for v4 mounts.
+ */
+static int nfs_parse_security_flavors(char *value,
+ struct nfs_parsed_mount_data *mnt)
+{
+ substring_t args[MAX_OPT_ARGS];
+
+ dfprintk(MOUNT, "NFS: parsing sec=%s option\n", value);
+
+ switch (match_token(value, nfs_secflavor_tokens, args)) {
+ case Opt_sec_none:
+ mnt->auth_flavor_len = 0;
+ mnt->auth_flavors[0] = RPC_AUTH_NULL;
+ break;
+ case Opt_sec_sys:
+ mnt->auth_flavor_len = 0;
+ mnt->auth_flavors[0] = RPC_AUTH_UNIX;
+ break;
+ case Opt_sec_krb5:
+ mnt->auth_flavor_len = 1;
+ mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5;
+ break;
+ case Opt_sec_krb5i:
+ mnt->auth_flavor_len = 1;
+ mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I;
+ break;
+ case Opt_sec_krb5p:
+ mnt->auth_flavor_len = 1;
+ mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P;
+ break;
+ case Opt_sec_lkey:
+ mnt->auth_flavor_len = 1;
+ mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY;
+ break;
+ case Opt_sec_lkeyi:
+ mnt->auth_flavor_len = 1;
+ mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI;
+ break;
+ case Opt_sec_lkeyp:
+ mnt->auth_flavor_len = 1;
+ mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP;
+ break;
+ case Opt_sec_spkm:
+ mnt->auth_flavor_len = 1;
+ mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM;
+ break;
+ case Opt_sec_spkmi:
+ mnt->auth_flavor_len = 1;
+ mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI;
+ break;
+ case Opt_sec_spkmp:
+ mnt->auth_flavor_len = 1;
+ mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP;
+ break;
+ default:
+ return 0;
+ }
+
+ return 1;
+}
+
+static void nfs_parse_invalid_value(const char *option)
+{
+ dfprintk(MOUNT, "NFS: bad value specified for %s option\n", option);
}
/*
* Error-check and convert a string of mount options from user space into
- * a data structure
+ * a data structure. The whole mount string is processed; bad options are
+ * skipped as they are encountered. If there were no errors, return 1;
+ * otherwise return 0 (zero).
*/
static int nfs_parse_mount_options(char *raw,
struct nfs_parsed_mount_data *mnt)
{
char *p, *string, *secdata;
- int rc;
+ int rc, sloppy = 0, errors = 0;
if (!raw) {
dfprintk(MOUNT, "NFS: mount options string was NULL.\n");
@@ -777,15 +963,16 @@ static int nfs_parse_mount_options(char *raw,
token = match_token(p, nfs_mount_option_tokens, args);
switch (token) {
+
+ /*
+ * boolean options: foo/nofoo
+ */
case Opt_soft:
mnt->flags |= NFS_MOUNT_SOFT;
break;
case Opt_hard:
mnt->flags &= ~NFS_MOUNT_SOFT;
break;
- case Opt_intr:
- case Opt_nointr:
- break;
case Opt_posix:
mnt->flags |= NFS_MOUNT_POSIX;
break;
@@ -819,20 +1006,14 @@ static int nfs_parse_mount_options(char *raw,
case Opt_udp:
mnt->flags &= ~NFS_MOUNT_TCP;
mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
- mnt->timeo = 7;
- mnt->retrans = 5;
break;
case Opt_tcp:
mnt->flags |= NFS_MOUNT_TCP;
mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
- mnt->timeo = 600;
- mnt->retrans = 2;
break;
case Opt_rdma:
mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */
mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
- mnt->timeo = 600;
- mnt->retrans = 2;
break;
case Opt_acl:
mnt->flags &= ~NFS_MOUNT_NOACL;
@@ -853,165 +1034,144 @@ static int nfs_parse_mount_options(char *raw,
mnt->flags |= NFS_MOUNT_UNSHARED;
break;
+ /*
+ * options that take numeric values
+ */
case Opt_port:
- if (match_int(args, &option))
- return 0;
- if (option < 0 || option > 65535)
- return 0;
- mnt->nfs_server.port = option;
+ if (match_int(args, &option) ||
+ option < 0 || option > USHORT_MAX) {
+ errors++;
+ nfs_parse_invalid_value("port");
+ } else
+ mnt->nfs_server.port = option;
break;
case Opt_rsize:
- if (match_int(args, &mnt->rsize))
- return 0;
+ if (match_int(args, &option) || option < 0) {
+ errors++;
+ nfs_parse_invalid_value("rsize");
+ } else
+ mnt->rsize = option;
break;
case Opt_wsize:
- if (match_int(args, &mnt->wsize))
- return 0;
+ if (match_int(args, &option) || option < 0) {
+ errors++;
+ nfs_parse_invalid_value("wsize");
+ } else
+ mnt->wsize = option;
break;
case Opt_bsize:
- if (match_int(args, &option))
- return 0;
- if (option < 0)
- return 0;
- mnt->bsize = option;
+ if (match_int(args, &option) || option < 0) {
+ errors++;
+ nfs_parse_invalid_value("bsize");
+ } else
+ mnt->bsize = option;
break;
case Opt_timeo:
- if (match_int(args, &mnt->timeo))
- return 0;
+ if (match_int(args, &option) || option <= 0) {
+ errors++;
+ nfs_parse_invalid_value("timeo");
+ } else
+ mnt->timeo = option;
break;
case Opt_retrans:
- if (match_int(args, &mnt->retrans))
- return 0;
+ if (match_int(args, &option) || option <= 0) {
+ errors++;
+ nfs_parse_invalid_value("retrans");
+ } else
+ mnt->retrans = option;
break;
case Opt_acregmin:
- if (match_int(args, &mnt->acregmin))
- return 0;
+ if (match_int(args, &option) || option < 0) {
+ errors++;
+ nfs_parse_invalid_value("acregmin");
+ } else
+ mnt->acregmin = option;
break;
case Opt_acregmax:
- if (match_int(args, &mnt->acregmax))
- return 0;
+ if (match_int(args, &option) || option < 0) {
+ errors++;
+ nfs_parse_invalid_value("acregmax");
+ } else
+ mnt->acregmax = option;
break;
case Opt_acdirmin:
- if (match_int(args, &mnt->acdirmin))
- return 0;
+ if (match_int(args, &option) || option < 0) {
+ errors++;
+ nfs_parse_invalid_value("acdirmin");
+ } else
+ mnt->acdirmin = option;
break;
case Opt_acdirmax:
- if (match_int(args, &mnt->acdirmax))
- return 0;
+ if (match_int(args, &option) || option < 0) {
+ errors++;
+ nfs_parse_invalid_value("acdirmax");
+ } else
+ mnt->acdirmax = option;
break;
case Opt_actimeo:
- if (match_int(args, &option))
- return 0;
- if (option < 0)
- return 0;
- mnt->acregmin =
- mnt->acregmax =
- mnt->acdirmin =
- mnt->acdirmax = option;
+ if (match_int(args, &option) || option < 0) {
+ errors++;
+ nfs_parse_invalid_value("actimeo");
+ } else
+ mnt->acregmin = mnt->acregmax =
+ mnt->acdirmin = mnt->acdirmax = option;
break;
case Opt_namelen:
- if (match_int(args, &mnt->namlen))
- return 0;
+ if (match_int(args, &option) || option < 0) {
+ errors++;
+ nfs_parse_invalid_value("namlen");
+ } else
+ mnt->namlen = option;
break;
case Opt_mountport:
- if (match_int(args, &option))
- return 0;
- if (option < 0 || option > 65535)
- return 0;
- mnt->mount_server.port = option;
+ if (match_int(args, &option) ||
+ option < 0 || option > USHORT_MAX) {
+ errors++;
+ nfs_parse_invalid_value("mountport");
+ } else
+ mnt->mount_server.port = option;
break;
case Opt_mountvers:
- if (match_int(args, &option))
- return 0;
- if (option < 0)
- return 0;
- mnt->mount_server.version = option;
+ if (match_int(args, &option) ||
+ option < NFS_MNT_VERSION ||
+ option > NFS_MNT3_VERSION) {
+ errors++;
+ nfs_parse_invalid_value("mountvers");
+ } else
+ mnt->mount_server.version = option;
break;
case Opt_nfsvers:
- if (match_int(args, &option))
- return 0;
+ if (match_int(args, &option)) {
+ errors++;
+ nfs_parse_invalid_value("nfsvers");
+ break;
+ }
switch (option) {
- case 2:
+ case NFS2_VERSION:
mnt->flags &= ~NFS_MOUNT_VER3;
break;
- case 3:
+ case NFS3_VERSION:
mnt->flags |= NFS_MOUNT_VER3;
break;
default:
- goto out_unrec_vers;
+ errors++;
+ nfs_parse_invalid_value("nfsvers");
}
break;
+ /*
+ * options that take text values
+ */
case Opt_sec:
string = match_strdup(args);
if (string == NULL)
goto out_nomem;
- token = match_token(string, nfs_secflavor_tokens, args);
+ rc = nfs_parse_security_flavors(string, mnt);
kfree(string);
-
- /*
- * The flags setting is for v2/v3. The flavor_len
- * setting is for v4. v2/v3 also need to know the
- * difference between NULL and UNIX.
- */
- switch (token) {
- case Opt_sec_none:
- mnt->flags &= ~NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 0;
- mnt->auth_flavors[0] = RPC_AUTH_NULL;
- break;
- case Opt_sec_sys:
- mnt->flags &= ~NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 0;
- mnt->auth_flavors[0] = RPC_AUTH_UNIX;
- break;
- case Opt_sec_krb5:
- mnt->flags |= NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 1;
- mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5;
- break;
- case Opt_sec_krb5i:
- mnt->flags |= NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 1;
- mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I;
- break;
- case Opt_sec_krb5p:
- mnt->flags |= NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 1;
- mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P;
- break;
- case Opt_sec_lkey:
- mnt->flags |= NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 1;
- mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY;
- break;
- case Opt_sec_lkeyi:
- mnt->flags |= NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 1;
- mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI;
- break;
- case Opt_sec_lkeyp:
- mnt->flags |= NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 1;
- mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP;
- break;
- case Opt_sec_spkm:
- mnt->flags |= NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 1;
- mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM;
- break;
- case Opt_sec_spkmi:
- mnt->flags |= NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 1;
- mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI;
- break;
- case Opt_sec_spkmp:
- mnt->flags |= NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 1;
- mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP;
- break;
- default:
- goto out_unrec_sec;
+ if (!rc) {
+ errors++;
+ dfprintk(MOUNT, "NFS: unrecognized "
+ "security flavor\n");
}
break;
case Opt_proto:
@@ -1026,24 +1186,20 @@ static int nfs_parse_mount_options(char *raw,
case Opt_xprt_udp:
mnt->flags &= ~NFS_MOUNT_TCP;
mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
- mnt->timeo = 7;
- mnt->retrans = 5;
break;
case Opt_xprt_tcp:
mnt->flags |= NFS_MOUNT_TCP;
mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
- mnt->timeo = 600;
- mnt->retrans = 2;
break;
case Opt_xprt_rdma:
/* vector side protocols to TCP */
mnt->flags |= NFS_MOUNT_TCP;
mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
- mnt->timeo = 600;
- mnt->retrans = 2;
break;
default:
- goto out_unrec_xprt;
+ errors++;
+ dfprintk(MOUNT, "NFS: unrecognized "
+ "transport protocol\n");
}
break;
case Opt_mountproto:
@@ -1063,16 +1219,19 @@ static int nfs_parse_mount_options(char *raw,
break;
case Opt_xprt_rdma: /* not used for side protocols */
default:
- goto out_unrec_xprt;
+ errors++;
+ dfprintk(MOUNT, "NFS: unrecognized "
+ "transport protocol\n");
}
break;
case Opt_addr:
string = match_strdup(args);
if (string == NULL)
goto out_nomem;
- nfs_parse_server_address(string, (struct sockaddr *)
- &mnt->nfs_server.address,
- &mnt->nfs_server.addrlen);
+ nfs_parse_ip_address(string, strlen(string),
+ (struct sockaddr *)
+ &mnt->nfs_server.address,
+ &mnt->nfs_server.addrlen);
kfree(string);
break;
case Opt_clientaddr:
@@ -1093,24 +1252,33 @@ static int nfs_parse_mount_options(char *raw,
string = match_strdup(args);
if (string == NULL)
goto out_nomem;
- nfs_parse_server_address(string, (struct sockaddr *)
- &mnt->mount_server.address,
- &mnt->mount_server.addrlen);
+ nfs_parse_ip_address(string, strlen(string),
+ (struct sockaddr *)
+ &mnt->mount_server.address,
+ &mnt->mount_server.addrlen);
kfree(string);
break;
+ /*
+ * Special options
+ */
+ case Opt_sloppy:
+ sloppy = 1;
+ dfprintk(MOUNT, "NFS: relaxing parsing rules\n");
+ break;
case Opt_userspace:
case Opt_deprecated:
+ dfprintk(MOUNT, "NFS: ignoring mount option "
+ "'%s'\n", p);
break;
default:
- goto out_unknown;
+ errors++;
+ dfprintk(MOUNT, "NFS: unrecognized mount option "
+ "'%s'\n", p);
}
}
- nfs_set_port((struct sockaddr *)&mnt->nfs_server.address,
- mnt->nfs_server.port);
-
return 1;
out_nomem:
@@ -1120,21 +1288,6 @@ out_security_failure:
free_secdata(secdata);
printk(KERN_INFO "NFS: security options invalid: %d\n", rc);
return 0;
-out_unrec_vers:
- printk(KERN_INFO "NFS: unrecognized NFS version number\n");
- return 0;
-
-out_unrec_xprt:
- printk(KERN_INFO "NFS: unrecognized transport protocol\n");
- return 0;
-
-out_unrec_sec:
- printk(KERN_INFO "NFS: unrecognized security flavor\n");
- return 0;
-
-out_unknown:
- printk(KERN_INFO "NFS: unknown mount option: %s\n", p);
- return 0;
}
/*
@@ -1188,11 +1341,146 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
if (status == 0)
return 0;
- dfprintk(MOUNT, "NFS: unable to mount server %s, error %d",
+ dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n",
hostname, status);
return status;
}
+static int nfs_parse_simple_hostname(const char *dev_name,
+ char **hostname, size_t maxnamlen,
+ char **export_path, size_t maxpathlen)
+{
+ size_t len;
+ char *colon, *comma;
+
+ colon = strchr(dev_name, ':');
+ if (colon == NULL)
+ goto out_bad_devname;
+
+ len = colon - dev_name;
+ if (len > maxnamlen)
+ goto out_hostname;
+
+ /* N.B. caller will free nfs_server.hostname in all cases */
+ *hostname = kstrndup(dev_name, len, GFP_KERNEL);
+ if (!*hostname)
+ goto out_nomem;
+
+ /* kill possible hostname list: not supported */
+ comma = strchr(*hostname, ',');
+ if (comma != NULL) {
+ if (comma == *hostname)
+ goto out_bad_devname;
+ *comma = '\0';
+ }
+
+ colon++;
+ len = strlen(colon);
+ if (len > maxpathlen)
+ goto out_path;
+ *export_path = kstrndup(colon, len, GFP_KERNEL);
+ if (!*export_path)
+ goto out_nomem;
+
+ dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path);
+ return 0;
+
+out_bad_devname:
+ dfprintk(MOUNT, "NFS: device name not in host:path format\n");
+ return -EINVAL;
+
+out_nomem:
+ dfprintk(MOUNT, "NFS: not enough memory to parse device name\n");
+ return -ENOMEM;
+
+out_hostname:
+ dfprintk(MOUNT, "NFS: server hostname too long\n");
+ return -ENAMETOOLONG;
+
+out_path:
+ dfprintk(MOUNT, "NFS: export pathname too long\n");
+ return -ENAMETOOLONG;
+}
+
+/*
+ * Hostname has square brackets around it because it contains one or
+ * more colons. We look for the first closing square bracket, and a
+ * colon must follow it.
+ */
+static int nfs_parse_protected_hostname(const char *dev_name,
+ char **hostname, size_t maxnamlen,
+ char **export_path, size_t maxpathlen)
+{
+ size_t len;
+ char *start, *end;
+
+ start = (char *)(dev_name + 1);
+
+ end = strchr(start, ']');
+ if (end == NULL)
+ goto out_bad_devname;
+ if (*(end + 1) != ':')
+ goto out_bad_devname;
+
+ len = end - start;
+ if (len > maxnamlen)
+ goto out_hostname;
+
+ /* N.B. caller will free nfs_server.hostname in all cases */
+ *hostname = kstrndup(start, len, GFP_KERNEL);
+ if (*hostname == NULL)
+ goto out_nomem;
+
+ end += 2;
+ len = strlen(end);
+ if (len > maxpathlen)
+ goto out_path;
+ *export_path = kstrndup(end, len, GFP_KERNEL);
+ if (!*export_path)
+ goto out_nomem;
+
+ return 0;
+
+out_bad_devname:
+ dfprintk(MOUNT, "NFS: device name not in host:path format\n");
+ return -EINVAL;
+
+out_nomem:
+ dfprintk(MOUNT, "NFS: not enough memory to parse device name\n");
+ return -ENOMEM;
+
+out_hostname:
+ dfprintk(MOUNT, "NFS: server hostname too long\n");
+ return -ENAMETOOLONG;
+
+out_path:
+ dfprintk(MOUNT, "NFS: export pathname too long\n");
+ return -ENAMETOOLONG;
+}
+
+/*
+ * Split "dev_name" into "hostname:export_path".
+ *
+ * The leftmost colon demarks the split between the server's hostname
+ * and the export path. If the hostname starts with a left square
+ * bracket, then it may contain colons.
+ *
+ * Note: caller frees hostname and export path, even on error.
+ */
+static int nfs_parse_devname(const char *dev_name,
+ char **hostname, size_t maxnamlen,
+ char **export_path, size_t maxpathlen)
+{
+ if (*dev_name == '[')
+ return nfs_parse_protected_hostname(dev_name,
+ hostname, maxnamlen,
+ export_path, maxpathlen);
+
+ return nfs_parse_simple_hostname(dev_name,
+ hostname, maxnamlen,
+ export_path, maxpathlen);
+}
+
/*
* Validate the NFS2/NFS3 mount data
* - fills in the mount root filehandle
@@ -1222,16 +1510,14 @@ static int nfs_validate_mount_data(void *options,
args->flags = (NFS_MOUNT_VER3 | NFS_MOUNT_TCP);
args->rsize = NFS_MAX_FILE_IO_SIZE;
args->wsize = NFS_MAX_FILE_IO_SIZE;
- args->timeo = 600;
- args->retrans = 2;
- args->acregmin = 3;
- args->acregmax = 60;
- args->acdirmin = 30;
- args->acdirmax = 60;
+ args->acregmin = NFS_DEF_ACREGMIN;
+ args->acregmax = NFS_DEF_ACREGMAX;
+ args->acdirmin = NFS_DEF_ACDIRMIN;
+ args->acdirmax = NFS_DEF_ACDIRMAX;
args->mount_server.port = 0; /* autobind unless user sets port */
- args->mount_server.protocol = XPRT_TRANSPORT_UDP;
args->nfs_server.port = 0; /* autobind unless user sets port */
args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+ args->auth_flavors[0] = RPC_AUTH_UNIX;
switch (data->version) {
case 1:
@@ -1289,7 +1575,9 @@ static int nfs_validate_mount_data(void *options,
args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL);
args->namlen = data->namlen;
args->bsize = data->bsize;
- args->auth_flavors[0] = data->pseudoflavor;
+
+ if (data->flags & NFS_MOUNT_SECFLAVOUR)
+ args->auth_flavors[0] = data->pseudoflavor;
if (!args->nfs_server.hostname)
goto out_nomem;
@@ -1321,8 +1609,6 @@ static int nfs_validate_mount_data(void *options,
break;
default: {
- unsigned int len;
- char *c;
int status;
if (nfs_parse_mount_options((char *)options, args) == 0)
@@ -1332,21 +1618,22 @@ static int nfs_validate_mount_data(void *options,
&args->nfs_server.address))
goto out_no_address;
- c = strchr(dev_name, ':');
- if (c == NULL)
- return -EINVAL;
- len = c - dev_name;
- /* N.B. caller will free nfs_server.hostname in all cases */
- args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL);
- if (!args->nfs_server.hostname)
- goto out_nomem;
+ nfs_set_port((struct sockaddr *)&args->nfs_server.address,
+ args->nfs_server.port);
- c++;
- if (strlen(c) > NFS_MAXPATHLEN)
- return -ENAMETOOLONG;
- args->nfs_server.export_path = c;
+ nfs_set_mount_transport_protocol(args);
+
+ status = nfs_parse_devname(dev_name,
+ &args->nfs_server.hostname,
+ PAGE_SIZE,
+ &args->nfs_server.export_path,
+ NFS_MAXPATHLEN);
+ if (!status)
+ status = nfs_try_mount(args, mntfh);
+
+ kfree(args->nfs_server.export_path);
+ args->nfs_server.export_path = NULL;
- status = nfs_try_mount(args, mntfh);
if (status)
return status;
@@ -1354,9 +1641,6 @@ static int nfs_validate_mount_data(void *options,
}
}
- if (!(args->flags & NFS_MOUNT_SECFLAVOUR))
- args->auth_flavors[0] = RPC_AUTH_UNIX;
-
#ifndef CONFIG_NFS_V3
if (args->flags & NFS_MOUNT_VER3)
goto out_v3_not_compiled;
@@ -1396,6 +1680,80 @@ out_invalid_fh:
return -EINVAL;
}
+static int
+nfs_compare_remount_data(struct nfs_server *nfss,
+ struct nfs_parsed_mount_data *data)
+{
+ if (data->flags != nfss->flags ||
+ data->rsize != nfss->rsize ||
+ data->wsize != nfss->wsize ||
+ data->retrans != nfss->client->cl_timeout->to_retries ||
+ data->auth_flavors[0] != nfss->client->cl_auth->au_flavor ||
+ data->acregmin != nfss->acregmin / HZ ||
+ data->acregmax != nfss->acregmax / HZ ||
+ data->acdirmin != nfss->acdirmin / HZ ||
+ data->acdirmax != nfss->acdirmax / HZ ||
+ data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) ||
+ data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen ||
+ memcmp(&data->nfs_server.address, &nfss->nfs_client->cl_addr,
+ data->nfs_server.addrlen) != 0)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int
+nfs_remount(struct super_block *sb, int *flags, char *raw_data)
+{
+ int error;
+ struct nfs_server *nfss = sb->s_fs_info;
+ struct nfs_parsed_mount_data *data;
+ struct nfs_mount_data *options = (struct nfs_mount_data *)raw_data;
+ struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data;
+ u32 nfsvers = nfss->nfs_client->rpc_ops->version;
+
+ /*
+ * Userspace mount programs that send binary options generally send
+ * them populated with default values. We have no way to know which
+ * ones were explicitly specified. Fall back to legacy behavior and
+ * just return success.
+ */
+ if ((nfsvers == 4 && options4->version == 1) ||
+ (nfsvers <= 3 && options->version >= 1 &&
+ options->version <= 6))
+ return 0;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (data == NULL)
+ return -ENOMEM;
+
+ /* fill out struct with values from existing mount */
+ data->flags = nfss->flags;
+ data->rsize = nfss->rsize;
+ data->wsize = nfss->wsize;
+ data->retrans = nfss->client->cl_timeout->to_retries;
+ data->auth_flavors[0] = nfss->client->cl_auth->au_flavor;
+ data->acregmin = nfss->acregmin / HZ;
+ data->acregmax = nfss->acregmax / HZ;
+ data->acdirmin = nfss->acdirmin / HZ;
+ data->acdirmax = nfss->acdirmax / HZ;
+ data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ;
+ data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen;
+ memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr,
+ data->nfs_server.addrlen);
+
+ /* overwrite those values with any that were specified */
+ error = nfs_parse_mount_options((char *)options, data);
+ if (error < 0)
+ goto out;
+
+ /* compare new mount options with old ones */
+ error = nfs_compare_remount_data(nfss, data);
+out:
+ kfree(data);
+ return error;
+}
+
/*
* Initialise the common bits of the superblock
*/
@@ -1811,14 +2169,13 @@ static int nfs4_validate_mount_data(void *options,
args->rsize = NFS_MAX_FILE_IO_SIZE;
args->wsize = NFS_MAX_FILE_IO_SIZE;
- args->timeo = 600;
- args->retrans = 2;
- args->acregmin = 3;
- args->acregmax = 60;
- args->acdirmin = 30;
- args->acdirmax = 60;
+ args->acregmin = NFS_DEF_ACREGMIN;
+ args->acregmax = NFS_DEF_ACREGMAX;
+ args->acdirmin = NFS_DEF_ACDIRMIN;
+ args->acdirmax = NFS_DEF_ACDIRMAX;
args->nfs_server.port = NFS_PORT; /* 2049 unless user set port= */
- args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+ args->auth_flavors[0] = RPC_AUTH_UNIX;
+ args->auth_flavor_len = 0;
switch (data->version) {
case 1:
@@ -1834,18 +2191,13 @@ static int nfs4_validate_mount_data(void *options,
&args->nfs_server.address))
goto out_no_address;
- switch (data->auth_flavourlen) {
- case 0:
- args->auth_flavors[0] = RPC_AUTH_UNIX;
- break;
- case 1:
+ if (data->auth_flavourlen) {
+ if (data->auth_flavourlen > 1)
+ goto out_inval_auth;
if (copy_from_user(&args->auth_flavors[0],
data->auth_flavours,
sizeof(args->auth_flavors[0])))
return -EFAULT;
- break;
- default:
- goto out_inval_auth;
}
c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN);
@@ -1879,10 +2231,11 @@ static int nfs4_validate_mount_data(void *options,
args->acdirmin = data->acdirmin;
args->acdirmax = data->acdirmax;
args->nfs_server.protocol = data->proto;
+ nfs_validate_transport_protocol(args);
break;
default: {
- unsigned int len;
+ int status;
if (nfs_parse_mount_options((char *)options, args) == 0)
return -EINVAL;
@@ -1891,44 +2244,25 @@ static int nfs4_validate_mount_data(void *options,
&args->nfs_server.address))
return -EINVAL;
- switch (args->auth_flavor_len) {
- case 0:
- args->auth_flavors[0] = RPC_AUTH_UNIX;
- break;
- case 1:
- break;
- default:
- goto out_inval_auth;
- }
+ nfs_set_port((struct sockaddr *)&args->nfs_server.address,
+ args->nfs_server.port);
- /*
- * Split "dev_name" into "hostname:mntpath".
- */
- c = strchr(dev_name, ':');
- if (c == NULL)
- return -EINVAL;
- /* while calculating len, pretend ':' is '\0' */
- len = c - dev_name;
- if (len > NFS4_MAXNAMLEN)
- return -ENAMETOOLONG;
- /* N.B. caller will free nfs_server.hostname in all cases */
- args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL);
- if (!args->nfs_server.hostname)
- goto out_nomem;
-
- c++; /* step over the ':' */
- len = strlen(c);
- if (len > NFS4_MAXPATHLEN)
- return -ENAMETOOLONG;
- args->nfs_server.export_path = kstrndup(c, len, GFP_KERNEL);
- if (!args->nfs_server.export_path)
- goto out_nomem;
+ nfs_validate_transport_protocol(args);
- dprintk("NFS: MNTPATH: '%s'\n", args->nfs_server.export_path);
+ if (args->auth_flavor_len > 1)
+ goto out_inval_auth;
if (args->client_address == NULL)
goto out_no_client_address;
+ status = nfs_parse_devname(dev_name,
+ &args->nfs_server.hostname,
+ NFS4_MAXNAMLEN,
+ &args->nfs_server.export_path,
+ NFS4_MAXPATHLEN);
+ if (status < 0)
+ return status;
+
break;
}
}
@@ -1944,10 +2278,6 @@ out_inval_auth:
data->auth_flavourlen);
return -EINVAL;
-out_nomem:
- dfprintk(MOUNT, "NFS4: not enough memory to handle mount options\n");
- return -ENOMEM;
-
out_no_address:
dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n");
return -EINVAL;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f333848fd3be..3229e217c773 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -34,9 +34,6 @@
/*
* Local function declarations
*/
-static struct nfs_page * nfs_update_request(struct nfs_open_context*,
- struct page *,
- unsigned int, unsigned int);
static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc,
struct inode *inode, int ioflags);
static void nfs_redirty_request(struct nfs_page *req);
@@ -136,16 +133,21 @@ static struct nfs_page *nfs_page_find_request(struct page *page)
static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count)
{
struct inode *inode = page->mapping->host;
- loff_t end, i_size = i_size_read(inode);
- pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
+ loff_t end, i_size;
+ pgoff_t end_index;
+ spin_lock(&inode->i_lock);
+ i_size = i_size_read(inode);
+ end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
if (i_size > 0 && page->index < end_index)
- return;
+ goto out;
end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count);
if (i_size >= end)
- return;
- nfs_inc_stats(inode, NFSIOS_EXTENDWRITE);
+ goto out;
i_size_write(inode, end);
+ nfs_inc_stats(inode, NFSIOS_EXTENDWRITE);
+out:
+ spin_unlock(&inode->i_lock);
}
/* A writeback failed: mark the page as bad, and invalidate the page cache */
@@ -169,29 +171,6 @@ static void nfs_mark_uptodate(struct page *page, unsigned int base, unsigned int
SetPageUptodate(page);
}
-static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
- unsigned int offset, unsigned int count)
-{
- struct nfs_page *req;
- int ret;
-
- for (;;) {
- req = nfs_update_request(ctx, page, offset, count);
- if (!IS_ERR(req))
- break;
- ret = PTR_ERR(req);
- if (ret != -EBUSY)
- return ret;
- ret = nfs_wb_page(page->mapping->host, page);
- if (ret != 0)
- return ret;
- }
- /* Update file length */
- nfs_grow_file(page, offset, count);
- nfs_clear_page_tag_locked(req);
- return 0;
-}
-
static int wb_priority(struct writeback_control *wbc)
{
if (wbc->for_reclaim)
@@ -268,12 +247,9 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
return ret;
spin_lock(&inode->i_lock);
}
- if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) {
- /* This request is marked for commit */
+ if (test_bit(PG_CLEAN, &req->wb_flags)) {
spin_unlock(&inode->i_lock);
- nfs_clear_page_tag_locked(req);
- nfs_pageio_complete(pgio);
- return 0;
+ BUG();
}
if (nfs_set_page_writeback(page) != 0) {
spin_unlock(&inode->i_lock);
@@ -355,11 +331,19 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
/*
* Insert a write request into an inode
*/
-static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
+static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
{
struct nfs_inode *nfsi = NFS_I(inode);
int error;
+ error = radix_tree_preload(GFP_NOFS);
+ if (error != 0)
+ goto out;
+
+ /* Lock the request! */
+ nfs_lock_request_dontget(req);
+
+ spin_lock(&inode->i_lock);
error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req);
BUG_ON(error);
if (!nfsi->npages) {
@@ -373,6 +357,10 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
kref_get(&req->wb_kref);
radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
NFS_PAGE_TAG_LOCKED);
+ spin_unlock(&inode->i_lock);
+ radix_tree_preload_end();
+out:
+ return error;
}
/*
@@ -405,19 +393,6 @@ nfs_mark_request_dirty(struct nfs_page *req)
__set_page_dirty_nobuffers(req->wb_page);
}
-/*
- * Check if a request is dirty
- */
-static inline int
-nfs_dirty_request(struct nfs_page *req)
-{
- struct page *page = req->wb_page;
-
- if (page == NULL || test_bit(PG_NEED_COMMIT, &req->wb_flags))
- return 0;
- return !PageWriteback(page);
-}
-
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
/*
* Add a request to the inode's commit list.
@@ -430,7 +405,7 @@ nfs_mark_request_commit(struct nfs_page *req)
spin_lock(&inode->i_lock);
nfsi->ncommit++;
- set_bit(PG_NEED_COMMIT, &(req)->wb_flags);
+ set_bit(PG_CLEAN, &(req)->wb_flags);
radix_tree_tag_set(&nfsi->nfs_page_tree,
req->wb_index,
NFS_PAGE_TAG_COMMIT);
@@ -440,6 +415,19 @@ nfs_mark_request_commit(struct nfs_page *req)
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
}
+static int
+nfs_clear_request_commit(struct nfs_page *req)
+{
+ struct page *page = req->wb_page;
+
+ if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) {
+ dec_zone_page_state(page, NR_UNSTABLE_NFS);
+ dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE);
+ return 1;
+ }
+ return 0;
+}
+
static inline
int nfs_write_need_commit(struct nfs_write_data *data)
{
@@ -449,7 +437,7 @@ int nfs_write_need_commit(struct nfs_write_data *data)
static inline
int nfs_reschedule_unstable_write(struct nfs_page *req)
{
- if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) {
+ if (test_and_clear_bit(PG_NEED_COMMIT, &req->wb_flags)) {
nfs_mark_request_commit(req);
return 1;
}
@@ -465,6 +453,12 @@ nfs_mark_request_commit(struct nfs_page *req)
{
}
+static inline int
+nfs_clear_request_commit(struct nfs_page *req)
+{
+ return 0;
+}
+
static inline
int nfs_write_need_commit(struct nfs_write_data *data)
{
@@ -522,11 +516,8 @@ static void nfs_cancel_commit_list(struct list_head *head)
while(!list_empty(head)) {
req = nfs_list_entry(head->next);
- dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
- dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
- BDI_RECLAIMABLE);
nfs_list_remove_request(req);
- clear_bit(PG_NEED_COMMIT, &(req)->wb_flags);
+ nfs_clear_request_commit(req);
nfs_inode_remove_request(req);
nfs_unlock_request(req);
}
@@ -564,110 +555,124 @@ static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pg
#endif
/*
- * Try to update any existing write request, or create one if there is none.
- * In order to match, the request's credentials must match those of
- * the calling process.
+ * Search for an existing write request, and attempt to update
+ * it to reflect a new dirty region on a given page.
*
- * Note: Should always be called with the Page Lock held!
+ * If the attempt fails, then the existing request is flushed out
+ * to disk.
*/
-static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
- struct page *page, unsigned int offset, unsigned int bytes)
+static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
+ struct page *page,
+ unsigned int offset,
+ unsigned int bytes)
{
- struct address_space *mapping = page->mapping;
- struct inode *inode = mapping->host;
- struct nfs_page *req, *new = NULL;
- pgoff_t rqend, end;
+ struct nfs_page *req;
+ unsigned int rqend;
+ unsigned int end;
+ int error;
+
+ if (!PagePrivate(page))
+ return NULL;
end = offset + bytes;
+ spin_lock(&inode->i_lock);
for (;;) {
- /* Loop over all inode entries and see if we find
- * A request for the page we wish to update
+ req = nfs_page_find_request_locked(page);
+ if (req == NULL)
+ goto out_unlock;
+
+ rqend = req->wb_offset + req->wb_bytes;
+ /*
+ * Tell the caller to flush out the request if
+ * the offsets are non-contiguous.
+ * Note: nfs_flush_incompatible() will already
+ * have flushed out requests having wrong owners.
*/
- if (new) {
- if (radix_tree_preload(GFP_NOFS)) {
- nfs_release_request(new);
- return ERR_PTR(-ENOMEM);
- }
- }
+ if (offset > rqend
+ || end < req->wb_offset)
+ goto out_flushme;
- spin_lock(&inode->i_lock);
- req = nfs_page_find_request_locked(page);
- if (req) {
- if (!nfs_set_page_tag_locked(req)) {
- int error;
-
- spin_unlock(&inode->i_lock);
- error = nfs_wait_on_request(req);
- nfs_release_request(req);
- if (error < 0) {
- if (new) {
- radix_tree_preload_end();
- nfs_release_request(new);
- }
- return ERR_PTR(error);
- }
- continue;
- }
- spin_unlock(&inode->i_lock);
- if (new) {
- radix_tree_preload_end();
- nfs_release_request(new);
- }
+ if (nfs_set_page_tag_locked(req))
break;
- }
- if (new) {
- nfs_lock_request_dontget(new);
- nfs_inode_add_request(inode, new);
- spin_unlock(&inode->i_lock);
- radix_tree_preload_end();
- req = new;
- goto zero_page;
- }
+ /* The request is locked, so wait and then retry */
spin_unlock(&inode->i_lock);
-
- new = nfs_create_request(ctx, inode, page, offset, bytes);
- if (IS_ERR(new))
- return new;
+ error = nfs_wait_on_request(req);
+ nfs_release_request(req);
+ if (error != 0)
+ goto out_err;
+ spin_lock(&inode->i_lock);
}
- /* We have a request for our page.
- * If the creds don't match, or the
- * page addresses don't match,
- * tell the caller to wait on the conflicting
- * request.
- */
- rqend = req->wb_offset + req->wb_bytes;
- if (req->wb_context != ctx
- || req->wb_page != page
- || !nfs_dirty_request(req)
- || offset > rqend || end < req->wb_offset) {
- nfs_clear_page_tag_locked(req);
- return ERR_PTR(-EBUSY);
- }
+ if (nfs_clear_request_commit(req))
+ radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree,
+ req->wb_index, NFS_PAGE_TAG_COMMIT);
/* Okay, the request matches. Update the region */
if (offset < req->wb_offset) {
req->wb_offset = offset;
req->wb_pgbase = offset;
- req->wb_bytes = max(end, rqend) - req->wb_offset;
- goto zero_page;
}
-
if (end > rqend)
req->wb_bytes = end - req->wb_offset;
-
+ else
+ req->wb_bytes = rqend - req->wb_offset;
+out_unlock:
+ spin_unlock(&inode->i_lock);
return req;
-zero_page:
- /* If this page might potentially be marked as up to date,
- * then we need to zero any uninitalised data. */
- if (req->wb_pgbase == 0 && req->wb_bytes != PAGE_CACHE_SIZE
- && !PageUptodate(req->wb_page))
- zero_user_segment(req->wb_page, req->wb_bytes, PAGE_CACHE_SIZE);
+out_flushme:
+ spin_unlock(&inode->i_lock);
+ nfs_release_request(req);
+ error = nfs_wb_page(inode, page);
+out_err:
+ return ERR_PTR(error);
+}
+
+/*
+ * Try to update an existing write request, or create one if there is none.
+ *
+ * Note: Should always be called with the Page Lock held to prevent races
+ * if we have to add a new request. Also assumes that the caller has
+ * already called nfs_flush_incompatible() if necessary.
+ */
+static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
+ struct page *page, unsigned int offset, unsigned int bytes)
+{
+ struct inode *inode = page->mapping->host;
+ struct nfs_page *req;
+ int error;
+
+ req = nfs_try_to_update_request(inode, page, offset, bytes);
+ if (req != NULL)
+ goto out;
+ req = nfs_create_request(ctx, inode, page, offset, bytes);
+ if (IS_ERR(req))
+ goto out;
+ error = nfs_inode_add_request(inode, req);
+ if (error != 0) {
+ nfs_release_request(req);
+ req = ERR_PTR(error);
+ }
+out:
return req;
}
+static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
+ unsigned int offset, unsigned int count)
+{
+ struct nfs_page *req;
+
+ req = nfs_setup_write_request(ctx, page, offset, count);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ /* Update file length */
+ nfs_grow_file(page, offset, count);
+ nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
+ nfs_clear_page_tag_locked(req);
+ return 0;
+}
+
int nfs_flush_incompatible(struct file *file, struct page *page)
{
struct nfs_open_context *ctx = nfs_file_open_context(file);
@@ -685,8 +690,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
req = nfs_page_find_request(page);
if (req == NULL)
return 0;
- do_flush = req->wb_page != page || req->wb_context != ctx
- || !nfs_dirty_request(req);
+ do_flush = req->wb_page != page || req->wb_context != ctx;
nfs_release_request(req);
if (!do_flush)
return 0;
@@ -721,10 +725,10 @@ int nfs_updatepage(struct file *file, struct page *page,
nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE);
- dprintk("NFS: nfs_updatepage(%s/%s %d@%Ld)\n",
+ dprintk("NFS: nfs_updatepage(%s/%s %d@%lld)\n",
file->f_path.dentry->d_parent->d_name.name,
file->f_path.dentry->d_name.name, count,
- (long long)(page_offset(page) +offset));
+ (long long)(page_offset(page) + offset));
/* If we're not using byte range locks, and we know the page
* is up to date, it may be more efficient to extend the write
@@ -744,7 +748,7 @@ int nfs_updatepage(struct file *file, struct page *page,
else
__set_page_dirty_nobuffers(page);
- dprintk("NFS: nfs_updatepage returns %d (isize %Ld)\n",
+ dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n",
status, (long long)i_size_read(inode));
return status;
}
@@ -752,12 +756,7 @@ int nfs_updatepage(struct file *file, struct page *page,
static void nfs_writepage_release(struct nfs_page *req)
{
- if (PageError(req->wb_page)) {
- nfs_end_page_writeback(req->wb_page);
- nfs_inode_remove_request(req);
- } else if (!nfs_reschedule_unstable_write(req)) {
- /* Set the PG_uptodate flag */
- nfs_mark_uptodate(req->wb_page, req->wb_pgbase, req->wb_bytes);
+ if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) {
nfs_end_page_writeback(req->wb_page);
nfs_inode_remove_request(req);
} else
@@ -834,7 +833,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
NFS_PROTO(inode)->write_setup(data, &msg);
dprintk("NFS: %5u initiated write call "
- "(req %s/%Ld, %u bytes @ offset %Lu)\n",
+ "(req %s/%lld, %u bytes @ offset %llu)\n",
data->task.tk_pid,
inode->i_sb->s_id,
(long long)NFS_FILEID(inode),
@@ -978,13 +977,13 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata)
{
struct nfs_write_data *data = calldata;
- struct nfs_page *req = data->req;
- dprintk("NFS: write (%s/%Ld %d@%Ld)",
- req->wb_context->path.dentry->d_inode->i_sb->s_id,
- (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
- req->wb_bytes,
- (long long)req_offset(req));
+ dprintk("NFS: %5u write(%s/%lld %d@%lld)",
+ task->tk_pid,
+ data->req->wb_context->path.dentry->d_inode->i_sb->s_id,
+ (long long)
+ NFS_FILEID(data->req->wb_context->path.dentry->d_inode),
+ data->req->wb_bytes, (long long)req_offset(data->req));
nfs_writeback_done(task, data);
}
@@ -1058,7 +1057,8 @@ static void nfs_writeback_release_full(void *calldata)
nfs_list_remove_request(req);
- dprintk("NFS: write (%s/%Ld %d@%Ld)",
+ dprintk("NFS: %5u write (%s/%lld %d@%lld)",
+ data->task.tk_pid,
req->wb_context->path.dentry->d_inode->i_sb->s_id,
(long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
req->wb_bytes,
@@ -1078,8 +1078,6 @@ static void nfs_writeback_release_full(void *calldata)
dprintk(" marked for commit\n");
goto next;
}
- /* Set the PG_uptodate flag? */
- nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
dprintk(" OK\n");
remove_request:
nfs_end_page_writeback(page);
@@ -1133,7 +1131,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
static unsigned long complain;
if (time_before(complain, jiffies)) {
- dprintk("NFS: faulty NFS server %s:"
+ dprintk("NFS: faulty NFS server %s:"
" (committed = %d) != (stable = %d)\n",
NFS_SERVER(data->inode)->nfs_client->cl_hostname,
resp->verf->committed, argp->stable);
@@ -1297,12 +1295,9 @@ static void nfs_commit_release(void *calldata)
while (!list_empty(&data->pages)) {
req = nfs_list_entry(data->pages.next);
nfs_list_remove_request(req);
- clear_bit(PG_NEED_COMMIT, &(req)->wb_flags);
- dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
- dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
- BDI_RECLAIMABLE);
+ nfs_clear_request_commit(req);
- dprintk("NFS: commit (%s/%Ld %d@%Ld)",
+ dprintk("NFS: commit (%s/%lld %d@%lld)",
req->wb_context->path.dentry->d_inode->i_sb->s_id,
(long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
req->wb_bytes,
@@ -1318,9 +1313,6 @@ static void nfs_commit_release(void *calldata)
* returned by the server against all stored verfs. */
if (!memcmp(req->wb_verf.verifier, data->verf.verifier, sizeof(data->verf.verifier))) {
/* We have a match */
- /* Set the PG_uptodate flag */
- nfs_mark_uptodate(req->wb_page, req->wb_pgbase,
- req->wb_bytes);
nfs_inode_remove_request(req);
dprintk(" OK\n");
goto next;
@@ -1479,7 +1471,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
req = nfs_page_find_request(page);
if (req == NULL)
goto out;
- if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) {
+ if (test_bit(PG_CLEAN, &req->wb_flags)) {
nfs_release_request(req);
break;
}
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 4d4760e687c3..702fa577aa6e 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -381,7 +381,7 @@ static int do_probe_callback(void *data)
.program = &cb_program,
.version = nfs_cb_version[1]->number,
.authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */
- .flags = (RPC_CLNT_CREATE_NOPING),
+ .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
};
struct rpc_message msg = {
.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index c021280dd462..bd7e0f3acfc7 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -21,6 +21,7 @@
#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/mutex.h>
+#include <linux/smp_lock.h>
#include <linux/reboot.h>
#include <asm/uaccess.h>
@@ -619,10 +620,12 @@ static int ocfs2_control_open(struct inode *inode, struct file *file)
return -ENOMEM;
p->op_this_node = -1;
+ lock_kernel();
mutex_lock(&ocfs2_control_lock);
file->private_data = p;
list_add(&p->op_list, &ocfs2_control_private_list);
mutex_unlock(&ocfs2_control_lock);
+ unlock_kernel();
return 0;
}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3b455371e7ff..58c3e6a8e15e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -233,7 +233,7 @@ static int check_mem_permission(struct task_struct *task)
*/
if (task->parent == current && (task->ptrace & PT_PTRACED) &&
task_is_stopped_or_traced(task) &&
- ptrace_may_attach(task))
+ ptrace_may_access(task, PTRACE_MODE_ATTACH))
return 0;
/*
@@ -251,7 +251,8 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
task_lock(task);
if (task->mm != mm)
goto out;
- if (task->mm != current->mm && __ptrace_may_attach(task) < 0)
+ if (task->mm != current->mm &&
+ __ptrace_may_access(task, PTRACE_MODE_READ) < 0)
goto out;
task_unlock(task);
return mm;
@@ -518,7 +519,7 @@ static int proc_fd_access_allowed(struct inode *inode)
*/
task = get_proc_task(inode);
if (task) {
- allowed = ptrace_may_attach(task);
+ allowed = ptrace_may_access(task, PTRACE_MODE_READ);
put_task_struct(task);
}
return allowed;
@@ -904,7 +905,7 @@ static ssize_t environ_read(struct file *file, char __user *buf,
if (!task)
goto out_no_task;
- if (!ptrace_may_attach(task))
+ if (!ptrace_may_access(task, PTRACE_MODE_READ))
goto out;
ret = -ENOMEM;
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 7e277f2ad466..c652d469dc08 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -123,6 +123,11 @@ static int uptime_read_proc(char *page, char **start, off_t off,
return proc_calc_metrics(page, start, off, count, eof, len);
}
+int __attribute__((weak)) arch_report_meminfo(char *page)
+{
+ return 0;
+}
+
static int meminfo_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
{
@@ -221,6 +226,8 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
len += hugetlb_report_meminfo(page + len);
+ len += arch_report_meminfo(page + len);
+
return proc_calc_metrics(page, start, off, count, eof, len);
#undef K
}
@@ -472,6 +479,13 @@ static const struct file_operations proc_vmalloc_operations = {
};
#endif
+#ifndef arch_irq_stat_cpu
+#define arch_irq_stat_cpu(cpu) 0
+#endif
+#ifndef arch_irq_stat
+#define arch_irq_stat() 0
+#endif
+
static int show_stat(struct seq_file *p, void *v)
{
int i;
@@ -509,7 +523,9 @@ static int show_stat(struct seq_file *p, void *v)
sum += temp;
per_irq_sum[j] += temp;
}
+ sum += arch_irq_stat_cpu(i);
}
+ sum += arch_irq_stat();
seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
(unsigned long long)cputime64_to_clock_t(user),
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c492449f3b45..164bd9f9ede3 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -210,7 +210,7 @@ static int show_map(struct seq_file *m, void *v)
dev_t dev = 0;
int len;
- if (maps_protect && !ptrace_may_attach(task))
+ if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
return -EACCES;
if (file) {
@@ -646,7 +646,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
goto out;
ret = -EACCES;
- if (!ptrace_may_attach(task))
+ if (!ptrace_may_access(task, PTRACE_MODE_READ))
goto out_task;
ret = -EINVAL;
@@ -747,7 +747,7 @@ static int show_numa_map_checked(struct seq_file *m, void *v)
struct proc_maps_private *priv = m->private;
struct task_struct *task = priv->task;
- if (maps_protect && !ptrace_may_attach(task))
+ if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
return -EACCES;
return show_numa_map(m, v);
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 4b4f9cc2f186..5d84e7121df8 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -113,7 +113,7 @@ static int show_map(struct seq_file *m, void *_vml)
struct proc_maps_private *priv = m->private;
struct task_struct *task = priv->task;
- if (maps_protect && !ptrace_may_attach(task))
+ if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
return -EACCES;
return nommu_vma_show(m, vml->vma);
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 9590b9024300..78f613cb9c76 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -45,6 +45,7 @@ const struct file_operations ramfs_file_operations = {
.mmap = generic_file_mmap,
.fsync = simple_sync_file,
.splice_read = generic_file_splice_read,
+ .splice_write = generic_file_splice_write,
.llseek = generic_file_llseek,
};
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 0989bc2c2f69..52312ec93ff4 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -43,6 +43,7 @@ const struct file_operations ramfs_file_operations = {
.aio_write = generic_file_aio_write,
.fsync = simple_sync_file,
.splice_read = generic_file_splice_read,
+ .splice_write = generic_file_splice_write,
.llseek = generic_file_llseek,
};
diff --git a/fs/read_write.c b/fs/read_write.c
index f0d1240a5c69..9ba495d5a29b 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -31,12 +31,12 @@ const struct file_operations generic_ro_fops = {
EXPORT_SYMBOL(generic_ro_fops);
-loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
+loff_t
+generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
{
loff_t retval;
struct inode *inode = file->f_mapping->host;
- mutex_lock(&inode->i_mutex);
switch (origin) {
case SEEK_END:
offset += inode->i_size;
@@ -46,42 +46,26 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
}
retval = -EINVAL;
if (offset>=0 && offset<=inode->i_sb->s_maxbytes) {
+ /* Special lock needed here? */
if (offset != file->f_pos) {
file->f_pos = offset;
file->f_version = 0;
}
retval = offset;
}
- mutex_unlock(&inode->i_mutex);
return retval;
}
+EXPORT_SYMBOL(generic_file_llseek_unlocked);
-EXPORT_SYMBOL(generic_file_llseek);
-
-loff_t remote_llseek(struct file *file, loff_t offset, int origin)
+loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
{
- loff_t retval;
-
- lock_kernel();
- switch (origin) {
- case SEEK_END:
- offset += i_size_read(file->f_path.dentry->d_inode);
- break;
- case SEEK_CUR:
- offset += file->f_pos;
- }
- retval = -EINVAL;
- if (offset>=0 && offset<=file->f_path.dentry->d_inode->i_sb->s_maxbytes) {
- if (offset != file->f_pos) {
- file->f_pos = offset;
- file->f_version = 0;
- }
- retval = offset;
- }
- unlock_kernel();
- return retval;
+ loff_t n;
+ mutex_lock(&file->f_dentry->d_inode->i_mutex);
+ n = generic_file_llseek_unlocked(file, offset, origin);
+ mutex_unlock(&file->f_dentry->d_inode->i_mutex);
+ return n;
}
-EXPORT_SYMBOL(remote_llseek);
+EXPORT_SYMBOL(generic_file_llseek);
loff_t no_llseek(struct file *file, loff_t offset, int origin)
{
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index efbe29af3d7a..2294783320cb 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -422,9 +422,18 @@ smb_file_permission(struct inode *inode, int mask, struct nameidata *nd)
return error;
}
+static loff_t smb_remote_llseek(struct file *file, loff_t offset, int origin)
+{
+ loff_t ret;
+ lock_kernel();
+ ret = generic_file_llseek_unlocked(file, offset, origin);
+ unlock_kernel();
+ return ret;
+}
+
const struct file_operations smb_file_operations =
{
- .llseek = remote_llseek,
+ .llseek = smb_remote_llseek,
.read = do_sync_read,
.aio_read = smb_file_aio_read,
.write = do_sync_write,
diff --git a/fs/splice.c b/fs/splice.c
index aa5f6f60b305..399442179d89 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -379,13 +379,22 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
lock_page(page);
/*
- * page was truncated, stop here. if this isn't the
- * first page, we'll just complete what we already
- * added
+ * Page was truncated, or invalidated by the
+ * filesystem. Redo the find/create, but this time the
+ * page is kept locked, so there's no chance of another
+ * race with truncate/invalidate.
*/
if (!page->mapping) {
unlock_page(page);
- break;
+ page = find_or_create_page(mapping, index,
+ mapping_gfp_mask(mapping));
+
+ if (!page) {
+ error = -ENOMEM;
+ break;
+ }
+ page_cache_release(pages[page_nr]);
+ pages[page_nr] = page;
}
/*
* page was already under io and is now done, great
diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c
index a3522727ea5b..b546ba69be82 100644
--- a/fs/vfat/namei.c
+++ b/fs/vfat/namei.c
@@ -645,7 +645,7 @@ static int vfat_add_entry(struct inode *dir, struct qstr *qname, int is_dir,
if (len == 0)
return -ENOENT;
- slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_KERNEL);
+ slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_NOFS);
if (slots == NULL)
return -ENOMEM;
@@ -687,7 +687,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
struct dentry *alias;
int err, table;
- lock_kernel();
+ lock_super(sb);
table = (MSDOS_SB(sb)->options.name_check == 's') ? 2 : 0;
dentry->d_op = &vfat_dentry_ops[table];
@@ -699,7 +699,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
brelse(sinfo.bh);
if (IS_ERR(inode)) {
- unlock_kernel();
+ unlock_super(sb);
return ERR_CAST(inode);
}
alias = d_find_alias(inode);
@@ -708,13 +708,13 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
dput(alias);
else {
iput(inode);
- unlock_kernel();
+ unlock_super(sb);
return alias;
}
}
error:
- unlock_kernel();
+ unlock_super(sb);
dentry->d_op = &vfat_dentry_ops[table];
dentry->d_time = dentry->d_parent->d_inode->i_version;
dentry = d_splice_alias(inode, dentry);
@@ -734,7 +734,7 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, int mode,
struct timespec ts;
int err;
- lock_kernel();
+ lock_super(sb);
ts = CURRENT_TIME_SEC;
err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo);
@@ -755,17 +755,18 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, int mode,
dentry->d_time = dentry->d_parent->d_inode->i_version;
d_instantiate(dentry, inode);
out:
- unlock_kernel();
+ unlock_super(sb);
return err;
}
static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
+ struct super_block *sb = dir->i_sb;
struct fat_slot_info sinfo;
int err;
- lock_kernel();
+ lock_super(sb);
err = fat_dir_empty(inode);
if (err)
@@ -783,7 +784,7 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
fat_detach(inode);
out:
- unlock_kernel();
+ unlock_super(sb);
return err;
}
@@ -791,10 +792,11 @@ out:
static int vfat_unlink(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
+ struct super_block *sb = dir->i_sb;
struct fat_slot_info sinfo;
int err;
- lock_kernel();
+ lock_super(sb);
err = vfat_find(dir, &dentry->d_name, &sinfo);
if (err)
@@ -807,7 +809,7 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry)
inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
fat_detach(inode);
out:
- unlock_kernel();
+ unlock_super(sb);
return err;
}
@@ -820,7 +822,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
struct timespec ts;
int err, cluster;
- lock_kernel();
+ lock_super(sb);
ts = CURRENT_TIME_SEC;
cluster = fat_alloc_new_dir(dir, &ts);
@@ -849,13 +851,13 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
dentry->d_time = dentry->d_parent->d_inode->i_version;
d_instantiate(dentry, inode);
- unlock_kernel();
+ unlock_super(sb);
return 0;
out_free:
fat_free_clusters(dir, cluster);
out:
- unlock_kernel();
+ unlock_super(sb);
return err;
}
@@ -869,11 +871,12 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
struct timespec ts;
loff_t dotdot_i_pos, new_i_pos;
int err, is_dir, update_dotdot, corrupt = 0;
+ struct super_block *sb = old_dir->i_sb;
old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
old_inode = old_dentry->d_inode;
new_inode = new_dentry->d_inode;
- lock_kernel();
+ lock_super(sb);
err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo);
if (err)
goto out;
@@ -951,7 +954,7 @@ out:
brelse(sinfo.bh);
brelse(dotdot_bh);
brelse(old_sinfo.bh);
- unlock_kernel();
+ unlock_super(sb);
return err;