summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--fs/userfaultfd.c148
-rw-r--r--include/linux/userfaultfd_k.h13
-rw-r--r--include/uapi/linux/userfaultfd.h15
-rw-r--r--kernel/fork.c10
4 files changed, 170 insertions, 16 deletions
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 87d31921b66c..6046e0b552b2 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -64,6 +64,12 @@ struct userfaultfd_ctx {
struct mm_struct *mm;
};
+struct userfaultfd_fork_ctx {
+ struct userfaultfd_ctx *orig;
+ struct userfaultfd_ctx *new;
+ struct list_head list;
+};
+
struct userfaultfd_wait_queue {
struct uffd_msg msg;
wait_queue_t wq;
@@ -465,9 +471,8 @@ out:
return ret;
}
-static int __maybe_unused userfaultfd_event_wait_completion(
- struct userfaultfd_ctx *ctx,
- struct userfaultfd_wait_queue *ewq)
+static int userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
+ struct userfaultfd_wait_queue *ewq)
{
int ret = 0;
@@ -518,6 +523,79 @@ static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
}
+int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
+{
+ struct userfaultfd_ctx *ctx = NULL, *octx;
+ struct userfaultfd_fork_ctx *fctx;
+
+ octx = vma->vm_userfaultfd_ctx.ctx;
+ if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
+ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+ vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
+ return 0;
+ }
+
+ list_for_each_entry(fctx, fcs, list)
+ if (fctx->orig == octx) {
+ ctx = fctx->new;
+ break;
+ }
+
+ if (!ctx) {
+ fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
+ if (!fctx)
+ return -ENOMEM;
+
+ ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
+ if (!ctx) {
+ kfree(fctx);
+ return -ENOMEM;
+ }
+
+ atomic_set(&ctx->refcount, 1);
+ ctx->flags = octx->flags;
+ ctx->state = UFFD_STATE_RUNNING;
+ ctx->features = octx->features;
+ ctx->released = false;
+ ctx->mm = vma->vm_mm;
+ atomic_inc(&ctx->mm->mm_users);
+
+ userfaultfd_ctx_get(octx);
+ fctx->orig = octx;
+ fctx->new = ctx;
+ list_add_tail(&fctx->list, fcs);
+ }
+
+ vma->vm_userfaultfd_ctx.ctx = ctx;
+ return 0;
+}
+
+static int dup_fctx(struct userfaultfd_fork_ctx *fctx)
+{
+ struct userfaultfd_ctx *ctx = fctx->orig;
+ struct userfaultfd_wait_queue ewq;
+
+ msg_init(&ewq.msg);
+
+ ewq.msg.event = UFFD_EVENT_FORK;
+ ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;
+
+ return userfaultfd_event_wait_completion(ctx, &ewq);
+}
+
+void dup_userfaultfd_complete(struct list_head *fcs)
+{
+ int ret = 0;
+ struct userfaultfd_fork_ctx *fctx, *n;
+
+ list_for_each_entry_safe(fctx, n, fcs, list) {
+ if (!ret)
+ ret = dup_fctx(fctx);
+ list_del(&fctx->list);
+ kfree(fctx);
+ }
+}
+
static int userfaultfd_release(struct inode *inode, struct file *file)
{
struct userfaultfd_ctx *ctx = file->private_data;
@@ -653,12 +731,49 @@ static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
}
}
+static const struct file_operations userfaultfd_fops;
+
+static int resolve_userfault_fork(struct userfaultfd_ctx *ctx,
+ struct userfaultfd_ctx *new,
+ struct uffd_msg *msg)
+{
+ int fd;
+ struct file *file;
+ unsigned int flags = new->flags & UFFD_SHARED_FCNTL_FLAGS;
+
+ fd = get_unused_fd_flags(flags);
+ if (fd < 0)
+ return fd;
+
+ file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, new,
+ O_RDWR | flags);
+ if (IS_ERR(file)) {
+ put_unused_fd(fd);
+ return PTR_ERR(file);
+ }
+
+ fd_install(fd, file);
+ msg->arg.reserved.reserved1 = 0;
+ msg->arg.fork.ufd = fd;
+
+ return 0;
+}
+
static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
struct uffd_msg *msg)
{
ssize_t ret;
DECLARE_WAITQUEUE(wait, current);
struct userfaultfd_wait_queue *uwq;
+ /*
+ * Handling fork event requires sleeping operations, so
+ * we drop the event_wqh lock, then do these ops, then
+ * lock it back and wake up the waiter. While the lock is
+ * dropped the ewq may go away so we keep track of it
+ * carefully.
+ */
+ LIST_HEAD(fork_event);
+ struct userfaultfd_ctx *fork_nctx = NULL;
/* always take the fd_wqh lock before the fault_pending_wqh lock */
spin_lock(&ctx->fd_wqh.lock);
@@ -716,6 +831,16 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
if (uwq) {
*msg = uwq->msg;
+ if (uwq->msg.event == UFFD_EVENT_FORK) {
+ fork_nctx = (struct userfaultfd_ctx *)
+ (unsigned long)
+ uwq->msg.arg.reserved.reserved1;
+ list_move(&uwq->wq.task_list, &fork_event);
+ spin_unlock(&ctx->event_wqh.lock);
+ ret = 0;
+ break;
+ }
+
userfaultfd_event_complete(ctx, uwq);
spin_unlock(&ctx->event_wqh.lock);
ret = 0;
@@ -739,6 +864,23 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
__set_current_state(TASK_RUNNING);
spin_unlock(&ctx->fd_wqh.lock);
+ if (!ret && msg->event == UFFD_EVENT_FORK) {
+ ret = resolve_userfault_fork(ctx, fork_nctx, msg);
+
+ if (!ret) {
+ spin_lock(&ctx->event_wqh.lock);
+ if (!list_empty(&fork_event)) {
+ uwq = list_first_entry(&fork_event,
+ typeof(*uwq),
+ wq.task_list);
+ list_del(&uwq->wq.task_list);
+ __add_wait_queue(&ctx->event_wqh, &uwq->wq);
+ userfaultfd_event_complete(ctx, uwq);
+ }
+ spin_unlock(&ctx->event_wqh.lock);
+ }
+ }
+
return ret;
}
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 11b92b047a1e..79002bca1f43 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -52,6 +52,9 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP);
}
+extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *);
+extern void dup_userfaultfd_complete(struct list_head *);
+
#else /* CONFIG_USERFAULTFD */
/* mm helpers */
@@ -76,6 +79,16 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
return false;
}
+static inline int dup_userfaultfd(struct vm_area_struct *vma,
+ struct list_head *l)
+{
+ return 0;
+}
+
+static inline void dup_userfaultfd_complete(struct list_head *l)
+{
+}
+
#endif /* CONFIG_USERFAULTFD */
#endif /* _LINUX_USERFAULTFD_K_H */
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 94046b8aa6ad..c8953c84fdcc 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -18,12 +18,7 @@
* means the userland is reading).
*/
#define UFFD_API ((__u64)0xAA)
-/*
- * After implementing the respective features it will become:
- * #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \
- * UFFD_FEATURE_EVENT_FORK)
- */
-#define UFFD_API_FEATURES (0)
+#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK)
#define UFFD_API_IOCTLS \
((__u64)1 << _UFFDIO_REGISTER | \
(__u64)1 << _UFFDIO_UNREGISTER | \
@@ -78,6 +73,10 @@ struct uffd_msg {
} pagefault;
struct {
+ __u32 ufd;
+ } fork;
+
+ struct {
/* unused reserved fields */
__u64 reserved1;
__u64 reserved2;
@@ -90,9 +89,7 @@ struct uffd_msg {
* Start at 0x12 and not at 0 to be more strict against bugs.
*/
#define UFFD_EVENT_PAGEFAULT 0x12
-#if 0 /* not available yet */
#define UFFD_EVENT_FORK 0x13
-#endif
/* flags for UFFD_EVENT_PAGEFAULT */
#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */
@@ -111,10 +108,8 @@ struct uffdio_api {
* are to be considered implicitly always enabled in all kernels as
* long as the uffdio_api.api requested matches UFFD_API.
*/
-#if 0 /* not available yet */
#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0)
#define UFFD_FEATURE_EVENT_FORK (1<<1)
-#endif
__u64 features;
__u64 ioctls;
diff --git a/kernel/fork.c b/kernel/fork.c
index ff82e24573b6..d12fcc4db8a3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -55,6 +55,7 @@
#include <linux/rmap.h>
#include <linux/ksm.h>
#include <linux/acct.h>
+#include <linux/userfaultfd_k.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/freezer.h>
@@ -561,6 +562,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
struct rb_node **rb_link, *rb_parent;
int retval;
unsigned long charge;
+ LIST_HEAD(uf);
uprobe_start_dup_mmap();
if (down_write_killable(&oldmm->mmap_sem)) {
@@ -617,12 +619,13 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
if (retval)
goto fail_nomem_policy;
tmp->vm_mm = mm;
+ retval = dup_userfaultfd(tmp, &uf);
+ if (retval)
+ goto fail_nomem_anon_vma_fork;
if (anon_vma_fork(tmp, mpnt))
goto fail_nomem_anon_vma_fork;
- tmp->vm_flags &=
- ~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP);
+ tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
tmp->vm_next = tmp->vm_prev = NULL;
- tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
file = tmp->vm_file;
if (file) {
struct inode *inode = file_inode(file);
@@ -678,6 +681,7 @@ out:
up_write(&mm->mmap_sem);
flush_tlb_mm(oldmm);
up_write(&oldmm->mmap_sem);
+ dup_userfaultfd_complete(&uf);
fail_uprobe_end:
uprobe_end_dup_mmap();
return retval;