summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--fs/Kconfig7
-rw-r--r--fs/Makefile2
-rw-r--r--fs/exec.c1
-rw-r--r--fs/internal.h7
-rw-r--r--fs/libfs.c142
-rw-r--r--fs/nsfs.c121
-rw-r--r--fs/pidfs.c290
-rw-r--r--include/linux/ns_common.h2
-rw-r--r--include/linux/pid.h10
-rw-r--r--include/linux/pidfs.h9
-rw-r--r--include/linux/proc_ns.h2
-rw-r--r--include/linux/sched/signal.h2
-rw-r--r--include/uapi/linux/magic.h1
-rw-r--r--include/uapi/linux/pidfd.h8
-rw-r--r--init/main.c2
-rw-r--r--kernel/exit.c31
-rw-r--r--kernel/fork.c147
-rw-r--r--kernel/nsproxy.c2
-rw-r--r--kernel/pid.c57
-rw-r--r--kernel/signal.c110
-rw-r--r--tools/testing/selftests/pidfd/pidfd_getfd_test.c31
21 files changed, 686 insertions, 298 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index ea2f77446080..4bc7dd420874 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -173,6 +173,13 @@ source "fs/proc/Kconfig"
source "fs/kernfs/Kconfig"
source "fs/sysfs/Kconfig"
+config FS_PID
+ bool "Pseudo filesystem for process file descriptors"
+ depends on 64BIT
+ default y
+ help
+ Pidfs implements advanced features for process file descriptors.
+
config TMPFS
bool "Tmpfs virtual memory file system support (former shm fs)"
depends on SHMEM
diff --git a/fs/Makefile b/fs/Makefile
index c32b8c586800..6ecc9b0a53f2 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -15,7 +15,7 @@ obj-y := open.o read_write.o file_table.o super.o \
pnode.o splice.o sync.o utimes.o d_path.o \
stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
- kernel_read_file.o mnt_idmapping.o remap_range.o
+ kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o
obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o
obj-$(CONFIG_PROC_FS) += proc_namespace.o
diff --git a/fs/exec.c b/fs/exec.c
index af4fbb61cd53..ece3ab0998e1 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1158,7 +1158,6 @@ static int de_thread(struct task_struct *tsk)
BUG_ON(leader->exit_state != EXIT_ZOMBIE);
leader->exit_state = EXIT_DEAD;
-
/*
* We are going to release_task()->ptrace_unlink() silently,
* the tracer can sleep in do_wait(). EXIT_DEAD guarantees
diff --git a/fs/internal.h b/fs/internal.h
index b67406435fc0..7d3edcdf59cc 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -310,3 +310,10 @@ ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *po
struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns);
struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap);
void mnt_idmap_put(struct mnt_idmap *idmap);
+struct stashed_operations {
+ void (*put_data)(void *data);
+ void (*init_inode)(struct inode *inode, void *data);
+};
+int path_from_stashed(struct dentry **stashed, unsigned long ino,
+ struct vfsmount *mnt, void *data, struct path *path);
+void stashed_dentry_prune(struct dentry *dentry);
diff --git a/fs/libfs.c b/fs/libfs.c
index 78c71a9e2e18..680c727d1bbc 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -23,6 +23,7 @@
#include <linux/fsnotify.h>
#include <linux/unicode.h>
#include <linux/fscrypt.h>
+#include <linux/pidfs.h>
#include <linux/uaccess.h>
@@ -1985,3 +1986,144 @@ struct timespec64 simple_inode_init_ts(struct inode *inode)
return ts;
}
EXPORT_SYMBOL(simple_inode_init_ts);
+
+static inline struct dentry *get_stashed_dentry(struct dentry *stashed)
+{
+ struct dentry *dentry;
+
+ guard(rcu)();
+ dentry = READ_ONCE(stashed);
+ if (!dentry)
+ return NULL;
+ if (!lockref_get_not_dead(&dentry->d_lockref))
+ return NULL;
+ return dentry;
+}
+
+static struct dentry *prepare_anon_dentry(struct dentry **stashed,
+ unsigned long ino,
+ struct super_block *sb,
+ void *data)
+{
+ struct dentry *dentry;
+ struct inode *inode;
+ const struct stashed_operations *sops = sb->s_fs_info;
+
+ dentry = d_alloc_anon(sb);
+ if (!dentry)
+ return ERR_PTR(-ENOMEM);
+
+ inode = new_inode_pseudo(sb);
+ if (!inode) {
+ dput(dentry);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ inode->i_ino = ino;
+ inode->i_flags |= S_IMMUTABLE;
+ inode->i_mode = S_IFREG;
+ simple_inode_init_ts(inode);
+ sops->init_inode(inode, data);
+
+ /* Notice when this is changed. */
+ WARN_ON_ONCE(!S_ISREG(inode->i_mode));
+ WARN_ON_ONCE(!IS_IMMUTABLE(inode));
+
+ /* Store address of location where dentry's supposed to be stashed. */
+ dentry->d_fsdata = stashed;
+
+ /* @data is now owned by the fs */
+ d_instantiate(dentry, inode);
+ return dentry;
+}
+
+static struct dentry *stash_dentry(struct dentry **stashed,
+ struct dentry *dentry)
+{
+ guard(rcu)();
+ for (;;) {
+ struct dentry *old;
+
+ /* Assume any old dentry was cleared out. */
+ old = cmpxchg(stashed, NULL, dentry);
+ if (likely(!old))
+ return dentry;
+
+ /* Check if somebody else installed a reusable dentry. */
+ if (lockref_get_not_dead(&old->d_lockref))
+ return old;
+
+ /* There's an old dead dentry there, try to take it over. */
+ if (likely(try_cmpxchg(stashed, &old, dentry)))
+ return dentry;
+ }
+}
+
+/**
+ * path_from_stashed - create path from stashed or new dentry
+ * @stashed: where to retrieve or stash dentry
+ * @ino: inode number to use
+ * @mnt: mnt of the filesystems to use
+ * @data: data to store in inode->i_private
+ * @path: path to create
+ *
+ * The function tries to retrieve a stashed dentry from @stashed. If the dentry
+ * is still valid then it will be reused. If the dentry isn't able the function
+ * will allocate a new dentry and inode. It will then check again whether it
+ * can reuse an existing dentry in case one has been added in the meantime or
+ * update @stashed with the newly added dentry.
+ *
+ * Special-purpose helper for nsfs and pidfs.
+ *
+ * Return: On success zero and on failure a negative error is returned.
+ */
+int path_from_stashed(struct dentry **stashed, unsigned long ino,
+ struct vfsmount *mnt, void *data, struct path *path)
+{
+ struct dentry *dentry;
+ const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info;
+
+ /* See if dentry can be reused. */
+ path->dentry = get_stashed_dentry(*stashed);
+ if (path->dentry) {
+ sops->put_data(data);
+ goto out_path;
+ }
+
+ /* Allocate a new dentry. */
+ dentry = prepare_anon_dentry(stashed, ino, mnt->mnt_sb, data);
+ if (IS_ERR(dentry)) {
+ sops->put_data(data);
+ return PTR_ERR(dentry);
+ }
+
+ /* Added a new dentry. @data is now owned by the filesystem. */
+ path->dentry = stash_dentry(stashed, dentry);
+ if (path->dentry != dentry)
+ dput(dentry);
+
+out_path:
+ WARN_ON_ONCE(path->dentry->d_fsdata != stashed);
+ WARN_ON_ONCE(d_inode(path->dentry)->i_private != data);
+ path->mnt = mntget(mnt);
+ return 0;
+}
+
+void stashed_dentry_prune(struct dentry *dentry)
+{
+ struct dentry **stashed = dentry->d_fsdata;
+ struct inode *inode = d_inode(dentry);
+
+ if (WARN_ON_ONCE(!stashed))
+ return;
+
+ if (!inode)
+ return;
+
+ /*
+ * Only replace our own @dentry as someone else might've
+ * already cleared out @dentry and stashed their own
+ * dentry in there.
+ */
+ cmpxchg(stashed, dentry, NULL);
+}
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 34e1e3e36733..7aaafb5cb9fc 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -27,26 +27,17 @@ static const struct file_operations ns_file_operations = {
static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
{
struct inode *inode = d_inode(dentry);
- const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
+ struct ns_common *ns = inode->i_private;
+ const struct proc_ns_operations *ns_ops = ns->ops;
return dynamic_dname(buffer, buflen, "%s:[%lu]",
ns_ops->name, inode->i_ino);
}
-static void ns_prune_dentry(struct dentry *dentry)
-{
- struct inode *inode = d_inode(dentry);
- if (inode) {
- struct ns_common *ns = inode->i_private;
- atomic_long_set(&ns->stashed, 0);
- }
-}
-
-const struct dentry_operations ns_dentry_operations =
-{
- .d_prune = ns_prune_dentry,
+const struct dentry_operations ns_dentry_operations = {
.d_delete = always_delete_dentry,
.d_dname = ns_dname,
+ .d_prune = stashed_dentry_prune,
};
static void nsfs_evict(struct inode *inode)
@@ -56,67 +47,16 @@ static void nsfs_evict(struct inode *inode)
ns->ops->put(ns);
}
-static int __ns_get_path(struct path *path, struct ns_common *ns)
-{
- struct vfsmount *mnt = nsfs_mnt;
- struct dentry *dentry;
- struct inode *inode;
- unsigned long d;
-
- rcu_read_lock();
- d = atomic_long_read(&ns->stashed);
- if (!d)
- goto slow;
- dentry = (struct dentry *)d;
- if (!lockref_get_not_dead(&dentry->d_lockref))
- goto slow;
- rcu_read_unlock();
- ns->ops->put(ns);
-got_it:
- path->mnt = mntget(mnt);
- path->dentry = dentry;
- return 0;
-slow:
- rcu_read_unlock();
- inode = new_inode_pseudo(mnt->mnt_sb);
- if (!inode) {
- ns->ops->put(ns);
- return -ENOMEM;
- }
- inode->i_ino = ns->inum;
- simple_inode_init_ts(inode);
- inode->i_flags |= S_IMMUTABLE;
- inode->i_mode = S_IFREG | S_IRUGO;
- inode->i_fop = &ns_file_operations;
- inode->i_private = ns;
-
- dentry = d_make_root(inode); /* not the normal use, but... */
- if (!dentry)
- return -ENOMEM;
- dentry->d_fsdata = (void *)ns->ops;
- d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry);
- if (d) {
- d_delete(dentry); /* make sure ->d_prune() does nothing */
- dput(dentry);
- cpu_relax();
- return -EAGAIN;
- }
- goto got_it;
-}
-
int ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb,
void *private_data)
{
- int ret;
+ struct ns_common *ns;
- do {
- struct ns_common *ns = ns_get_cb(private_data);
- if (!ns)
- return -ENOENT;
- ret = __ns_get_path(path, ns);
- } while (ret == -EAGAIN);
+ ns = ns_get_cb(private_data);
+ if (!ns)
+ return -ENOENT;
- return ret;
+ return path_from_stashed(&ns->stashed, ns->inum, nsfs_mnt, ns, path);
}
struct ns_get_path_task_args {
@@ -146,6 +86,7 @@ int open_related_ns(struct ns_common *ns,
struct ns_common *(*get_ns)(struct ns_common *ns))
{
struct path path = {};
+ struct ns_common *relative;
struct file *f;
int err;
int fd;
@@ -154,19 +95,15 @@ int open_related_ns(struct ns_common *ns,
if (fd < 0)
return fd;
- do {
- struct ns_common *relative;
-
- relative = get_ns(ns);
- if (IS_ERR(relative)) {
- put_unused_fd(fd);
- return PTR_ERR(relative);
- }
-
- err = __ns_get_path(&path, relative);
- } while (err == -EAGAIN);
+ relative = get_ns(ns);
+ if (IS_ERR(relative)) {
+ put_unused_fd(fd);
+ return PTR_ERR(relative);
+ }
- if (err) {
+ err = path_from_stashed(&relative->stashed, relative->inum, nsfs_mnt,
+ relative, &path);
+ if (err < 0) {
put_unused_fd(fd);
return err;
}
@@ -249,7 +186,8 @@ bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino)
static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
- const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
+ const struct ns_common *ns = inode->i_private;
+ const struct proc_ns_operations *ns_ops = ns->ops;
seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino);
return 0;
@@ -261,6 +199,24 @@ static const struct super_operations nsfs_ops = {
.show_path = nsfs_show_path,
};
+static void nsfs_init_inode(struct inode *inode, void *data)
+{
+ inode->i_private = data;
+ inode->i_mode |= S_IRUGO;
+ inode->i_fop = &ns_file_operations;
+}
+
+static void nsfs_put_data(void *data)
+{
+ struct ns_common *ns = data;
+ ns->ops->put(ns);
+}
+
+static const struct stashed_operations nsfs_stashed_ops = {
+ .init_inode = nsfs_init_inode,
+ .put_data = nsfs_put_data,
+};
+
static int nsfs_init_fs_context(struct fs_context *fc)
{
struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC);
@@ -268,6 +224,7 @@ static int nsfs_init_fs_context(struct fs_context *fc)
return -ENOMEM;
ctx->ops = &nsfs_ops;
ctx->dops = &ns_dentry_operations;
+ fc->s_fs_info = (void *)&nsfs_stashed_ops;
return 0;
}
diff --git a/fs/pidfs.c b/fs/pidfs.c
new file mode 100644
index 000000000000..8fd71a00be9c
--- /dev/null
+++ b/fs/pidfs.c
@@ -0,0 +1,290 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/magic.h>
+#include <linux/mount.h>
+#include <linux/pid.h>
+#include <linux/pidfs.h>
+#include <linux/pid_namespace.h>
+#include <linux/poll.h>
+#include <linux/proc_fs.h>
+#include <linux/proc_ns.h>
+#include <linux/pseudo_fs.h>
+#include <linux/seq_file.h>
+#include <uapi/linux/pidfd.h>
+
+#include "internal.h"
+
+static int pidfd_release(struct inode *inode, struct file *file)
+{
+#ifndef CONFIG_FS_PID
+ struct pid *pid = file->private_data;
+
+ file->private_data = NULL;
+ put_pid(pid);
+#endif
+ return 0;
+}
+
+#ifdef CONFIG_PROC_FS
+/**
+ * pidfd_show_fdinfo - print information about a pidfd
+ * @m: proc fdinfo file
+ * @f: file referencing a pidfd
+ *
+ * Pid:
+ * This function will print the pid that a given pidfd refers to in the
+ * pid namespace of the procfs instance.
+ * If the pid namespace of the process is not a descendant of the pid
+ * namespace of the procfs instance 0 will be shown as its pid. This is
+ * similar to calling getppid() on a process whose parent is outside of
+ * its pid namespace.
+ *
+ * NSpid:
+ * If pid namespaces are supported then this function will also print
+ * the pid of a given pidfd refers to for all descendant pid namespaces
+ * starting from the current pid namespace of the instance, i.e. the
+ * Pid field and the first entry in the NSpid field will be identical.
+ * If the pid namespace of the process is not a descendant of the pid
+ * namespace of the procfs instance 0 will be shown as its first NSpid
+ * entry and no others will be shown.
+ * Note that this differs from the Pid and NSpid fields in
+ * /proc/<pid>/status where Pid and NSpid are always shown relative to
+ * the pid namespace of the procfs instance. The difference becomes
+ * obvious when sending around a pidfd between pid namespaces from a
+ * different branch of the tree, i.e. where no ancestral relation is
+ * present between the pid namespaces:
+ * - create two new pid namespaces ns1 and ns2 in the initial pid
+ * namespace (also take care to create new mount namespaces in the
+ * new pid namespace and mount procfs)
+ * - create a process with a pidfd in ns1
+ * - send pidfd from ns1 to ns2
+ * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
+ * have exactly one entry, which is 0
+ */
+static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
+{
+ struct pid *pid = pidfd_pid(f);
+ struct pid_namespace *ns;
+ pid_t nr = -1;
+
+ if (likely(pid_has_task(pid, PIDTYPE_PID))) {
+ ns = proc_pid_ns(file_inode(m->file)->i_sb);
+ nr = pid_nr_ns(pid, ns);
+ }
+
+ seq_put_decimal_ll(m, "Pid:\t", nr);
+
+#ifdef CONFIG_PID_NS
+ seq_put_decimal_ll(m, "\nNSpid:\t", nr);
+ if (nr > 0) {
+ int i;
+
+ /* If nr is non-zero it means that 'pid' is valid and that
+ * ns, i.e. the pid namespace associated with the procfs
+ * instance, is in the pid namespace hierarchy of pid.
+ * Start at one below the already printed level.
+ */
+ for (i = ns->level + 1; i <= pid->level; i++)
+ seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
+ }
+#endif
+ seq_putc(m, '\n');
+}
+#endif
+
+/*
+ * Poll support for process exit notification.
+ */
+static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
+{
+ struct pid *pid = pidfd_pid(file);
+ bool thread = file->f_flags & PIDFD_THREAD;
+ struct task_struct *task;
+ __poll_t poll_flags = 0;
+
+ poll_wait(file, &pid->wait_pidfd, pts);
+ /*
+ * Depending on PIDFD_THREAD, inform pollers when the thread
+ * or the whole thread-group exits.
+ */
+ guard(rcu)();
+ task = pid_task(pid, PIDTYPE_PID);
+ if (!task)
+ poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP;
+ else if (task->exit_state && (thread || thread_group_empty(task)))
+ poll_flags = EPOLLIN | EPOLLRDNORM;
+
+ return poll_flags;
+}
+
+static const struct file_operations pidfs_file_operations = {
+ .release = pidfd_release,
+ .poll = pidfd_poll,
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = pidfd_show_fdinfo,
+#endif
+};
+
+struct pid *pidfd_pid(const struct file *file)
+{
+ if (file->f_op != &pidfs_file_operations)
+ return ERR_PTR(-EBADF);
+#ifdef CONFIG_FS_PID
+ return file_inode(file)->i_private;
+#else
+ return file->private_data;
+#endif
+}
+
+#ifdef CONFIG_FS_PID
+static struct vfsmount *pidfs_mnt __ro_after_init;
+
+/*
+ * The vfs falls back to simple_setattr() if i_op->setattr() isn't
+ * implemented. Let's reject it completely until we have a clean
+ * permission concept for pidfds.
+ */
+static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr)
+{
+ return -EOPNOTSUPP;
+}
+
+static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
+ return 0;
+}
+
+static const struct inode_operations pidfs_inode_operations = {
+ .getattr = pidfs_getattr,
+ .setattr = pidfs_setattr,
+};
+
+static void pidfs_evict_inode(struct inode *inode)
+{
+ struct pid *pid = inode->i_private;
+
+ clear_inode(inode);
+ put_pid(pid);
+}
+
+static const struct super_operations pidfs_sops = {
+ .drop_inode = generic_delete_inode,
+ .evict_inode = pidfs_evict_inode,
+ .statfs = simple_statfs,
+};
+
+static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen)
+{
+ return dynamic_dname(buffer, buflen, "pidfd:[%lu]",
+ d_inode(dentry)->i_ino);
+}
+
+static const struct dentry_operations pidfs_dentry_operations = {
+ .d_delete = always_delete_dentry,
+ .d_dname = pidfs_dname,
+ .d_prune = stashed_dentry_prune,
+};
+
+static void pidfs_init_inode(struct inode *inode, void *data)
+{
+ inode->i_private = data;
+ inode->i_flags |= S_PRIVATE;
+ inode->i_mode |= S_IRWXU;
+ inode->i_op = &pidfs_inode_operations;
+ inode->i_fop = &pidfs_file_operations;
+}
+
+static void pidfs_put_data(void *data)
+{
+ struct pid *pid = data;
+ put_pid(pid);
+}
+
+static const struct stashed_operations pidfs_stashed_ops = {
+ .init_inode = pidfs_init_inode,
+ .put_data = pidfs_put_data,
+};
+
+static int pidfs_init_fs_context(struct fs_context *fc)
+{
+ struct pseudo_fs_context *ctx;
+
+ ctx = init_pseudo(fc, PID_FS_MAGIC);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->ops = &pidfs_sops;
+ ctx->dops = &pidfs_dentry_operations;
+ fc->s_fs_info = (void *)&pidfs_stashed_ops;
+ return 0;
+}
+
+static struct file_system_type pidfs_type = {
+ .name = "pidfs",
+ .init_fs_context = pidfs_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
+
+struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
+{
+
+ struct file *pidfd_file;
+ struct path path;
+ int ret;
+
+ /*
+ * Inode numbering for pidfs start at RESERVED_PIDS + 1.
+ * This avoids collisions with the root inode which is 1
+ * for pseudo filesystems.
+ */
+ ret = path_from_stashed(&pid->stashed, pid->ino, pidfs_mnt,
+ get_pid(pid), &path);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ pidfd_file = dentry_open(&path, flags, current_cred());
+ path_put(&path);
+ return pidfd_file;
+}
+
+void __init pidfs_init(void)
+{
+ pidfs_mnt = kern_mount(&pidfs_type);
+ if (IS_ERR(pidfs_mnt))
+ panic("Failed to mount pidfs pseudo filesystem");
+}
+
+bool is_pidfs_sb(const struct super_block *sb)
+{
+ return sb == pidfs_mnt->mnt_sb;
+}
+
+#else /* !CONFIG_FS_PID */
+
+struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
+{
+ struct file *pidfd_file;
+
+ pidfd_file = anon_inode_getfile("[pidfd]", &pidfs_file_operations, pid,
+ flags | O_RDWR);
+ if (IS_ERR(pidfd_file))
+ return pidfd_file;
+
+ get_pid(pid);
+ return pidfd_file;
+}
+
+void __init pidfs_init(void) { }
+bool is_pidfs_sb(const struct super_block *sb)
+{
+ return false;
+}
+#endif
diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index 0f1d024bd958..7d22ea50b098 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -7,7 +7,7 @@
struct proc_ns_operations;
struct ns_common {
- atomic_long_t stashed;
+ struct dentry *stashed;
const struct proc_ns_operations *ops;
unsigned int inum;
refcount_t count;
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 395cacce1179..c79a0efd0258 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -55,6 +55,10 @@ struct pid
refcount_t count;
unsigned int level;
spinlock_t lock;
+#ifdef CONFIG_FS_PID
+ struct dentry *stashed;
+ unsigned long ino;
+#endif
/* lists of tasks that use this pid */
struct hlist_head tasks[PIDTYPE_MAX];
struct hlist_head inodes;
@@ -66,15 +70,13 @@ struct pid
extern struct pid init_struct_pid;
-extern const struct file_operations pidfd_fops;
-
struct file;
-extern struct pid *pidfd_pid(const struct file *file);
+struct pid *pidfd_pid(const struct file *file);
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags);
-int pidfd_create(struct pid *pid, unsigned int flags);
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret);
+void do_notify_pidfd(struct task_struct *task);
static inline struct pid *get_pid(struct pid *pid)
{
diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h
new file mode 100644
index 000000000000..40dd325a32a6
--- /dev/null
+++ b/include/linux/pidfs.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_PID_FS_H
+#define _LINUX_PID_FS_H
+
+struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
+void __init pidfs_init(void);
+bool is_pidfs_sb(const struct super_block *sb);
+
+#endif /* _LINUX_PID_FS_H */
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 49539bc416ce..5ea470eb4d76 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -66,7 +66,7 @@ static inline void proc_free_inum(unsigned int inum) {}
static inline int ns_alloc_inum(struct ns_common *ns)
{
- atomic_long_set(&ns->stashed, 0);
+ WRITE_ONCE(ns->stashed, NULL);
return proc_alloc_inum(&ns->inum);
}
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 4b7664c56208..0a0e23c45406 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -735,8 +735,6 @@ static inline int thread_group_empty(struct task_struct *p)
#define delay_group_leader(p) \
(thread_group_leader(p) && !thread_group_empty(p))
-extern bool thread_group_exited(struct pid *pid);
-
extern struct sighand_struct *__lock_task_sighand(struct task_struct *task,
unsigned long *flags);
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index 6325d1d0e90f..1b40a968ba91 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -101,5 +101,6 @@
#define DMA_BUF_MAGIC 0x444d4142 /* "DMAB" */
#define DEVMEM_MAGIC 0x454d444d /* "DMEM" */
#define SECRETMEM_MAGIC 0x5345434d /* "SECM" */
+#define PID_FS_MAGIC 0x50494446 /* "PIDF" */
#endif /* __LINUX_MAGIC_H__ */
diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
index 5406fbc13074..72ec000a97cd 100644
--- a/include/uapi/linux/pidfd.h
+++ b/include/uapi/linux/pidfd.h
@@ -7,6 +7,12 @@
#include <linux/fcntl.h>
/* Flags for pidfd_open(). */
-#define PIDFD_NONBLOCK O_NONBLOCK
+#define PIDFD_NONBLOCK O_NONBLOCK
+#define PIDFD_THREAD O_EXCL
+
+/* Flags for pidfd_send_signal(). */
+#define PIDFD_SIGNAL_THREAD (1UL << 0)
+#define PIDFD_SIGNAL_THREAD_GROUP (1UL << 1)
+#define PIDFD_SIGNAL_PROCESS_GROUP (1UL << 2)
#endif /* _UAPI_LINUX_PIDFD_H */
diff --git a/init/main.c b/init/main.c
index e24b0780fdff..2fbf6a3114d5 100644
--- a/init/main.c
+++ b/init/main.c
@@ -99,6 +99,7 @@
#include <linux/init_syscalls.h>
#include <linux/stackdepot.h>
#include <linux/randomize_kstack.h>
+#include <linux/pidfs.h>
#include <net/net_namespace.h>
#include <asm/io.h>
@@ -1059,6 +1060,7 @@ void start_kernel(void)
seq_file_init();
proc_root_init();
nsfs_init();
+ pidfs_init();
cpuset_init();
cgroup_init();
taskstats_init_early();
diff --git a/kernel/exit.c b/kernel/exit.c
index dfb963d2f862..41a12630cbbc 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -739,6 +739,13 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
kill_orphaned_pgrp(tsk->group_leader, NULL);
tsk->exit_state = EXIT_ZOMBIE;
+ /*
+ * sub-thread or delay_group_leader(), wake up the
+ * PIDFD_THREAD waiters.
+ */
+ if (!thread_group_empty(tsk))
+ do_notify_pidfd(tsk);
+
if (unlikely(tsk->ptrace)) {
int sig = thread_group_leader(tsk) &&
thread_group_empty(tsk) &&
@@ -1889,30 +1896,6 @@ Efault:
}
#endif
-/**
- * thread_group_exited - check that a thread group has exited
- * @pid: tgid of thread group to be checked.
- *
- * Test if the thread group represented by tgid has exited (all
- * threads are zombies, dead or completely gone).
- *
- * Return: true if the thread group has exited. false otherwise.
- */
-bool thread_group_exited(struct pid *pid)
-{
- struct task_struct *task;
- bool exited;
-
- rcu_read_lock();
- task = pid_task(pid, PIDTYPE_PID);
- exited = !task ||
- (READ_ONCE(task->exit_state) && thread_group_empty(task));
- rcu_read_unlock();
-
- return exited;
-}
-EXPORT_SYMBOL(thread_group_exited);
-
/*
* This needs to be __function_aligned as GCC implicitly makes any
* implementation of abort() cold and drops alignment specified by
diff --git a/kernel/fork.c b/kernel/fork.c
index 0d944e92a43f..1af8dfd149ee 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -101,6 +101,8 @@
#include <linux/user_events.h>
#include <linux/iommu.h>
#include <linux/rseq.h>
+#include <uapi/linux/pidfd.h>
+#include <linux/pidfs.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -1985,119 +1987,6 @@ static inline void rcu_copy_process(struct task_struct *p)
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}
-struct pid *pidfd_pid(const struct file *file)
-{
- if (file->f_op == &pidfd_fops)
- return file->private_data;
-
- return ERR_PTR(-EBADF);
-}
-
-static int pidfd_release(struct inode *inode, struct file *file)
-{
- struct pid *pid = file->private_data;
-
- file->private_data = NULL;
- put_pid(pid);
- return 0;
-}
-
-#ifdef CONFIG_PROC_FS
-/**
- * pidfd_show_fdinfo - print information about a pidfd
- * @m: proc fdinfo file
- * @f: file referencing a pidfd
- *
- * Pid:
- * This function will print the pid that a given pidfd refers to in the
- * pid namespace of the procfs instance.
- * If the pid namespace of the process is not a descendant of the pid
- * namespace of the procfs instance 0 will be shown as its pid. This is
- * similar to calling getppid() on a process whose parent is outside of
- * its pid namespace.
- *
- * NSpid:
- * If pid namespaces are supported then this function will also print
- * the pid of a given pidfd refers to for all descendant pid namespaces
- * starting from the current pid namespace of the instance, i.e. the
- * Pid field and the first entry in the NSpid field will be identical.
- * If the pid namespace of the process is not a descendant of the pid
- * namespace of the procfs instance 0 will be shown as its first NSpid
- * entry and no others will be shown.
- * Note that this differs from the Pid and NSpid fields in
- * /proc/<pid>/status where Pid and NSpid are always shown relative to
- * the pid namespace of the procfs instance. The difference becomes
- * obvious when sending around a pidfd between pid namespaces from a
- * different branch of the tree, i.e. where no ancestral relation is
- * present between the pid namespaces:
- * - create two new pid namespaces ns1 and ns2 in the initial pid
- * namespace (also take care to create new mount namespaces in the
- * new pid namespace and mount procfs)
- * - create a process with a pidfd in ns1
- * - send pidfd from ns1 to ns2
- * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
- * have exactly one entry, which is 0
- */
-static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
-{
- struct pid *pid = f->private_data;
- struct pid_namespace *ns;
- pid_t nr = -1;
-
- if (likely(pid_has_task(pid, PIDTYPE_PID))) {
- ns = proc_pid_ns(file_inode(m->file)->i_sb);
- nr = pid_nr_ns(pid, ns);
- }
-
- seq_put_decimal_ll(m, "Pid:\t", nr);
-
-#ifdef CONFIG_PID_NS
- seq_put_decimal_ll(m, "\nNSpid:\t", nr);
- if (nr > 0) {
- int i;
-
- /* If nr is non-zero it means that 'pid' is valid and that
- * ns, i.e. the pid namespace associated with the procfs
- * instance, is in the pid namespace hierarchy of pid.
- * Start at one below the already printed level.
- */
- for (i = ns->level + 1; i <= pid->level; i++)
- seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
- }
-#endif
- seq_putc(m, '\n');
-}
-#endif
-
-/*
- * Poll support for process exit notification.
- */
-static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
-{
- struct pid *pid = file->private_data;
- __poll_t poll_flags = 0;
-
- poll_wait(file, &pid->wait_pidfd, pts);
-
- /*
- * Inform pollers only when the whole thread group exits.
- * If the thread group leader exits before all other threads in the
- * group, then poll(2) should block, similar to the wait(2) family.
- */
- if (thread_group_exited(pid))
- poll_flags = EPOLLIN | EPOLLRDNORM;
-
- return poll_flags;
-}
-
-const struct file_operations pidfd_fops = {
- .release = pidfd_release,
- .poll = pidfd_poll,
-#ifdef CONFIG_PROC_FS
- .show_fdinfo = pidfd_show_fdinfo,
-#endif
-};
-
/**
* __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
* @pid: the struct pid for which to create a pidfd
@@ -2131,20 +2020,20 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
int pidfd;
struct file *pidfd_file;
- if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
- return -EINVAL;
-
- pidfd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+ pidfd = get_unused_fd_flags(O_CLOEXEC);
if (pidfd < 0)
return pidfd;
- pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
- flags | O_RDWR | O_CLOEXEC);
+ pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR);
if (IS_ERR(pidfd_file)) {
put_unused_fd(pidfd);
return PTR_ERR(pidfd_file);
}
- get_pid(pid); /* held by pidfd_file now */
+ /*
+ * anon_inode_getfile() ignores everything outside of the
+ * O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually.
+ */
+ pidfd_file->f_flags |= (flags & PIDFD_THREAD);
*ret = pidfd_file;
return pidfd;
}
@@ -2158,7 +2047,8 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
* Allocate a new file that stashes @pid and reserve a new pidfd number in the
* caller's file descriptor table. The pidfd is reserved but not installed yet.
*
- * The helper verifies that @pid is used as a thread group leader.
+ * The helper verifies that @pid is still in use, without PIDFD_THREAD the
+ * task identified by @pid must be a thread-group leader.
*
* If this function returns successfully the caller is responsible to either
* call fd_install() passing the returned pidfd and pidfd file as arguments in
@@ -2177,7 +2067,9 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
*/
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
{
- if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
+ bool thread = flags & PIDFD_THREAD;
+
+ if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID))
return -EINVAL;
return __pidfd_prepare(pid, flags, ret);
@@ -2299,9 +2191,8 @@ __latent_entropy struct task_struct *copy_process(
/*
* - CLONE_DETACHED is blocked so that we can potentially
* reuse it later for CLONE_PIDFD.
- * - CLONE_THREAD is blocked until someone really needs it.
*/
- if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
+ if (clone_flags & CLONE_DETACHED)
return ERR_PTR(-EINVAL);
}
@@ -2524,8 +2415,10 @@ __latent_entropy struct task_struct *copy_process(
* if the fd table isn't shared).
*/
if (clone_flags & CLONE_PIDFD) {
+ int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;
+
/* Note that no task has been attached to @pid yet. */
- retval = __pidfd_prepare(pid, O_RDWR | O_CLOEXEC, &pidfile);
+ retval = __pidfd_prepare(pid, flags, &pidfile);
if (retval < 0)
goto bad_fork_free_pid;
pidfd = retval;
@@ -2876,8 +2769,8 @@ pid_t kernel_clone(struct kernel_clone_args *args)
* here has the advantage that we don't need to have a separate helper
* to check for legacy clone().
*/
- if ((args->flags & CLONE_PIDFD) &&
- (args->flags & CLONE_PARENT_SETTID) &&
+ if ((clone_flags & CLONE_PIDFD) &&
+ (clone_flags & CLONE_PARENT_SETTID) &&
(args->pidfd == args->parent_tid))
return -EINVAL;
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 15781acaac1c..6ec3deec68c2 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -573,7 +573,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags)
if (proc_ns_file(f.file))
err = validate_ns(&nsset, ns);
else
- err = validate_nsset(&nsset, f.file->private_data);
+ err = validate_nsset(&nsset, pidfd_pid(f.file));
if (!err) {
commit_nsset(&nsset);
perf_event_namespaces(current);
diff --git a/kernel/pid.c b/kernel/pid.c
index b52b10865454..99a0c5eb24b8 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -42,6 +42,7 @@
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/idr.h>
+#include <linux/pidfs.h>
#include <net/sock.h>
#include <uapi/linux/pidfd.h>
@@ -65,6 +66,13 @@ int pid_max = PID_MAX_DEFAULT;
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;
+#ifdef CONFIG_FS_PID
+/*
+ * Pseudo filesystems start inode numbering after one. We use Reserved
+ * PIDs as a natural offset.
+ */
+static u64 pidfs_ino = RESERVED_PIDS;
+#endif
/*
* PID-map pages start out as NULL, they get allocated upon
@@ -272,6 +280,10 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
spin_lock_irq(&pidmap_lock);
if (!(ns->pid_allocated & PIDNS_ADDING))
goto out_unlock;
+#ifdef CONFIG_FS_PID
+ pid->stashed = NULL;
+ pid->ino = ++pidfs_ino;
+#endif
for ( ; upid >= pid->numbers; --upid) {
/* Make the PID visible to find_pid_ns. */
idr_replace(&upid->ns->idr, pid, upid->nr);
@@ -349,6 +361,11 @@ static void __change_pid(struct task_struct *task, enum pid_type type,
hlist_del_rcu(&task->pid_links[type]);
*pid_ptr = new;
+ if (type == PIDTYPE_PID) {
+ WARN_ON_ONCE(pid_has_task(pid, PIDTYPE_PID));
+ wake_up_all(&pid->wait_pidfd);
+ }
+
for (tmp = PIDTYPE_MAX; --tmp >= 0; )
if (pid_has_task(pid, tmp))
return;
@@ -391,8 +408,7 @@ void exchange_tids(struct task_struct *left, struct task_struct *right)
void transfer_pid(struct task_struct *old, struct task_struct *new,
enum pid_type type)
{
- if (type == PIDTYPE_PID)
- new->thread_pid = old->thread_pid;
+ WARN_ON_ONCE(type == PIDTYPE_PID);
hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]);
}
@@ -552,11 +568,6 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
* Return the task associated with @pidfd. The function takes a reference on
* the returned task. The caller is responsible for releasing that reference.
*
- * Currently, the process identified by @pidfd is always a thread-group leader.
- * This restriction currently exists for all aspects of pidfds including pidfd
- * creation (CLONE_PIDFD cannot be used with CLONE_THREAD) and pidfd polling
- * (only supports thread group leaders).
- *
* Return: On success, the task_struct associated with the pidfd.
* On error, a negative errno number will be returned.
*/
@@ -595,7 +606,7 @@ struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
* Return: On success, a cloexec pidfd is returned.
* On error, a negative errno number will be returned.
*/
-int pidfd_create(struct pid *pid, unsigned int flags)
+static int pidfd_create(struct pid *pid, unsigned int flags)
{
int pidfd;
struct file *pidfd_file;
@@ -615,11 +626,8 @@ int pidfd_create(struct pid *pid, unsigned int flags)
* @flags: flags to pass
*
* This creates a new pid file descriptor with the O_CLOEXEC flag set for
- * the process identified by @pid. Currently, the process identified by
- * @pid must be a thread-group leader. This restriction currently exists
- * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
- * be used with CLONE_THREAD) and pidfd polling (only supports thread group
- * leaders).
+ * the task identified by @pid. Without PIDFD_THREAD flag the target task
+ * must be a thread-group leader.
*
* Return: On success, a cloexec pidfd is returned.
* On error, a negative errno number will be returned.
@@ -629,7 +637,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
int fd;
struct pid *p;
- if (flags & ~PIDFD_NONBLOCK)
+ if (flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD))
return -EINVAL;
if (pid <= 0)
@@ -682,7 +690,26 @@ static struct file *__pidfd_fget(struct task_struct *task, int fd)
up_read(&task->signal->exec_update_lock);
- return file ?: ERR_PTR(-EBADF);
+ if (!file) {
+ /*
+ * It is possible that the target thread is exiting; it can be
+ * either:
+ * 1. before exit_signals(), which gives a real fd
+ * 2. before exit_files() takes the task_lock() gives a real fd
+ * 3. after exit_files() releases task_lock(), ->files is NULL;
+ * this has PF_EXITING, since it was set in exit_signals(),
+ * __pidfd_fget() returns EBADF.
+ * In case 3 we get EBADF, but that really means ESRCH, since
+ * the task is currently exiting and has freed its files
+ * struct, so we fix it up.
+ */
+ if (task->flags & PF_EXITING)
+ file = ERR_PTR(-ESRCH);
+ else
+ file = ERR_PTR(-EBADF);
+ }
+
+ return file;
}
static int pidfd_getfd(struct pid *pid, int fd)
diff --git a/kernel/signal.c b/kernel/signal.c
index c9c57d053ce4..bdca529f0f7b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -47,6 +47,7 @@
#include <linux/cgroup.h>
#include <linux/audit.h>
#include <linux/sysctl.h>
+#include <uapi/linux/pidfd.h>
#define CREATE_TRACE_POINTS
#include <trace/events/signal.h>
@@ -1436,7 +1437,8 @@ void lockdep_assert_task_sighand_held(struct task_struct *task)
#endif
/*
- * send signal info to all the members of a group
+ * send signal info to all the members of a thread group or to the
+ * individual thread if type == PIDTYPE_PID.
*/
int group_send_sig_info(int sig, struct kernel_siginfo *info,
struct task_struct *p, enum pid_type type)
@@ -1478,7 +1480,8 @@ int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp)
return ret;
}
-int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
+static int kill_pid_info_type(int sig, struct kernel_siginfo *info,
+ struct pid *pid, enum pid_type type)
{
int error = -ESRCH;
struct task_struct *p;
@@ -1487,11 +1490,10 @@ int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
rcu_read_lock();
p = pid_task(pid, PIDTYPE_PID);
if (p)
- error = group_send_sig_info(sig, info, p, PIDTYPE_TGID);
+ error = group_send_sig_info(sig, info, p, type);
rcu_read_unlock();
if (likely(!p || error != -ESRCH))
return error;
-
/*
* The task was unhashed in between, try again. If it
* is dead, pid_task() will return NULL, if we race with
@@ -1500,6 +1502,11 @@ int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
}
}
+int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
+{
+ return kill_pid_info_type(sig, info, pid, PIDTYPE_TGID);
+}
+
static int kill_proc_info(int sig, struct kernel_siginfo *info, pid_t pid)
{
int error;
@@ -1898,16 +1905,19 @@ int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno,
return send_sig_info(info.si_signo, &info, t);
}
-int kill_pgrp(struct pid *pid, int sig, int priv)
+static int kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp)
{
int ret;
-
read_lock(&tasklist_lock);
- ret = __kill_pgrp_info(sig, __si_special(priv), pid);
+ ret = __kill_pgrp_info(sig, info, pgrp);
read_unlock(&tasklist_lock);
-
return ret;
}
+
+int kill_pgrp(struct pid *pid, int sig, int priv)
+{
+ return kill_pgrp_info(sig, __si_special(priv), pid);
+}
EXPORT_SYMBOL(kill_pgrp);
int kill_pid(struct pid *pid, int sig, int priv)
@@ -2019,13 +2029,14 @@ ret:
return ret;
}
-static void do_notify_pidfd(struct task_struct *task)
+void do_notify_pidfd(struct task_struct *task)
{
- struct pid *pid;
+ struct pid *pid = task_pid(task);
WARN_ON(task->exit_state == 0);
- pid = task_pid(task);
- wake_up_all(&pid->wait_pidfd);
+
+ __wake_up(&pid->wait_pidfd, TASK_NORMAL, 0,
+ poll_to_key(EPOLLIN | EPOLLRDNORM));
}
/*
@@ -2050,9 +2061,12 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
WARN_ON_ONCE(!tsk->ptrace &&
(tsk->group_leader != tsk || !thread_group_empty(tsk)));
-
- /* Wake up all pidfd waiters */
- do_notify_pidfd(tsk);
+ /*
+ * tsk is a group leader and has no threads, wake up the
+ * non-PIDFD_THREAD waiters.
+ */
+ if (thread_group_empty(tsk))
+ do_notify_pidfd(tsk);
if (sig != SIGCHLD) {
/*
@@ -3789,12 +3803,13 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time32, compat_sigset_t __user *, uthese,
#endif
#endif
-static inline void prepare_kill_siginfo(int sig, struct kernel_siginfo *info)
+static void prepare_kill_siginfo(int sig, struct kernel_siginfo *info,
+ enum pid_type type)
{
clear_siginfo(info);
info->si_signo = sig;
info->si_errno = 0;
- info->si_code = SI_USER;
+ info->si_code = (type == PIDTYPE_PID) ? SI_TKILL : SI_USER;
info->si_pid = task_tgid_vnr(current);
info->si_uid = from_kuid_munged(current_user_ns(), current_uid());
}
@@ -3808,7 +3823,7 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
{
struct kernel_siginfo info;
- prepare_kill_siginfo(sig, &info);
+ prepare_kill_siginfo(sig, &info, PIDTYPE_TGID);
return kill_something_info(sig, &info, pid);
}
@@ -3861,6 +3876,10 @@ static struct pid *pidfd_to_pid(const struct file *file)
return tgid_pidfd_to_pid(file);
}
+#define PIDFD_SEND_SIGNAL_FLAGS \
+ (PIDFD_SIGNAL_THREAD | PIDFD_SIGNAL_THREAD_GROUP | \
+ PIDFD_SIGNAL_PROCESS_GROUP)
+
/**
* sys_pidfd_send_signal - Signal a process through a pidfd
* @pidfd: file descriptor of the process
@@ -3868,14 +3887,10 @@ static struct pid *pidfd_to_pid(const struct file *file)
* @info: signal info
* @flags: future flags
*
- * The syscall currently only signals via PIDTYPE_PID which covers
- * kill(<positive-pid>, <signal>. It does not signal threads or process
- * groups.
- * In order to extend the syscall to threads and process groups the @flags
- * argument should be used. In essence, the @flags argument will determine
- * what is signaled and not the file descriptor itself. Put in other words,
- * grouping is a property of the flags argument not a property of the file
- * descriptor.
+ * Send the signal to the thread group or to the individual thread depending
+ * on PIDFD_THREAD.
+ * In the future extension to @flags may be used to override the default scope
+ * of @pidfd.
*
* Return: 0 on success, negative errno on failure
*/
@@ -3886,9 +3901,14 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
struct fd f;
struct pid *pid;
kernel_siginfo_t kinfo;
+ enum pid_type type;
/* Enforce flags be set to 0 until we add an extension. */
- if (flags)
+ if (flags & ~PIDFD_SEND_SIGNAL_FLAGS)
+ return -EINVAL;
+
+ /* Ensure that only a single signal scope determining flag is set. */
+ if (hweight32(flags & PIDFD_SEND_SIGNAL_FLAGS) > 1)
return -EINVAL;
f = fdget(pidfd);
@@ -3906,6 +3926,25 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
if (!access_pidfd_pidns(pid))
goto err;
+ switch (flags) {
+ case 0:
+ /* Infer scope from the type of pidfd. */
+ if (f.file->f_flags & PIDFD_THREAD)
+ type = PIDTYPE_PID;
+ else
+ type = PIDTYPE_TGID;
+ break;
+ case PIDFD_SIGNAL_THREAD:
+ type = PIDTYPE_PID;
+ break;
+ case PIDFD_SIGNAL_THREAD_GROUP:
+ type = PIDTYPE_TGID;
+ break;
+ case PIDFD_SIGNAL_PROCESS_GROUP:
+ type = PIDTYPE_PGID;
+ break;
+ }
+
if (info) {
ret = copy_siginfo_from_user_any(&kinfo, info);
if (unlikely(ret))
@@ -3917,15 +3956,17 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
/* Only allow sending arbitrary signals to yourself. */
ret = -EPERM;
- if ((task_pid(current) != pid) &&
+ if ((task_pid(current) != pid || type > PIDTYPE_TGID) &&
(kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL))
goto err;
} else {
- prepare_kill_siginfo(sig, &kinfo);
+ prepare_kill_siginfo(sig, &kinfo, type);
}
- ret = kill_pid_info(sig, &kinfo, pid);
-
+ if (type == PIDTYPE_PGID)
+ ret = kill_pgrp_info(sig, &kinfo, pid);
+ else
+ ret = kill_pid_info_type(sig, &kinfo, pid, type);
err:
fdput(f);
return ret;
@@ -3965,12 +4006,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
{
struct kernel_siginfo info;
- clear_siginfo(&info);
- info.si_signo = sig;
- info.si_errno = 0;
- info.si_code = SI_TKILL;
- info.si_pid = task_tgid_vnr(current);
- info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
+ prepare_kill_siginfo(sig, &info, PIDTYPE_PID);
return do_send_specific(tgid, pid, sig, &info);
}
diff --git a/tools/testing/selftests/pidfd/pidfd_getfd_test.c b/tools/testing/selftests/pidfd/pidfd_getfd_test.c
index 0930e2411dfb..cd51d547b751 100644
--- a/tools/testing/selftests/pidfd/pidfd_getfd_test.c
+++ b/tools/testing/selftests/pidfd/pidfd_getfd_test.c
@@ -5,6 +5,7 @@
#include <fcntl.h>
#include <limits.h>
#include <linux/types.h>
+#include <poll.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
@@ -129,6 +130,7 @@ FIXTURE(child)
* When it is closed, the child will exit.
*/
int sk;
+ bool ignore_child_result;
};
FIXTURE_SETUP(child)
@@ -165,10 +167,14 @@ FIXTURE_SETUP(child)
FIXTURE_TEARDOWN(child)
{
+ int ret;
+
EXPECT_EQ(0, close(self->pidfd));
EXPECT_EQ(0, close(self->sk));
- EXPECT_EQ(0, wait_for_pid(self->pid));
+ ret = wait_for_pid(self->pid);
+ if (!self->ignore_child_result)
+ EXPECT_EQ(0, ret);
}
TEST_F(child, disable_ptrace)
@@ -235,6 +241,29 @@ TEST(flags_set)
EXPECT_EQ(errno, EINVAL);
}
+TEST_F(child, no_strange_EBADF)
+{
+ struct pollfd fds;
+
+ self->ignore_child_result = true;
+
+ fds.fd = self->pidfd;
+ fds.events = POLLIN;
+
+ ASSERT_EQ(kill(self->pid, SIGKILL), 0);
+ ASSERT_EQ(poll(&fds, 1, 5000), 1);
+
+ /*
+ * It used to be that pidfd_getfd() could race with the exiting thread
+ * between exit_files() and release_task(), and get a non-null task
+ * with a NULL files struct, and you'd get EBADF, which was slightly
+ * confusing.
+ */
+ errno = 0;
+ EXPECT_EQ(sys_pidfd_getfd(self->pidfd, self->remote_fd, 0), -1);
+ EXPECT_EQ(errno, ESRCH);
+}
+
#if __NR_pidfd_getfd == -1
int main(void)
{