summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile77
-rw-r--r--kernel/acct.c10
-rw-r--r--kernel/audit.c144
-rw-r--r--kernel/audit.h11
-rw-r--r--kernel/audit_watch.c5
-rw-r--r--kernel/auditfilter.c202
-rw-r--r--kernel/auditsc.c510
-rw-r--r--kernel/cgroup.c332
-rw-r--r--kernel/cgroup_freezer.c8
-rw-r--r--kernel/cpu.c4
-rw-r--r--kernel/cred.c10
-rw-r--r--kernel/debug/debug_core.c32
-rw-r--r--kernel/debug/kdb/kdb_bt.c2
-rw-r--r--kernel/debug/kdb/kdb_io.c33
-rw-r--r--kernel/debug/kdb/kdb_main.c33
-rw-r--r--kernel/events/core.c81
-rw-r--r--kernel/events/uprobes.c353
-rw-r--r--kernel/exit.c100
-rw-r--r--kernel/fork.c46
-rw-r--r--kernel/irq/irqdomain.c33
-rw-r--r--kernel/jump_label.c1
-rw-r--r--kernel/kexec.c1
-rw-r--r--kernel/kmod.c7
-rw-r--r--kernel/kthread.c1
-rw-r--r--kernel/modsign_pubkey.c113
-rw-r--r--kernel/module-internal.h14
-rw-r--r--kernel/module.c149
-rw-r--r--kernel/module_signing.c249
-rw-r--r--kernel/pid.c1
-rw-r--r--kernel/pid_namespace.c21
-rw-r--r--kernel/power/Kconfig4
-rw-r--r--kernel/power/poweroff.c2
-rw-r--r--kernel/power/process.c2
-rw-r--r--kernel/power/qos.c1
-rw-r--r--kernel/printk.c1
-rw-r--r--kernel/ptrace.c3
-rw-r--r--kernel/rcutree.c21
-rw-r--r--kernel/rcutree.h6
-rw-r--r--kernel/resource.c50
-rw-r--r--kernel/sched/core.c71
-rw-r--r--kernel/signal.c3
-rw-r--r--kernel/srcu.c4
-rw-r--r--kernel/sys.c29
-rw-r--r--kernel/sysctl.c15
-rw-r--r--kernel/taskstats.c39
-rw-r--r--kernel/time.c2
-rw-r--r--kernel/time/Kconfig4
-rw-r--r--kernel/time/alarmtimer.c118
-rw-r--r--kernel/time/clockevents.c24
-rw-r--r--kernel/time/jiffies.c32
-rw-r--r--kernel/time/tick-sched.c2
-rw-r--r--kernel/time/timekeeping.c119
-rw-r--r--kernel/timer.c10
-rw-r--r--kernel/trace/trace.c11
-rw-r--r--kernel/trace/trace.h2
-rw-r--r--kernel/trace/trace_functions.c15
-rw-r--r--kernel/tsacct.c12
-rw-r--r--kernel/user.c8
-rw-r--r--kernel/user_namespace.c128
-rw-r--r--kernel/workqueue.c1217
60 files changed, 2846 insertions, 1692 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 5404911eaee9..0dfeca4324ee 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -54,6 +54,7 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += module.o
+obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
obj-$(CONFIG_KEXEC) += kexec.o
@@ -130,3 +131,79 @@ quiet_cmd_timeconst = TIMEC $@
targets += timeconst.h
$(obj)/timeconst.h: $(src)/timeconst.pl FORCE
$(call if_changed,timeconst)
+
+ifeq ($(CONFIG_MODULE_SIG),y)
+#
+# Pull the signing certificate and any extra certificates into the kernel
+#
+extra_certificates:
+ touch $@
+
+kernel/modsign_pubkey.o: signing_key.x509 extra_certificates
+
+###############################################################################
+#
+# If module signing is requested, say by allyesconfig, but a key has not been
+# supplied, then one will need to be generated to make sure the build does not
+# fail and that the kernel may be used afterwards.
+#
+###############################################################################
+sign_key_with_hash :=
+ifeq ($(CONFIG_MODULE_SIG_SHA1),y)
+sign_key_with_hash := -sha1
+endif
+ifeq ($(CONFIG_MODULE_SIG_SHA224),y)
+sign_key_with_hash := -sha224
+endif
+ifeq ($(CONFIG_MODULE_SIG_SHA256),y)
+sign_key_with_hash := -sha256
+endif
+ifeq ($(CONFIG_MODULE_SIG_SHA384),y)
+sign_key_with_hash := -sha384
+endif
+ifeq ($(CONFIG_MODULE_SIG_SHA512),y)
+sign_key_with_hash := -sha512
+endif
+ifeq ($(sign_key_with_hash),)
+$(error Could not determine digest type to use from kernel config)
+endif
+
+signing_key.priv signing_key.x509: x509.genkey
+ @echo "###"
+ @echo "### Now generating an X.509 key pair to be used for signing modules."
+ @echo "###"
+ @echo "### If this takes a long time, you might wish to run rngd in the"
+ @echo "### background to keep the supply of entropy topped up. It"
+ @echo "### needs to be run as root, and should use a hardware random"
+ @echo "### number generator if one is available, eg:"
+ @echo "###"
+ @echo "### rngd -r /dev/hwrandom"
+ @echo "###"
+ openssl req -new -nodes -utf8 $(sign_key_with_hash) -days 36500 -batch \
+ -x509 -config x509.genkey \
+ -outform DER -out signing_key.x509 \
+ -keyout signing_key.priv
+ @echo "###"
+ @echo "### Key pair generated."
+ @echo "###"
+
+x509.genkey:
+ @echo Generating X.509 key generation config
+ @echo >x509.genkey "[ req ]"
+ @echo >>x509.genkey "default_bits = 4096"
+ @echo >>x509.genkey "distinguished_name = req_distinguished_name"
+ @echo >>x509.genkey "prompt = no"
+ @echo >>x509.genkey "string_mask = utf8only"
+ @echo >>x509.genkey "x509_extensions = myexts"
+ @echo >>x509.genkey
+ @echo >>x509.genkey "[ req_distinguished_name ]"
+ @echo >>x509.genkey "O = Magrathea"
+ @echo >>x509.genkey "CN = Glacier signing key"
+ @echo >>x509.genkey "emailAddress = slartibartfast@magrathea.h2g2"
+ @echo >>x509.genkey
+ @echo >>x509.genkey "[ myexts ]"
+ @echo >>x509.genkey "basicConstraints=critical,CA:FALSE"
+ @echo >>x509.genkey "keyUsage=digitalSignature"
+ @echo >>x509.genkey "subjectKeyIdentifier=hash"
+ @echo >>x509.genkey "authorityKeyIdentifier=keyid"
+endif
diff --git a/kernel/acct.c b/kernel/acct.c
index 02e6167a53b0..051e071a06e7 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -193,7 +193,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
}
}
-static int acct_on(char *name)
+static int acct_on(struct filename *pathname)
{
struct file *file;
struct vfsmount *mnt;
@@ -201,7 +201,7 @@ static int acct_on(char *name)
struct bsd_acct_struct *acct = NULL;
/* Difference from BSD - they don't do O_APPEND */
- file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
+ file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
if (IS_ERR(file))
return PTR_ERR(file);
@@ -260,7 +260,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
return -EPERM;
if (name) {
- char *tmp = getname(name);
+ struct filename *tmp = getname(name);
if (IS_ERR(tmp))
return (PTR_ERR(tmp));
error = acct_on(tmp);
@@ -507,8 +507,8 @@ static void do_acct_process(struct bsd_acct_struct *acct,
do_div(elapsed, AHZ);
ac.ac_btime = get_seconds() - elapsed;
/* we really need to bite the bullet and change layout */
- ac.ac_uid = orig_cred->uid;
- ac.ac_gid = orig_cred->gid;
+ ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
+ ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
#if ACCT_VERSION==2
ac.ac_ahz = AHZ;
#endif
diff --git a/kernel/audit.c b/kernel/audit.c
index ea3b7b6191c7..40414e9143db 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -61,6 +61,7 @@
#include <linux/netlink.h>
#include <linux/freezer.h>
#include <linux/tty.h>
+#include <linux/pid_namespace.h>
#include "audit.h"
@@ -87,11 +88,11 @@ static int audit_failure = AUDIT_FAIL_PRINTK;
/*
* If audit records are to be written to the netlink socket, audit_pid
- * contains the pid of the auditd process and audit_nlk_pid contains
- * the pid to use to send netlink messages to that process.
+ * contains the pid of the auditd process and audit_nlk_portid contains
+ * the portid to use to send netlink messages to that process.
*/
int audit_pid;
-static int audit_nlk_pid;
+static int audit_nlk_portid;
/* If audit_rate_limit is non-zero, limit the rate of sending audit records
* to that number per second. This prevents DoS attacks, but results in
@@ -104,7 +105,7 @@ static int audit_backlog_wait_time = 60 * HZ;
static int audit_backlog_wait_overflow = 0;
/* The identity of the user shutting down the audit system. */
-uid_t audit_sig_uid = -1;
+kuid_t audit_sig_uid = INVALID_UID;
pid_t audit_sig_pid = -1;
u32 audit_sig_sid = 0;
@@ -264,7 +265,7 @@ void audit_log_lost(const char *message)
}
static int audit_log_config_change(char *function_name, int new, int old,
- uid_t loginuid, u32 sessionid, u32 sid,
+ kuid_t loginuid, u32 sessionid, u32 sid,
int allow_changes)
{
struct audit_buffer *ab;
@@ -272,7 +273,7 @@ static int audit_log_config_change(char *function_name, int new, int old,
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new,
- old, loginuid, sessionid);
+ old, from_kuid(&init_user_ns, loginuid), sessionid);
if (sid) {
char *ctx = NULL;
u32 len;
@@ -292,7 +293,7 @@ static int audit_log_config_change(char *function_name, int new, int old,
}
static int audit_do_config_change(char *function_name, int *to_change,
- int new, uid_t loginuid, u32 sessionid,
+ int new, kuid_t loginuid, u32 sessionid,
u32 sid)
{
int allow_changes, rc = 0, old = *to_change;
@@ -319,21 +320,21 @@ static int audit_do_config_change(char *function_name, int *to_change,
return rc;
}
-static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sessionid,
+static int audit_set_rate_limit(int limit, kuid_t loginuid, u32 sessionid,
u32 sid)
{
return audit_do_config_change("audit_rate_limit", &audit_rate_limit,
limit, loginuid, sessionid, sid);
}
-static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sessionid,
+static int audit_set_backlog_limit(int limit, kuid_t loginuid, u32 sessionid,
u32 sid)
{
return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit,
limit, loginuid, sessionid, sid);
}
-static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid)
+static int audit_set_enabled(int state, kuid_t loginuid, u32 sessionid, u32 sid)
{
int rc;
if (state < AUDIT_OFF || state > AUDIT_LOCKED)
@@ -348,7 +349,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid)
return rc;
}
-static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid)
+static int audit_set_failure(int state, kuid_t loginuid, u32 sessionid, u32 sid)
{
if (state != AUDIT_FAIL_SILENT
&& state != AUDIT_FAIL_PRINTK
@@ -401,7 +402,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
int err;
/* take a reference in case we can't send it and we want to hold it */
skb_get(skb);
- err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
+ err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
if (err < 0) {
BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
@@ -467,24 +468,6 @@ static int kauditd_thread(void *dummy)
return 0;
}
-static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid)
-{
- struct task_struct *tsk;
- int err;
-
- rcu_read_lock();
- tsk = find_task_by_vpid(pid);
- if (!tsk) {
- rcu_read_unlock();
- return -ESRCH;
- }
- get_task_struct(tsk);
- rcu_read_unlock();
- err = tty_audit_push_task(tsk, loginuid, sessionid);
- put_task_struct(tsk);
- return err;
-}
-
int audit_send_list(void *_dest)
{
struct audit_netlink_list *dest = _dest;
@@ -588,6 +571,11 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
{
int err = 0;
+ /* Only support the initial namespaces for now. */
+ if ((current_user_ns() != &init_user_ns) ||
+ (task_active_pid_ns(current) != &init_pid_ns))
+ return -EPERM;
+
switch (msg_type) {
case AUDIT_GET:
case AUDIT_LIST:
@@ -619,8 +607,7 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
}
static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
- u32 pid, u32 uid, uid_t auid, u32 ses,
- u32 sid)
+ kuid_t auid, u32 ses, u32 sid)
{
int rc = 0;
char *ctx = NULL;
@@ -633,7 +620,9 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
*ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u",
- pid, uid, auid, ses);
+ task_tgid_vnr(current),
+ from_kuid(&init_user_ns, current_uid()),
+ from_kuid(&init_user_ns, auid), ses);
if (sid) {
rc = security_secid_to_secctx(sid, &ctx, &len);
if (rc)
@@ -649,13 +638,13 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
- u32 uid, pid, seq, sid;
+ u32 seq, sid;
void *data;
struct audit_status *status_get, status_set;
int err;
struct audit_buffer *ab;
u16 msg_type = nlh->nlmsg_type;
- uid_t loginuid; /* loginuid of sender */
+ kuid_t loginuid; /* loginuid of sender */
u32 sessionid;
struct audit_sig_info *sig_data;
char *ctx = NULL;
@@ -675,8 +664,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
return err;
}
- pid = NETLINK_CREDS(skb)->pid;
- uid = NETLINK_CREDS(skb)->uid;
loginuid = audit_get_loginuid(current);
sessionid = audit_get_sessionid(current);
security_task_getsecid(current, &sid);
@@ -692,7 +679,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
status_set.backlog_limit = audit_backlog_limit;
status_set.lost = atomic_read(&audit_lost);
status_set.backlog = skb_queue_len(&audit_skb_queue);
- audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0,
+ audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0,
&status_set, sizeof(status_set));
break;
case AUDIT_SET:
@@ -720,7 +707,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
sessionid, sid, 1);
audit_pid = new_pid;
- audit_nlk_pid = NETLINK_CB(skb).pid;
+ audit_nlk_portid = NETLINK_CB(skb).portid;
}
if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) {
err = audit_set_rate_limit(status_get->rate_limit,
@@ -738,16 +725,16 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (!audit_enabled && msg_type != AUDIT_USER_AVC)
return 0;
- err = audit_filter_user(&NETLINK_CB(skb));
+ err = audit_filter_user();
if (err == 1) {
err = 0;
if (msg_type == AUDIT_USER_TTY) {
- err = audit_prepare_user_tty(pid, loginuid,
+ err = tty_audit_push_task(current, loginuid,
sessionid);
if (err)
break;
}
- audit_log_common_recv_msg(&ab, msg_type, pid, uid,
+ audit_log_common_recv_msg(&ab, msg_type,
loginuid, sessionid, sid);
if (msg_type != AUDIT_USER_TTY)
@@ -763,7 +750,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
size--;
audit_log_n_untrustedstring(ab, data, size);
}
- audit_set_pid(ab, pid);
+ audit_set_pid(ab, NETLINK_CB(skb).portid);
audit_log_end(ab);
}
break;
@@ -772,8 +759,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (nlmsg_len(nlh) < sizeof(struct audit_rule))
return -EINVAL;
if (audit_enabled == AUDIT_LOCKED) {
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
- uid, loginuid, sessionid, sid);
+ audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE,
+ loginuid, sessionid, sid);
audit_log_format(ab, " audit_enabled=%d res=0",
audit_enabled);
@@ -782,8 +769,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
}
/* fallthrough */
case AUDIT_LIST:
- err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid,
- uid, seq, data, nlmsg_len(nlh),
+ err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid,
+ seq, data, nlmsg_len(nlh),
loginuid, sessionid, sid);
break;
case AUDIT_ADD_RULE:
@@ -791,8 +778,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
return -EINVAL;
if (audit_enabled == AUDIT_LOCKED) {
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
- uid, loginuid, sessionid, sid);
+ audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE,
+ loginuid, sessionid, sid);
audit_log_format(ab, " audit_enabled=%d res=0",
audit_enabled);
@@ -801,15 +788,15 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
}
/* fallthrough */
case AUDIT_LIST_RULES:
- err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid,
- uid, seq, data, nlmsg_len(nlh),
+ err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid,
+ seq, data, nlmsg_len(nlh),
loginuid, sessionid, sid);
break;
case AUDIT_TRIM:
audit_trim_trees();
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
- uid, loginuid, sessionid, sid);
+ audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE,
+ loginuid, sessionid, sid);
audit_log_format(ab, " op=trim res=1");
audit_log_end(ab);
@@ -840,8 +827,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
/* OK, here comes... */
err = audit_tag_tree(old, new);
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
- uid, loginuid, sessionid, sid);
+ audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE,
+ loginuid, sessionid, sid);
audit_log_format(ab, " op=make_equiv old=");
audit_log_untrustedstring(ab, old);
@@ -866,53 +853,41 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
security_release_secctx(ctx, len);
return -ENOMEM;
}
- sig_data->uid = audit_sig_uid;
+ sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid);
sig_data->pid = audit_sig_pid;
if (audit_sig_sid) {
memcpy(sig_data->ctx, ctx, len);
security_release_secctx(ctx, len);
}
- audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
+ audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_SIGNAL_INFO,
0, 0, sig_data, sizeof(*sig_data) + len);
kfree(sig_data);
break;
case AUDIT_TTY_GET: {
struct audit_tty_status s;
- struct task_struct *tsk;
- unsigned long flags;
-
- rcu_read_lock();
- tsk = find_task_by_vpid(pid);
- if (tsk && lock_task_sighand(tsk, &flags)) {
- s.enabled = tsk->signal->audit_tty != 0;
- unlock_task_sighand(tsk, &flags);
- } else
- err = -ESRCH;
- rcu_read_unlock();
-
- if (!err)
- audit_send_reply(NETLINK_CB(skb).pid, seq,
- AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
+ struct task_struct *tsk = current;
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ s.enabled = tsk->signal->audit_tty != 0;
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+ audit_send_reply(NETLINK_CB(skb).portid, seq,
+ AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
break;
}
case AUDIT_TTY_SET: {
struct audit_tty_status *s;
- struct task_struct *tsk;
- unsigned long flags;
+ struct task_struct *tsk = current;
if (nlh->nlmsg_len < sizeof(struct audit_tty_status))
return -EINVAL;
s = data;
if (s->enabled != 0 && s->enabled != 1)
return -EINVAL;
- rcu_read_lock();
- tsk = find_task_by_vpid(pid);
- if (tsk && lock_task_sighand(tsk, &flags)) {
- tsk->signal->audit_tty = s->enabled != 0;
- unlock_task_sighand(tsk, &flags);
- } else
- err = -ESRCH;
- rcu_read_unlock();
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ tsk->signal->audit_tty = s->enabled != 0;
+ spin_unlock_irq(&tsk->sighand->siglock);
break;
}
default:
@@ -971,8 +946,7 @@ static int __init audit_init(void)
printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
audit_default ? "enabled" : "disabled");
- audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT,
- THIS_MODULE, &cfg);
+ audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, &cfg);
if (!audit_sock)
audit_panic("cannot initialize netlink socket");
else
@@ -1466,6 +1440,8 @@ void audit_log_link_denied(const char *operation, struct path *link)
ab = audit_log_start(current->audit_context, GFP_KERNEL,
AUDIT_ANOM_LINK);
+ if (!ab)
+ return;
audit_log_format(ab, "op=%s action=denied", operation);
audit_log_format(ab, " pid=%d comm=", current->pid);
audit_log_untrustedstring(ab, current->comm);
diff --git a/kernel/audit.h b/kernel/audit.h
index 816766803371..d51cba868e1b 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -74,10 +74,15 @@ static inline int audit_hash_ino(u32 ino)
return (ino & (AUDIT_INODE_BUCKETS-1));
}
+/* Indicates that audit should log the full pathname. */
+#define AUDIT_NAME_FULL -1
+
extern int audit_match_class(int class, unsigned syscall);
extern int audit_comparator(const u32 left, const u32 op, const u32 right);
-extern int audit_compare_dname_path(const char *dname, const char *path,
- int *dirlen);
+extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right);
+extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right);
+extern int parent_len(const char *path);
+extern int audit_compare_dname_path(const char *dname, const char *path, int plen);
extern struct sk_buff * audit_make_reply(int pid, int seq, int type,
int done, int multi,
const void *payload, int size);
@@ -144,7 +149,7 @@ extern void audit_kill_trees(struct list_head *);
extern char *audit_unpack_string(void **, size_t *, size_t);
extern pid_t audit_sig_pid;
-extern uid_t audit_sig_uid;
+extern kuid_t audit_sig_uid;
extern u32 audit_sig_sid;
#ifdef CONFIG_AUDITSYSCALL
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 3823281401b5..9a9ae6e3d290 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -241,7 +241,7 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc
struct audit_buffer *ab;
ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
audit_log_format(ab, "auid=%u ses=%u op=",
- audit_get_loginuid(current),
+ from_kuid(&init_user_ns, audit_get_loginuid(current)),
audit_get_sessionid(current));
audit_log_string(ab, op);
audit_log_format(ab, " path=");
@@ -265,7 +265,8 @@ static void audit_update_watch(struct audit_parent *parent,
/* Run all of the watches on this parent looking for the one that
* matches the given dname */
list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
- if (audit_compare_dname_path(dname, owatch->path, NULL))
+ if (audit_compare_dname_path(dname, owatch->path,
+ AUDIT_NAME_FULL))
continue;
/* If the update involves invalidating rules, do the inode-based
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a6c3f1abd206..7f19f23d38a3 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -342,6 +342,8 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
f->val = rule->values[i];
+ f->uid = INVALID_UID;
+ f->gid = INVALID_GID;
err = -EINVAL;
if (f->op == Audit_bad)
@@ -350,16 +352,32 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
switch(f->type) {
default:
goto exit_free;
- case AUDIT_PID:
case AUDIT_UID:
case AUDIT_EUID:
case AUDIT_SUID:
case AUDIT_FSUID:
+ case AUDIT_LOGINUID:
+ /* bit ops not implemented for uid comparisons */
+ if (f->op == Audit_bitmask || f->op == Audit_bittest)
+ goto exit_free;
+
+ f->uid = make_kuid(current_user_ns(), f->val);
+ if (!uid_valid(f->uid))
+ goto exit_free;
+ break;
case AUDIT_GID:
case AUDIT_EGID:
case AUDIT_SGID:
case AUDIT_FSGID:
- case AUDIT_LOGINUID:
+ /* bit ops not implemented for gid comparisons */
+ if (f->op == Audit_bitmask || f->op == Audit_bittest)
+ goto exit_free;
+
+ f->gid = make_kgid(current_user_ns(), f->val);
+ if (!gid_valid(f->gid))
+ goto exit_free;
+ break;
+ case AUDIT_PID:
case AUDIT_PERS:
case AUDIT_MSGTYPE:
case AUDIT_PPID:
@@ -437,19 +455,39 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
f->type = data->fields[i];
f->val = data->values[i];
+ f->uid = INVALID_UID;
+ f->gid = INVALID_GID;
f->lsm_str = NULL;
f->lsm_rule = NULL;
switch(f->type) {
- case AUDIT_PID:
case AUDIT_UID:
case AUDIT_EUID:
case AUDIT_SUID:
case AUDIT_FSUID:
+ case AUDIT_LOGINUID:
+ case AUDIT_OBJ_UID:
+ /* bit ops not implemented for uid comparisons */
+ if (f->op == Audit_bitmask || f->op == Audit_bittest)
+ goto exit_free;
+
+ f->uid = make_kuid(current_user_ns(), f->val);
+ if (!uid_valid(f->uid))
+ goto exit_free;
+ break;
case AUDIT_GID:
case AUDIT_EGID:
case AUDIT_SGID:
case AUDIT_FSGID:
- case AUDIT_LOGINUID:
+ case AUDIT_OBJ_GID:
+ /* bit ops not implemented for gid comparisons */
+ if (f->op == Audit_bitmask || f->op == Audit_bittest)
+ goto exit_free;
+
+ f->gid = make_kgid(current_user_ns(), f->val);
+ if (!gid_valid(f->gid))
+ goto exit_free;
+ break;
+ case AUDIT_PID:
case AUDIT_PERS:
case AUDIT_MSGTYPE:
case AUDIT_PPID:
@@ -461,8 +499,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
case AUDIT_ARG1:
case AUDIT_ARG2:
case AUDIT_ARG3:
- case AUDIT_OBJ_UID:
- case AUDIT_OBJ_GID:
break;
case AUDIT_ARCH:
entry->rule.arch_f = f;
@@ -707,6 +743,23 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
if (strcmp(a->filterkey, b->filterkey))
return 1;
break;
+ case AUDIT_UID:
+ case AUDIT_EUID:
+ case AUDIT_SUID:
+ case AUDIT_FSUID:
+ case AUDIT_LOGINUID:
+ case AUDIT_OBJ_UID:
+ if (!uid_eq(a->fields[i].uid, b->fields[i].uid))
+ return 1;
+ break;
+ case AUDIT_GID:
+ case AUDIT_EGID:
+ case AUDIT_SGID:
+ case AUDIT_FSGID:
+ case AUDIT_OBJ_GID:
+ if (!gid_eq(a->fields[i].gid, b->fields[i].gid))
+ return 1;
+ break;
default:
if (a->fields[i].val != b->fields[i].val)
return 1;
@@ -1056,7 +1109,7 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
}
/* Log rule additions and removals */
-static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
+static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid,
char *action, struct audit_krule *rule,
int res)
{
@@ -1068,7 +1121,8 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
if (!ab)
return;
- audit_log_format(ab, "auid=%u ses=%u", loginuid, sessionid);
+ audit_log_format(ab, "auid=%u ses=%u",
+ from_kuid(&init_user_ns, loginuid), sessionid);
if (sid) {
char *ctx = NULL;
u32 len;
@@ -1098,8 +1152,8 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
* @sessionid: sessionid for netlink audit message
* @sid: SE Linux Security ID of sender
*/
-int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
- size_t datasz, uid_t loginuid, u32 sessionid, u32 sid)
+int audit_receive_filter(int type, int pid, int seq, void *data,
+ size_t datasz, kuid_t loginuid, u32 sessionid, u32 sid)
{
struct task_struct *tsk;
struct audit_netlink_list *dest;
@@ -1198,46 +1252,110 @@ int audit_comparator(u32 left, u32 op, u32 right)
}
}
-/* Compare given dentry name with last component in given path,
- * return of 0 indicates a match. */
-int audit_compare_dname_path(const char *dname, const char *path,
- int *dirlen)
+int audit_uid_comparator(kuid_t left, u32 op, kuid_t right)
{
- int dlen, plen;
- const char *p;
+ switch (op) {
+ case Audit_equal:
+ return uid_eq(left, right);
+ case Audit_not_equal:
+ return !uid_eq(left, right);
+ case Audit_lt:
+ return uid_lt(left, right);
+ case Audit_le:
+ return uid_lte(left, right);
+ case Audit_gt:
+ return uid_gt(left, right);
+ case Audit_ge:
+ return uid_gte(left, right);
+ case Audit_bitmask:
+ case Audit_bittest:
+ default:
+ BUG();
+ return 0;
+ }
+}
- if (!dname || !path)
- return 1;
+int audit_gid_comparator(kgid_t left, u32 op, kgid_t right)
+{
+ switch (op) {
+ case Audit_equal:
+ return gid_eq(left, right);
+ case Audit_not_equal:
+ return !gid_eq(left, right);
+ case Audit_lt:
+ return gid_lt(left, right);
+ case Audit_le:
+ return gid_lte(left, right);
+ case Audit_gt:
+ return gid_gt(left, right);
+ case Audit_ge:
+ return gid_gte(left, right);
+ case Audit_bitmask:
+ case Audit_bittest:
+ default:
+ BUG();
+ return 0;
+ }
+}
+
+/**
+ * parent_len - find the length of the parent portion of a pathname
+ * @path: pathname of which to determine length
+ */
+int parent_len(const char *path)
+{
+ int plen;
+ const char *p;
- dlen = strlen(dname);
plen = strlen(path);
- if (plen < dlen)
- return 1;
+
+ if (plen == 0)
+ return plen;
/* disregard trailing slashes */
p = path + plen - 1;
while ((*p == '/') && (p > path))
p--;
- /* find last path component */
- p = p - dlen + 1;
- if (p < path)
+ /* walk backward until we find the next slash or hit beginning */
+ while ((*p != '/') && (p > path))
+ p--;
+
+ /* did we find a slash? Then increment to include it in path */
+ if (*p == '/')
+ p++;
+
+ return p - path;
+}
+
+/**
+ * audit_compare_dname_path - compare given dentry name with last component in
+ * given path. Return of 0 indicates a match.
+ * @dname: dentry name that we're comparing
+ * @path: full pathname that we're comparing
+ * @parentlen: length of the parent if known. Passing in AUDIT_NAME_FULL
+ * here indicates that we must compute this value.
+ */
+int audit_compare_dname_path(const char *dname, const char *path, int parentlen)
+{
+ int dlen, pathlen;
+ const char *p;
+
+ dlen = strlen(dname);
+ pathlen = strlen(path);
+ if (pathlen < dlen)
return 1;
- else if (p > path) {
- if (*--p != '/')
- return 1;
- else
- p++;
- }
- /* return length of path's directory component */
- if (dirlen)
- *dirlen = p - path;
+ parentlen = parentlen == AUDIT_NAME_FULL ? parent_len(path) : parentlen;
+ if (pathlen - parentlen != dlen)
+ return 1;
+
+ p = path + parentlen;
+
return strncmp(p, dname, dlen);
}
-static int audit_filter_user_rules(struct netlink_skb_parms *cb,
- struct audit_krule *rule,
+static int audit_filter_user_rules(struct audit_krule *rule,
enum audit_state *state)
{
int i;
@@ -1249,17 +1367,17 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
switch (f->type) {
case AUDIT_PID:
- result = audit_comparator(cb->creds.pid, f->op, f->val);
+ result = audit_comparator(task_pid_vnr(current), f->op, f->val);
break;
case AUDIT_UID:
- result = audit_comparator(cb->creds.uid, f->op, f->val);
+ result = audit_uid_comparator(current_uid(), f->op, f->uid);
break;
case AUDIT_GID:
- result = audit_comparator(cb->creds.gid, f->op, f->val);
+ result = audit_gid_comparator(current_gid(), f->op, f->gid);
break;
case AUDIT_LOGINUID:
- result = audit_comparator(audit_get_loginuid(current),
- f->op, f->val);
+ result = audit_uid_comparator(audit_get_loginuid(current),
+ f->op, f->uid);
break;
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
@@ -1287,7 +1405,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
return 1;
}
-int audit_filter_user(struct netlink_skb_parms *cb)
+int audit_filter_user(void)
{
enum audit_state state = AUDIT_DISABLED;
struct audit_entry *e;
@@ -1295,7 +1413,7 @@ int audit_filter_user(struct netlink_skb_parms *cb)
rcu_read_lock();
list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
- if (audit_filter_user_rules(cb, &e->rule, &state)) {
+ if (audit_filter_user_rules(&e->rule, &state)) {
if (state == AUDIT_DISABLED)
ret = 0;
break;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 4b96415527b8..2f186ed80c40 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -81,9 +81,6 @@
* a name dynamically and also add those to the list anchored by names_list. */
#define AUDIT_NAMES 5
-/* Indicates that audit should log the full pathname. */
-#define AUDIT_NAME_FULL -1
-
/* no execve audit message should be longer than this (userspace limits) */
#define MAX_EXECVE_AUDIT_LEN 7500
@@ -106,27 +103,29 @@ struct audit_cap_data {
* we don't let putname() free it (instead we free all of the saved
* pointers at syscall exit time).
*
- * Further, in fs/namei.c:path_lookup() we store the inode and device. */
+ * Further, in fs/namei.c:path_lookup() we store the inode and device.
+ */
struct audit_names {
- struct list_head list; /* audit_context->names_list */
- const char *name;
- unsigned long ino;
- dev_t dev;
- umode_t mode;
- uid_t uid;
- gid_t gid;
- dev_t rdev;
- u32 osid;
- struct audit_cap_data fcap;
- unsigned int fcap_ver;
- int name_len; /* number of name's characters to log */
- bool name_put; /* call __putname() for this name */
+ struct list_head list; /* audit_context->names_list */
+ struct filename *name;
+ unsigned long ino;
+ dev_t dev;
+ umode_t mode;
+ kuid_t uid;
+ kgid_t gid;
+ dev_t rdev;
+ u32 osid;
+ struct audit_cap_data fcap;
+ unsigned int fcap_ver;
+ int name_len; /* number of name's characters to log */
+ unsigned char type; /* record type */
+ bool name_put; /* call __putname() for this name */
/*
* This was an allocated audit_names and not from the array of
* names allocated in the task audit context. Thus this name
* should be freed on syscall exit
*/
- bool should_free;
+ bool should_free;
};
struct audit_aux_data {
@@ -149,8 +148,8 @@ struct audit_aux_data_execve {
struct audit_aux_data_pids {
struct audit_aux_data d;
pid_t target_pid[AUDIT_AUX_PIDS];
- uid_t target_auid[AUDIT_AUX_PIDS];
- uid_t target_uid[AUDIT_AUX_PIDS];
+ kuid_t target_auid[AUDIT_AUX_PIDS];
+ kuid_t target_uid[AUDIT_AUX_PIDS];
unsigned int target_sessionid[AUDIT_AUX_PIDS];
u32 target_sid[AUDIT_AUX_PIDS];
char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN];
@@ -208,14 +207,14 @@ struct audit_context {
size_t sockaddr_len;
/* Save things to print about task_struct */
pid_t pid, ppid;
- uid_t uid, euid, suid, fsuid;
- gid_t gid, egid, sgid, fsgid;
+ kuid_t uid, euid, suid, fsuid;
+ kgid_t gid, egid, sgid, fsgid;
unsigned long personality;
int arch;
pid_t target_pid;
- uid_t target_auid;
- uid_t target_uid;
+ kuid_t target_auid;
+ kuid_t target_uid;
unsigned int target_sessionid;
u32 target_sid;
char target_comm[TASK_COMM_LEN];
@@ -231,8 +230,8 @@ struct audit_context {
long args[6];
} socketcall;
struct {
- uid_t uid;
- gid_t gid;
+ kuid_t uid;
+ kgid_t gid;
umode_t mode;
u32 osid;
int has_perm;
@@ -464,37 +463,47 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
return 0;
}
-static int audit_compare_id(uid_t uid1,
- struct audit_names *name,
- unsigned long name_offset,
- struct audit_field *f,
- struct audit_context *ctx)
+static int audit_compare_uid(kuid_t uid,
+ struct audit_names *name,
+ struct audit_field *f,
+ struct audit_context *ctx)
{
struct audit_names *n;
- unsigned long addr;
- uid_t uid2;
int rc;
-
- BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t));
-
+
if (name) {
- addr = (unsigned long)name;
- addr += name_offset;
-
- uid2 = *(uid_t *)addr;
- rc = audit_comparator(uid1, f->op, uid2);
+ rc = audit_uid_comparator(uid, f->op, name->uid);
if (rc)
return rc;
}
-
+
if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
- addr = (unsigned long)n;
- addr += name_offset;
-
- uid2 = *(uid_t *)addr;
+ rc = audit_uid_comparator(uid, f->op, n->uid);
+ if (rc)
+ return rc;
+ }
+ }
+ return 0;
+}
- rc = audit_comparator(uid1, f->op, uid2);
+static int audit_compare_gid(kgid_t gid,
+ struct audit_names *name,
+ struct audit_field *f,
+ struct audit_context *ctx)
+{
+ struct audit_names *n;
+ int rc;
+
+ if (name) {
+ rc = audit_gid_comparator(gid, f->op, name->gid);
+ if (rc)
+ return rc;
+ }
+
+ if (ctx) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ rc = audit_gid_comparator(gid, f->op, n->gid);
if (rc)
return rc;
}
@@ -511,80 +520,62 @@ static int audit_field_compare(struct task_struct *tsk,
switch (f->val) {
/* process to file object comparisons */
case AUDIT_COMPARE_UID_TO_OBJ_UID:
- return audit_compare_id(cred->uid,
- name, offsetof(struct audit_names, uid),
- f, ctx);
+ return audit_compare_uid(cred->uid, name, f, ctx);
case AUDIT_COMPARE_GID_TO_OBJ_GID:
- return audit_compare_id(cred->gid,
- name, offsetof(struct audit_names, gid),
- f, ctx);
+ return audit_compare_gid(cred->gid, name, f, ctx);
case AUDIT_COMPARE_EUID_TO_OBJ_UID:
- return audit_compare_id(cred->euid,
- name, offsetof(struct audit_names, uid),
- f, ctx);
+ return audit_compare_uid(cred->euid, name, f, ctx);
case AUDIT_COMPARE_EGID_TO_OBJ_GID:
- return audit_compare_id(cred->egid,
- name, offsetof(struct audit_names, gid),
- f, ctx);
+ return audit_compare_gid(cred->egid, name, f, ctx);
case AUDIT_COMPARE_AUID_TO_OBJ_UID:
- return audit_compare_id(tsk->loginuid,
- name, offsetof(struct audit_names, uid),
- f, ctx);
+ return audit_compare_uid(tsk->loginuid, name, f, ctx);
case AUDIT_COMPARE_SUID_TO_OBJ_UID:
- return audit_compare_id(cred->suid,
- name, offsetof(struct audit_names, uid),
- f, ctx);
+ return audit_compare_uid(cred->suid, name, f, ctx);
case AUDIT_COMPARE_SGID_TO_OBJ_GID:
- return audit_compare_id(cred->sgid,
- name, offsetof(struct audit_names, gid),
- f, ctx);
+ return audit_compare_gid(cred->sgid, name, f, ctx);
case AUDIT_COMPARE_FSUID_TO_OBJ_UID:
- return audit_compare_id(cred->fsuid,
- name, offsetof(struct audit_names, uid),
- f, ctx);
+ return audit_compare_uid(cred->fsuid, name, f, ctx);
case AUDIT_COMPARE_FSGID_TO_OBJ_GID:
- return audit_compare_id(cred->fsgid,
- name, offsetof(struct audit_names, gid),
- f, ctx);
+ return audit_compare_gid(cred->fsgid, name, f, ctx);
/* uid comparisons */
case AUDIT_COMPARE_UID_TO_AUID:
- return audit_comparator(cred->uid, f->op, tsk->loginuid);
+ return audit_uid_comparator(cred->uid, f->op, tsk->loginuid);
case AUDIT_COMPARE_UID_TO_EUID:
- return audit_comparator(cred->uid, f->op, cred->euid);
+ return audit_uid_comparator(cred->uid, f->op, cred->euid);
case AUDIT_COMPARE_UID_TO_SUID:
- return audit_comparator(cred->uid, f->op, cred->suid);
+ return audit_uid_comparator(cred->uid, f->op, cred->suid);
case AUDIT_COMPARE_UID_TO_FSUID:
- return audit_comparator(cred->uid, f->op, cred->fsuid);
+ return audit_uid_comparator(cred->uid, f->op, cred->fsuid);
/* auid comparisons */
case AUDIT_COMPARE_AUID_TO_EUID:
- return audit_comparator(tsk->loginuid, f->op, cred->euid);
+ return audit_uid_comparator(tsk->loginuid, f->op, cred->euid);
case AUDIT_COMPARE_AUID_TO_SUID:
- return audit_comparator(tsk->loginuid, f->op, cred->suid);
+ return audit_uid_comparator(tsk->loginuid, f->op, cred->suid);
case AUDIT_COMPARE_AUID_TO_FSUID:
- return audit_comparator(tsk->loginuid, f->op, cred->fsuid);
+ return audit_uid_comparator(tsk->loginuid, f->op, cred->fsuid);
/* euid comparisons */
case AUDIT_COMPARE_EUID_TO_SUID:
- return audit_comparator(cred->euid, f->op, cred->suid);
+ return audit_uid_comparator(cred->euid, f->op, cred->suid);
case AUDIT_COMPARE_EUID_TO_FSUID:
- return audit_comparator(cred->euid, f->op, cred->fsuid);
+ return audit_uid_comparator(cred->euid, f->op, cred->fsuid);
/* suid comparisons */
case AUDIT_COMPARE_SUID_TO_FSUID:
- return audit_comparator(cred->suid, f->op, cred->fsuid);
+ return audit_uid_comparator(cred->suid, f->op, cred->fsuid);
/* gid comparisons */
case AUDIT_COMPARE_GID_TO_EGID:
- return audit_comparator(cred->gid, f->op, cred->egid);
+ return audit_gid_comparator(cred->gid, f->op, cred->egid);
case AUDIT_COMPARE_GID_TO_SGID:
- return audit_comparator(cred->gid, f->op, cred->sgid);
+ return audit_gid_comparator(cred->gid, f->op, cred->sgid);
case AUDIT_COMPARE_GID_TO_FSGID:
- return audit_comparator(cred->gid, f->op, cred->fsgid);
+ return audit_gid_comparator(cred->gid, f->op, cred->fsgid);
/* egid comparisons */
case AUDIT_COMPARE_EGID_TO_SGID:
- return audit_comparator(cred->egid, f->op, cred->sgid);
+ return audit_gid_comparator(cred->egid, f->op, cred->sgid);
case AUDIT_COMPARE_EGID_TO_FSGID:
- return audit_comparator(cred->egid, f->op, cred->fsgid);
+ return audit_gid_comparator(cred->egid, f->op, cred->fsgid);
/* sgid comparison */
case AUDIT_COMPARE_SGID_TO_FSGID:
- return audit_comparator(cred->sgid, f->op, cred->fsgid);
+ return audit_gid_comparator(cred->sgid, f->op, cred->fsgid);
default:
WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n");
return 0;
@@ -630,28 +621,28 @@ static int audit_filter_rules(struct task_struct *tsk,
}
break;
case AUDIT_UID:
- result = audit_comparator(cred->uid, f->op, f->val);
+ result = audit_uid_comparator(cred->uid, f->op, f->uid);
break;
case AUDIT_EUID:
- result = audit_comparator(cred->euid, f->op, f->val);
+ result = audit_uid_comparator(cred->euid, f->op, f->uid);
break;
case AUDIT_SUID:
- result = audit_comparator(cred->suid, f->op, f->val);
+ result = audit_uid_comparator(cred->suid, f->op, f->uid);
break;
case AUDIT_FSUID:
- result = audit_comparator(cred->fsuid, f->op, f->val);
+ result = audit_uid_comparator(cred->fsuid, f->op, f->uid);
break;
case AUDIT_GID:
- result = audit_comparator(cred->gid, f->op, f->val);
+ result = audit_gid_comparator(cred->gid, f->op, f->gid);
break;
case AUDIT_EGID:
- result = audit_comparator(cred->egid, f->op, f->val);
+ result = audit_gid_comparator(cred->egid, f->op, f->gid);
break;
case AUDIT_SGID:
- result = audit_comparator(cred->sgid, f->op, f->val);
+ result = audit_gid_comparator(cred->sgid, f->op, f->gid);
break;
case AUDIT_FSGID:
- result = audit_comparator(cred->fsgid, f->op, f->val);
+ result = audit_gid_comparator(cred->fsgid, f->op, f->gid);
break;
case AUDIT_PERS:
result = audit_comparator(tsk->personality, f->op, f->val);
@@ -717,10 +708,10 @@ static int audit_filter_rules(struct task_struct *tsk,
break;
case AUDIT_OBJ_UID:
if (name) {
- result = audit_comparator(name->uid, f->op, f->val);
+ result = audit_uid_comparator(name->uid, f->op, f->uid);
} else if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
- if (audit_comparator(n->uid, f->op, f->val)) {
+ if (audit_uid_comparator(n->uid, f->op, f->uid)) {
++result;
break;
}
@@ -729,10 +720,10 @@ static int audit_filter_rules(struct task_struct *tsk,
break;
case AUDIT_OBJ_GID:
if (name) {
- result = audit_comparator(name->gid, f->op, f->val);
+ result = audit_gid_comparator(name->gid, f->op, f->gid);
} else if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
- if (audit_comparator(n->gid, f->op, f->val)) {
+ if (audit_gid_comparator(n->gid, f->op, f->gid)) {
++result;
break;
}
@@ -750,7 +741,7 @@ static int audit_filter_rules(struct task_struct *tsk,
case AUDIT_LOGINUID:
result = 0;
if (ctx)
- result = audit_comparator(tsk->loginuid, f->op, f->val);
+ result = audit_uid_comparator(tsk->loginuid, f->op, f->uid);
break;
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
@@ -1006,7 +997,7 @@ static inline void audit_free_names(struct audit_context *context)
context->ino_count);
list_for_each_entry(n, &context->names_list, list) {
printk(KERN_ERR "names[%d] = %p = %s\n", i,
- n->name, n->name ?: "(null)");
+ n->name, n->name->name ?: "(null)");
}
dump_stack();
return;
@@ -1154,13 +1145,43 @@ error_path:
EXPORT_SYMBOL(audit_log_task_context);
-static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
+void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
{
+ const struct cred *cred;
char name[sizeof(tsk->comm)];
struct mm_struct *mm = tsk->mm;
- struct vm_area_struct *vma;
+ char *tty;
+
+ if (!ab)
+ return;
/* tsk == current */
+ cred = current_cred();
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
+ tty = tsk->signal->tty->name;
+ else
+ tty = "(none)";
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+
+ audit_log_format(ab,
+ " ppid=%ld pid=%d auid=%u uid=%u gid=%u"
+ " euid=%u suid=%u fsuid=%u"
+ " egid=%u sgid=%u fsgid=%u ses=%u tty=%s",
+ sys_getppid(),
+ tsk->pid,
+ from_kuid(&init_user_ns, tsk->loginuid),
+ from_kuid(&init_user_ns, cred->uid),
+ from_kgid(&init_user_ns, cred->gid),
+ from_kuid(&init_user_ns, cred->euid),
+ from_kuid(&init_user_ns, cred->suid),
+ from_kuid(&init_user_ns, cred->fsuid),
+ from_kgid(&init_user_ns, cred->egid),
+ from_kgid(&init_user_ns, cred->sgid),
+ from_kgid(&init_user_ns, cred->fsgid),
+ tsk->sessionid, tty);
get_task_comm(name, tsk);
audit_log_format(ab, " comm=");
@@ -1168,23 +1189,17 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk
if (mm) {
down_read(&mm->mmap_sem);
- vma = mm->mmap;
- while (vma) {
- if ((vma->vm_flags & VM_EXECUTABLE) &&
- vma->vm_file) {
- audit_log_d_path(ab, " exe=",
- &vma->vm_file->f_path);
- break;
- }
- vma = vma->vm_next;
- }
+ if (mm->exe_file)
+ audit_log_d_path(ab, " exe=", &mm->exe_file->f_path);
up_read(&mm->mmap_sem);
}
audit_log_task_context(ab);
}
+EXPORT_SYMBOL(audit_log_task_info);
+
static int audit_log_pid_context(struct audit_context *context, pid_t pid,
- uid_t auid, uid_t uid, unsigned int sessionid,
+ kuid_t auid, kuid_t uid, unsigned int sessionid,
u32 sid, char *comm)
{
struct audit_buffer *ab;
@@ -1196,8 +1211,9 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
if (!ab)
return rc;
- audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid,
- uid, sessionid);
+ audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid,
+ from_kuid(&init_user_ns, auid),
+ from_kuid(&init_user_ns, uid), sessionid);
if (security_secid_to_secctx(sid, &ctx, &len)) {
audit_log_format(ab, " obj=(none)");
rc = 1;
@@ -1447,7 +1463,9 @@ static void show_special(struct audit_context *context, int *call_panic)
u32 osid = context->ipc.osid;
audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho",
- context->ipc.uid, context->ipc.gid, context->ipc.mode);
+ from_kuid(&init_user_ns, context->ipc.uid),
+ from_kgid(&init_user_ns, context->ipc.gid),
+ context->ipc.mode);
if (osid) {
char *ctx = NULL;
u32 len;
@@ -1536,7 +1554,7 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,
case AUDIT_NAME_FULL:
/* log the full path */
audit_log_format(ab, " name=");
- audit_log_untrustedstring(ab, n->name);
+ audit_log_untrustedstring(ab, n->name->name);
break;
case 0:
/* name was specified as a relative path and the
@@ -1546,7 +1564,7 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,
default:
/* log the name's directory component */
audit_log_format(ab, " name=");
- audit_log_n_untrustedstring(ab, n->name,
+ audit_log_n_untrustedstring(ab, n->name->name,
n->name_len);
}
} else
@@ -1560,8 +1578,8 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,
MAJOR(n->dev),
MINOR(n->dev),
n->mode,
- n->uid,
- n->gid,
+ from_kuid(&init_user_ns, n->uid),
+ from_kgid(&init_user_ns, n->gid),
MAJOR(n->rdev),
MINOR(n->rdev));
}
@@ -1585,26 +1603,12 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,
static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
{
- const struct cred *cred;
int i, call_panic = 0;
struct audit_buffer *ab;
struct audit_aux_data *aux;
- const char *tty;
struct audit_names *n;
/* tsk == current */
- context->pid = tsk->pid;
- if (!context->ppid)
- context->ppid = sys_getppid();
- cred = current_cred();
- context->uid = cred->uid;
- context->gid = cred->gid;
- context->euid = cred->euid;
- context->suid = cred->suid;
- context->fsuid = cred->fsuid;
- context->egid = cred->egid;
- context->sgid = cred->sgid;
- context->fsgid = cred->fsgid;
context->personality = tsk->personality;
ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
@@ -1619,32 +1623,13 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
(context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
context->return_code);
- spin_lock_irq(&tsk->sighand->siglock);
- if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
- tty = tsk->signal->tty->name;
- else
- tty = "(none)";
- spin_unlock_irq(&tsk->sighand->siglock);
-
audit_log_format(ab,
- " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
- " ppid=%d pid=%d auid=%u uid=%u gid=%u"
- " euid=%u suid=%u fsuid=%u"
- " egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
- context->argv[0],
- context->argv[1],
- context->argv[2],
- context->argv[3],
- context->name_count,
- context->ppid,
- context->pid,
- tsk->loginuid,
- context->uid,
- context->gid,
- context->euid, context->suid, context->fsuid,
- context->egid, context->sgid, context->fsgid, tty,
- tsk->sessionid);
-
+ " a0=%lx a1=%lx a2=%lx a3=%lx items=%d",
+ context->argv[0],
+ context->argv[1],
+ context->argv[2],
+ context->argv[3],
+ context->name_count);
audit_log_task_info(ab, tsk);
audit_log_key(ab, context->filterkey);
@@ -2009,7 +1994,8 @@ retry:
#endif
}
-static struct audit_names *audit_alloc_name(struct audit_context *context)
+static struct audit_names *audit_alloc_name(struct audit_context *context,
+ unsigned char type)
{
struct audit_names *aname;
@@ -2024,6 +2010,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context)
}
aname->ino = (unsigned long)-1;
+ aname->type = type;
list_add_tail(&aname->list, &context->names_list);
context->name_count++;
@@ -2034,13 +2021,36 @@ static struct audit_names *audit_alloc_name(struct audit_context *context)
}
/**
+ * audit_reusename - fill out filename with info from existing entry
+ * @uptr: userland ptr to pathname
+ *
+ * Search the audit_names list for the current audit context. If there is an
+ * existing entry with a matching "uptr" then return the filename
+ * associated with that audit_name. If not, return NULL.
+ */
+struct filename *
+__audit_reusename(const __user char *uptr)
+{
+ struct audit_context *context = current->audit_context;
+ struct audit_names *n;
+
+ list_for_each_entry(n, &context->names_list, list) {
+ if (!n->name)
+ continue;
+ if (n->name->uptr == uptr)
+ return n->name;
+ }
+ return NULL;
+}
+
+/**
* audit_getname - add a name to the list
* @name: name to add
*
* Add a name to the list of audit names for this context.
* Called from fs/namei.c:getname().
*/
-void __audit_getname(const char *name)
+void __audit_getname(struct filename *name)
{
struct audit_context *context = current->audit_context;
struct audit_names *n;
@@ -2054,13 +2064,19 @@ void __audit_getname(const char *name)
return;
}
- n = audit_alloc_name(context);
+#if AUDIT_DEBUG
+ /* The filename _must_ have a populated ->name */
+ BUG_ON(!name->name);
+#endif
+
+ n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
if (!n)
return;
n->name = name;
n->name_len = AUDIT_NAME_FULL;
n->name_put = true;
+ name->aname = n;
if (!context->pwd.dentry)
get_fs_pwd(current->fs, &context->pwd);
@@ -2073,7 +2089,7 @@ void __audit_getname(const char *name)
* then we delay the putname until syscall exit.
* Called from include/linux/fs.h:putname().
*/
-void audit_putname(const char *name)
+void audit_putname(struct filename *name)
{
struct audit_context *context = current->audit_context;
@@ -2088,7 +2104,7 @@ void audit_putname(const char *name)
list_for_each_entry(n, &context->names_list, list)
printk(KERN_ERR "name[%d] = %p = %s\n", i,
- n->name, n->name ?: "(null)");
+ n->name, n->name->name ?: "(null)");
}
#endif
__putname(name);
@@ -2102,8 +2118,8 @@ void audit_putname(const char *name)
" put_count=%d\n",
__FILE__, __LINE__,
context->serial, context->major,
- context->in_syscall, name, context->name_count,
- context->put_count);
+ context->in_syscall, name->name,
+ context->name_count, context->put_count);
dump_stack();
}
}
@@ -2146,13 +2162,13 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent
}
/**
- * audit_inode - store the inode and device from a lookup
+ * __audit_inode - store the inode and device from a lookup
* @name: name being audited
* @dentry: dentry being audited
- *
- * Called from fs/namei.c:path_lookup().
+ * @parent: does this dentry represent the parent?
*/
-void __audit_inode(const char *name, const struct dentry *dentry)
+void __audit_inode(struct filename *name, const struct dentry *dentry,
+ unsigned int parent)
{
struct audit_context *context = current->audit_context;
const struct inode *inode = dentry->d_inode;
@@ -2161,24 +2177,69 @@ void __audit_inode(const char *name, const struct dentry *dentry)
if (!context->in_syscall)
return;
+ if (!name)
+ goto out_alloc;
+
+#if AUDIT_DEBUG
+ /* The struct filename _must_ have a populated ->name */
+ BUG_ON(!name->name);
+#endif
+ /*
+ * If we have a pointer to an audit_names entry already, then we can
+ * just use it directly if the type is correct.
+ */
+ n = name->aname;
+ if (n) {
+ if (parent) {
+ if (n->type == AUDIT_TYPE_PARENT ||
+ n->type == AUDIT_TYPE_UNKNOWN)
+ goto out;
+ } else {
+ if (n->type != AUDIT_TYPE_PARENT)
+ goto out;
+ }
+ }
+
list_for_each_entry_reverse(n, &context->names_list, list) {
- if (n->name && (n->name == name))
- goto out;
+ /* does the name pointer match? */
+ if (!n->name || n->name->name != name->name)
+ continue;
+
+ /* match the correct record type */
+ if (parent) {
+ if (n->type == AUDIT_TYPE_PARENT ||
+ n->type == AUDIT_TYPE_UNKNOWN)
+ goto out;
+ } else {
+ if (n->type != AUDIT_TYPE_PARENT)
+ goto out;
+ }
}
- /* unable to find the name from a previous getname() */
- n = audit_alloc_name(context);
+out_alloc:
+ /* unable to find the name from a previous getname(). Allocate a new
+ * anonymous entry.
+ */
+ n = audit_alloc_name(context, AUDIT_TYPE_NORMAL);
if (!n)
return;
out:
+ if (parent) {
+ n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
+ n->type = AUDIT_TYPE_PARENT;
+ } else {
+ n->name_len = AUDIT_NAME_FULL;
+ n->type = AUDIT_TYPE_NORMAL;
+ }
handle_path(dentry);
audit_copy_inode(n, dentry, inode);
}
/**
- * audit_inode_child - collect inode info for created/removed objects
- * @dentry: dentry being audited
+ * __audit_inode_child - collect inode info for created/removed objects
* @parent: inode of dentry parent
+ * @dentry: dentry being audited
+ * @type: AUDIT_TYPE_* value that we're looking for
*
* For syscalls that create or remove filesystem objects, audit_inode
* can only collect information for the filesystem object's parent.
@@ -2188,15 +2249,14 @@ out:
* must be hooked prior, in order to capture the target inode during
* unsuccessful attempts.
*/
-void __audit_inode_child(const struct dentry *dentry,
- const struct inode *parent)
+void __audit_inode_child(const struct inode *parent,
+ const struct dentry *dentry,
+ const unsigned char type)
{
struct audit_context *context = current->audit_context;
- const char *found_parent = NULL, *found_child = NULL;
const struct inode *inode = dentry->d_inode;
const char *dname = dentry->d_name.name;
- struct audit_names *n;
- int dirlen = 0;
+ struct audit_names *n, *found_parent = NULL, *found_child = NULL;
if (!context->in_syscall)
return;
@@ -2204,62 +2264,65 @@ void __audit_inode_child(const struct dentry *dentry,
if (inode)
handle_one(inode);
- /* parent is more likely, look for it first */
+ /* look for a parent entry first */
list_for_each_entry(n, &context->names_list, list) {
- if (!n->name)
+ if (!n->name || n->type != AUDIT_TYPE_PARENT)
continue;
if (n->ino == parent->i_ino &&
- !audit_compare_dname_path(dname, n->name, &dirlen)) {
- n->name_len = dirlen; /* update parent data in place */
- found_parent = n->name;
- goto add_names;
+ !audit_compare_dname_path(dname, n->name->name, n->name_len)) {
+ found_parent = n;
+ break;
}
}
- /* no matching parent, look for matching child */
+ /* is there a matching child entry? */
list_for_each_entry(n, &context->names_list, list) {
- if (!n->name)
+ /* can only match entries that have a name */
+ if (!n->name || n->type != type)
continue;
- /* strcmp() is the more likely scenario */
- if (!strcmp(dname, n->name) ||
- !audit_compare_dname_path(dname, n->name, &dirlen)) {
- if (inode)
- audit_copy_inode(n, NULL, inode);
- else
- n->ino = (unsigned long)-1;
- found_child = n->name;
- goto add_names;
+ /* if we found a parent, make sure this one is a child of it */
+ if (found_parent && (n->name != found_parent->name))
+ continue;
+
+ if (!strcmp(dname, n->name->name) ||
+ !audit_compare_dname_path(dname, n->name->name,
+ found_parent ?
+ found_parent->name_len :
+ AUDIT_NAME_FULL)) {
+ found_child = n;
+ break;
}
}
-add_names:
if (!found_parent) {
- n = audit_alloc_name(context);
+ /* create a new, "anonymous" parent record */
+ n = audit_alloc_name(context, AUDIT_TYPE_PARENT);
if (!n)
return;
audit_copy_inode(n, NULL, parent);
}
if (!found_child) {
- n = audit_alloc_name(context);
- if (!n)
+ found_child = audit_alloc_name(context, type);
+ if (!found_child)
return;
/* Re-use the name belonging to the slot for a matching parent
* directory. All names for this context are relinquished in
* audit_free_names() */
if (found_parent) {
- n->name = found_parent;
- n->name_len = AUDIT_NAME_FULL;
+ found_child->name = found_parent->name;
+ found_child->name_len = AUDIT_NAME_FULL;
/* don't call __putname() */
- n->name_put = false;
+ found_child->name_put = false;
}
-
- if (inode)
- audit_copy_inode(n, NULL, inode);
}
+ if (inode)
+ audit_copy_inode(found_child, dentry, inode);
+ else
+ found_child->ino = (unsigned long)-1;
}
EXPORT_SYMBOL_GPL(__audit_inode_child);
@@ -2299,14 +2362,14 @@ static atomic_t session_id = ATOMIC_INIT(0);
*
* Called (set) from fs/proc/base.c::proc_loginuid_write().
*/
-int audit_set_loginuid(uid_t loginuid)
+int audit_set_loginuid(kuid_t loginuid)
{
struct task_struct *task = current;
struct audit_context *context = task->audit_context;
unsigned int sessionid;
#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE
- if (task->loginuid != -1)
+ if (uid_valid(task->loginuid))
return -EPERM;
#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
if (!capable(CAP_AUDIT_CONTROL))
@@ -2322,8 +2385,10 @@ int audit_set_loginuid(uid_t loginuid)
audit_log_format(ab, "login pid=%d uid=%u "
"old auid=%u new auid=%u"
" old ses=%u new ses=%u",
- task->pid, task_uid(task),
- task->loginuid, loginuid,
+ task->pid,
+ from_kuid(&init_user_ns, task_uid(task)),
+ from_kuid(&init_user_ns, task->loginuid),
+ from_kuid(&init_user_ns, loginuid),
task->sessionid, sessionid);
audit_log_end(ab);
}
@@ -2546,12 +2611,12 @@ int __audit_signal_info(int sig, struct task_struct *t)
struct audit_aux_data_pids *axp;
struct task_struct *tsk = current;
struct audit_context *ctx = tsk->audit_context;
- uid_t uid = current_uid(), t_uid = task_uid(t);
+ kuid_t uid = current_uid(), t_uid = task_uid(t);
if (audit_pid && t->tgid == audit_pid) {
if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
audit_sig_pid = tsk->pid;
- if (tsk->loginuid != -1)
+ if (uid_valid(tsk->loginuid))
audit_sig_uid = tsk->loginuid;
else
audit_sig_uid = uid;
@@ -2672,8 +2737,8 @@ void __audit_mmap_fd(int fd, int flags)
static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
{
- uid_t auid, uid;
- gid_t gid;
+ kuid_t auid, uid;
+ kgid_t gid;
unsigned int sessionid;
auid = audit_get_loginuid(current);
@@ -2681,7 +2746,10 @@ static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
current_uid_gid(&uid, &gid);
audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
- auid, uid, gid, sessionid);
+ from_kuid(&init_user_ns, auid),
+ from_kuid(&init_user_ns, uid),
+ from_kgid(&init_user_ns, gid),
+ sessionid);
audit_log_task_context(ab);
audit_log_format(ab, " pid=%d comm=", current->pid);
audit_log_untrustedstring(ab, current->comm);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 79818507e444..13774b3b39aa 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -88,11 +88,12 @@ static DEFINE_MUTEX(cgroup_root_mutex);
/*
* Generate an array of cgroup subsystem pointers. At boot time, this is
- * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
+ * populated with the built in subsystems, and modular subsystems are
* registered after that. The mutable section of this array is protected by
* cgroup_mutex.
*/
-#define SUBSYS(_x) &_x ## _subsys,
+#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys,
+#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
#include <linux/cgroup_subsys.h>
};
@@ -111,13 +112,13 @@ struct cgroupfs_root {
* The bitmask of subsystems intended to be attached to this
* hierarchy
*/
- unsigned long subsys_bits;
+ unsigned long subsys_mask;
/* Unique id for this hierarchy. */
int hierarchy_id;
/* The bitmask of subsystems currently attached to this hierarchy */
- unsigned long actual_subsys_bits;
+ unsigned long actual_subsys_mask;
/* A list running through the attached subsystems */
struct list_head subsys_list;
@@ -276,7 +277,8 @@ inline int cgroup_is_removed(const struct cgroup *cgrp)
/* bits in struct cgroupfs_root flags field */
enum {
- ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
+ ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
+ ROOT_XATTR, /* supports extended attributes */
};
static int cgroup_is_releasable(const struct cgroup *cgrp)
@@ -556,7 +558,7 @@ static struct css_set *find_existing_css_set(
* won't change, so no need for locking.
*/
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- if (root->subsys_bits & (1UL << i)) {
+ if (root->subsys_mask & (1UL << i)) {
/* Subsystem is in this hierarchy. So we want
* the subsystem state from the new
* cgroup */
@@ -824,7 +826,8 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int);
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
-static int cgroup_populate_dir(struct cgroup *cgrp);
+static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
+ unsigned long subsys_mask);
static const struct inode_operations cgroup_dir_inode_operations;
static const struct file_operations proc_cgroupstats_operations;
@@ -912,15 +915,19 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
*/
BUG_ON(!list_empty(&cgrp->pidlists));
+ simple_xattrs_free(&cgrp->xattrs);
+
kfree_rcu(cgrp, rcu_head);
} else {
struct cfent *cfe = __d_cfe(dentry);
struct cgroup *cgrp = dentry->d_parent->d_fsdata;
+ struct cftype *cft = cfe->type;
WARN_ONCE(!list_empty(&cfe->node) &&
cgrp != &cgrp->root->top_cgroup,
"cfe still linked for %s\n", cfe->type->name);
kfree(cfe);
+ simple_xattrs_free(&cft->xattrs);
}
iput(inode);
}
@@ -963,12 +970,29 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
return -ENOENT;
}
-static void cgroup_clear_directory(struct dentry *dir)
+/**
+ * cgroup_clear_directory - selective removal of base and subsystem files
+ * @dir: directory containing the files
+ * @base_files: true if the base files should be removed
+ * @subsys_mask: mask of the subsystem ids whose files should be removed
+ */
+static void cgroup_clear_directory(struct dentry *dir, bool base_files,
+ unsigned long subsys_mask)
{
struct cgroup *cgrp = __d_cgrp(dir);
+ struct cgroup_subsys *ss;
- while (!list_empty(&cgrp->files))
- cgroup_rm_file(cgrp, NULL);
+ for_each_subsys(cgrp->root, ss) {
+ struct cftype_set *set;
+ if (!test_bit(ss->subsys_id, &subsys_mask))
+ continue;
+ list_for_each_entry(set, &ss->cftsets, node)
+ cgroup_rm_file(cgrp, set->cfts);
+ }
+ if (base_files) {
+ while (!list_empty(&cgrp->files))
+ cgroup_rm_file(cgrp, NULL);
+ }
}
/*
@@ -977,8 +1001,9 @@ static void cgroup_clear_directory(struct dentry *dir)
static void cgroup_d_remove_dir(struct dentry *dentry)
{
struct dentry *parent;
+ struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
- cgroup_clear_directory(dentry);
+ cgroup_clear_directory(dentry, true, root->subsys_mask);
parent = dentry->d_parent;
spin_lock(&parent->d_lock);
@@ -1022,22 +1047,22 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
* returns an error, no reference counts are touched.
*/
static int rebind_subsystems(struct cgroupfs_root *root,
- unsigned long final_bits)
+ unsigned long final_subsys_mask)
{
- unsigned long added_bits, removed_bits;
+ unsigned long added_mask, removed_mask;
struct cgroup *cgrp = &root->top_cgroup;
int i;
BUG_ON(!mutex_is_locked(&cgroup_mutex));
BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
- removed_bits = root->actual_subsys_bits & ~final_bits;
- added_bits = final_bits & ~root->actual_subsys_bits;
+ removed_mask = root->actual_subsys_mask & ~final_subsys_mask;
+ added_mask = final_subsys_mask & ~root->actual_subsys_mask;
/* Check that any added subsystems are currently free */
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
unsigned long bit = 1UL << i;
struct cgroup_subsys *ss = subsys[i];
- if (!(bit & added_bits))
+ if (!(bit & added_mask))
continue;
/*
* Nobody should tell us to do a subsys that doesn't exist:
@@ -1062,7 +1087,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
unsigned long bit = 1UL << i;
- if (bit & added_bits) {
+ if (bit & added_mask) {
/* We're binding this subsystem to this hierarchy */
BUG_ON(ss == NULL);
BUG_ON(cgrp->subsys[i]);
@@ -1075,7 +1100,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
if (ss->bind)
ss->bind(cgrp);
/* refcount was already taken, and we're keeping it */
- } else if (bit & removed_bits) {
+ } else if (bit & removed_mask) {
/* We're removing this subsystem */
BUG_ON(ss == NULL);
BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
@@ -1088,7 +1113,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
list_move(&ss->sibling, &rootnode.subsys_list);
/* subsystem is now free - drop reference on module */
module_put(ss->module);
- } else if (bit & final_bits) {
+ } else if (bit & final_subsys_mask) {
/* Subsystem state should already exist */
BUG_ON(ss == NULL);
BUG_ON(!cgrp->subsys[i]);
@@ -1105,7 +1130,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
BUG_ON(cgrp->subsys[i]);
}
}
- root->subsys_bits = root->actual_subsys_bits = final_bits;
+ root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;
synchronize_rcu();
return 0;
@@ -1121,6 +1146,8 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
seq_printf(seq, ",%s", ss->name);
if (test_bit(ROOT_NOPREFIX, &root->flags))
seq_puts(seq, ",noprefix");
+ if (test_bit(ROOT_XATTR, &root->flags))
+ seq_puts(seq, ",xattr");
if (strlen(root->release_agent_path))
seq_printf(seq, ",release_agent=%s", root->release_agent_path);
if (clone_children(&root->top_cgroup))
@@ -1132,7 +1159,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
}
struct cgroup_sb_opts {
- unsigned long subsys_bits;
+ unsigned long subsys_mask;
unsigned long flags;
char *release_agent;
bool clone_children;
@@ -1189,6 +1216,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
opts->clone_children = true;
continue;
}
+ if (!strcmp(token, "xattr")) {
+ set_bit(ROOT_XATTR, &opts->flags);
+ continue;
+ }
if (!strncmp(token, "release_agent=", 14)) {
/* Specifying two release agents is forbidden */
if (opts->release_agent)
@@ -1237,7 +1268,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
/* Mutually exclusive option 'all' + subsystem name */
if (all_ss)
return -EINVAL;
- set_bit(i, &opts->subsys_bits);
+ set_bit(i, &opts->subsys_mask);
one_ss = true;
break;
@@ -1258,7 +1289,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
continue;
if (ss->disabled)
continue;
- set_bit(i, &opts->subsys_bits);
+ set_bit(i, &opts->subsys_mask);
}
}
@@ -1270,19 +1301,19 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
* the cpuset subsystem.
*/
if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
- (opts->subsys_bits & mask))
+ (opts->subsys_mask & mask))
return -EINVAL;
/* Can't specify "none" and some subsystems */
- if (opts->subsys_bits && opts->none)
+ if (opts->subsys_mask && opts->none)
return -EINVAL;
/*
* We either have to specify by name or by subsystems. (So all
* empty hierarchies must have a name).
*/
- if (!opts->subsys_bits && !opts->name)
+ if (!opts->subsys_mask && !opts->name)
return -EINVAL;
/*
@@ -1291,10 +1322,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
* take duplicate reference counts on a subsystem that's already used,
* but rebind_subsystems handles this case.
*/
- for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
unsigned long bit = 1UL << i;
- if (!(bit & opts->subsys_bits))
+ if (!(bit & opts->subsys_mask))
continue;
if (!try_module_get(subsys[i]->module)) {
module_pin_failed = true;
@@ -1307,11 +1338,11 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
* raced with a module_delete call, and to the user this is
* essentially a "subsystem doesn't exist" case.
*/
- for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) {
+ for (i--; i >= 0; i--) {
/* drop refcounts only on the ones we took */
unsigned long bit = 1UL << i;
- if (!(bit & opts->subsys_bits))
+ if (!(bit & opts->subsys_mask))
continue;
module_put(subsys[i]->module);
}
@@ -1321,13 +1352,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
return 0;
}
-static void drop_parsed_module_refcounts(unsigned long subsys_bits)
+static void drop_parsed_module_refcounts(unsigned long subsys_mask)
{
int i;
- for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
unsigned long bit = 1UL << i;
- if (!(bit & subsys_bits))
+ if (!(bit & subsys_mask))
continue;
module_put(subsys[i]->module);
}
@@ -1339,6 +1370,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
struct cgroupfs_root *root = sb->s_fs_info;
struct cgroup *cgrp = &root->top_cgroup;
struct cgroup_sb_opts opts;
+ unsigned long added_mask, removed_mask;
mutex_lock(&cgrp->dentry->d_inode->i_mutex);
mutex_lock(&cgroup_mutex);
@@ -1350,27 +1382,31 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
goto out_unlock;
/* See feature-removal-schedule.txt */
- if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent)
+ if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent)
pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
task_tgid_nr(current), current->comm);
+ added_mask = opts.subsys_mask & ~root->subsys_mask;
+ removed_mask = root->subsys_mask & ~opts.subsys_mask;
+
/* Don't allow flags or name to change at remount */
if (opts.flags != root->flags ||
(opts.name && strcmp(opts.name, root->name))) {
ret = -EINVAL;
- drop_parsed_module_refcounts(opts.subsys_bits);
+ drop_parsed_module_refcounts(opts.subsys_mask);
goto out_unlock;
}
- ret = rebind_subsystems(root, opts.subsys_bits);
+ ret = rebind_subsystems(root, opts.subsys_mask);
if (ret) {
- drop_parsed_module_refcounts(opts.subsys_bits);
+ drop_parsed_module_refcounts(opts.subsys_mask);
goto out_unlock;
}
/* clear out any existing files and repopulate subsystem files */
- cgroup_clear_directory(cgrp->dentry);
- cgroup_populate_dir(cgrp);
+ cgroup_clear_directory(cgrp->dentry, false, removed_mask);
+ /* re-populate subsystem files */
+ cgroup_populate_dir(cgrp, false, added_mask);
if (opts.release_agent)
strcpy(root->release_agent_path, opts.release_agent);
@@ -1401,6 +1437,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
mutex_init(&cgrp->pidlist_mutex);
INIT_LIST_HEAD(&cgrp->event_list);
spin_lock_init(&cgrp->event_list_lock);
+ simple_xattrs_init(&cgrp->xattrs);
}
static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1455,8 +1492,8 @@ static int cgroup_test_super(struct super_block *sb, void *data)
* If we asked for subsystems (or explicitly for no
* subsystems) then they must match
*/
- if ((opts->subsys_bits || opts->none)
- && (opts->subsys_bits != root->subsys_bits))
+ if ((opts->subsys_mask || opts->none)
+ && (opts->subsys_mask != root->subsys_mask))
return 0;
return 1;
@@ -1466,7 +1503,7 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
{
struct cgroupfs_root *root;
- if (!opts->subsys_bits && !opts->none)
+ if (!opts->subsys_mask && !opts->none)
return NULL;
root = kzalloc(sizeof(*root), GFP_KERNEL);
@@ -1479,7 +1516,7 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
}
init_cgroup_root(root);
- root->subsys_bits = opts->subsys_bits;
+ root->subsys_mask = opts->subsys_mask;
root->flags = opts->flags;
if (opts->release_agent)
strcpy(root->release_agent_path, opts->release_agent);
@@ -1511,7 +1548,7 @@ static int cgroup_set_super(struct super_block *sb, void *data)
if (!opts->new_root)
return -EINVAL;
- BUG_ON(!opts->subsys_bits && !opts->none);
+ BUG_ON(!opts->subsys_mask && !opts->none);
ret = set_anon_super(sb, NULL);
if (ret)
@@ -1629,7 +1666,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
if (ret)
goto unlock_drop;
- ret = rebind_subsystems(root, root->subsys_bits);
+ ret = rebind_subsystems(root, root->subsys_mask);
if (ret == -EBUSY) {
free_cg_links(&tmp_cg_links);
goto unlock_drop;
@@ -1669,7 +1706,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
BUG_ON(root->number_of_cgroups != 1);
cred = override_creds(&init_cred);
- cgroup_populate_dir(root_cgrp);
+ cgroup_populate_dir(root_cgrp, true, root->subsys_mask);
revert_creds(cred);
mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
@@ -1681,7 +1718,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
*/
cgroup_drop_root(opts.new_root);
/* no subsys rebinding, so refcounts don't change */
- drop_parsed_module_refcounts(opts.subsys_bits);
+ drop_parsed_module_refcounts(opts.subsys_mask);
}
kfree(opts.release_agent);
@@ -1695,7 +1732,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
drop_new_super:
deactivate_locked_super(sb);
drop_modules:
- drop_parsed_module_refcounts(opts.subsys_bits);
+ drop_parsed_module_refcounts(opts.subsys_mask);
out_err:
kfree(opts.release_agent);
kfree(opts.name);
@@ -1745,6 +1782,8 @@ static void cgroup_kill_sb(struct super_block *sb) {
mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
+ simple_xattrs_free(&cgrp->xattrs);
+
kill_litter_super(sb);
cgroup_drop_root(root);
}
@@ -2551,6 +2590,64 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
}
+static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
+{
+ if (S_ISDIR(dentry->d_inode->i_mode))
+ return &__d_cgrp(dentry)->xattrs;
+ else
+ return &__d_cft(dentry)->xattrs;
+}
+
+static inline int xattr_enabled(struct dentry *dentry)
+{
+ struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
+ return test_bit(ROOT_XATTR, &root->flags);
+}
+
+static bool is_valid_xattr(const char *name)
+{
+ if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
+ !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
+ return true;
+ return false;
+}
+
+static int cgroup_setxattr(struct dentry *dentry, const char *name,
+ const void *val, size_t size, int flags)
+{
+ if (!xattr_enabled(dentry))
+ return -EOPNOTSUPP;
+ if (!is_valid_xattr(name))
+ return -EINVAL;
+ return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags);
+}
+
+static int cgroup_removexattr(struct dentry *dentry, const char *name)
+{
+ if (!xattr_enabled(dentry))
+ return -EOPNOTSUPP;
+ if (!is_valid_xattr(name))
+ return -EINVAL;
+ return simple_xattr_remove(__d_xattrs(dentry), name);
+}
+
+static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name,
+ void *buf, size_t size)
+{
+ if (!xattr_enabled(dentry))
+ return -EOPNOTSUPP;
+ if (!is_valid_xattr(name))
+ return -EINVAL;
+ return simple_xattr_get(__d_xattrs(dentry), name, buf, size);
+}
+
+static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
+{
+ if (!xattr_enabled(dentry))
+ return -EOPNOTSUPP;
+ return simple_xattr_list(__d_xattrs(dentry), buf, size);
+}
+
static const struct file_operations cgroup_file_operations = {
.read = cgroup_file_read,
.write = cgroup_file_write,
@@ -2559,11 +2656,22 @@ static const struct file_operations cgroup_file_operations = {
.release = cgroup_file_release,
};
+static const struct inode_operations cgroup_file_inode_operations = {
+ .setxattr = cgroup_setxattr,
+ .getxattr = cgroup_getxattr,
+ .listxattr = cgroup_listxattr,
+ .removexattr = cgroup_removexattr,
+};
+
static const struct inode_operations cgroup_dir_inode_operations = {
.lookup = cgroup_lookup,
.mkdir = cgroup_mkdir,
.rmdir = cgroup_rmdir,
.rename = cgroup_rename,
+ .setxattr = cgroup_setxattr,
+ .getxattr = cgroup_getxattr,
+ .listxattr = cgroup_listxattr,
+ .removexattr = cgroup_removexattr,
};
static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
@@ -2611,6 +2719,7 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
} else if (S_ISREG(mode)) {
inode->i_size = 0;
inode->i_fop = &cgroup_file_operations;
+ inode->i_op = &cgroup_file_inode_operations;
}
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
@@ -2671,7 +2780,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
}
static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
- const struct cftype *cft)
+ struct cftype *cft)
{
struct dentry *dir = cgrp->dentry;
struct cgroup *parent = __d_cgrp(dir);
@@ -2681,6 +2790,8 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
umode_t mode;
char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
+ simple_xattrs_init(&cft->xattrs);
+
/* does @cft->flags tell us to skip creation on @cgrp? */
if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
return 0;
@@ -2721,9 +2832,9 @@ out:
}
static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
- const struct cftype cfts[], bool is_add)
+ struct cftype cfts[], bool is_add)
{
- const struct cftype *cft;
+ struct cftype *cft;
int err, ret = 0;
for (cft = cfts; cft->name[0] != '\0'; cft++) {
@@ -2757,7 +2868,7 @@ static void cgroup_cfts_prepare(void)
}
static void cgroup_cfts_commit(struct cgroup_subsys *ss,
- const struct cftype *cfts, bool is_add)
+ struct cftype *cfts, bool is_add)
__releases(&cgroup_mutex) __releases(&cgroup_cft_mutex)
{
LIST_HEAD(pending);
@@ -2808,7 +2919,7 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
* function currently returns 0 as long as @cfts registration is successful
* even if some file creation attempts on existing cgroups fail.
*/
-int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
+int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
struct cftype_set *set;
@@ -2838,7 +2949,7 @@ EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
* Returns 0 on successful unregistration, -ENOENT if @cfts is not
* registered with @ss.
*/
-int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
+int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
struct cftype_set *set;
@@ -3843,18 +3954,29 @@ static struct cftype files[] = {
{ } /* terminate */
};
-static int cgroup_populate_dir(struct cgroup *cgrp)
+/**
+ * cgroup_populate_dir - selectively creation of files in a directory
+ * @cgrp: target cgroup
+ * @base_files: true if the base files should be added
+ * @subsys_mask: mask of the subsystem ids whose files should be added
+ */
+static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
+ unsigned long subsys_mask)
{
int err;
struct cgroup_subsys *ss;
- err = cgroup_addrm_files(cgrp, NULL, files, true);
- if (err < 0)
- return err;
+ if (base_files) {
+ err = cgroup_addrm_files(cgrp, NULL, files, true);
+ if (err < 0)
+ return err;
+ }
/* process cftsets of each subsystem */
for_each_subsys(cgrp->root, ss) {
struct cftype_set *set;
+ if (!test_bit(ss->subsys_id, &subsys_mask))
+ continue;
list_for_each_entry(set, &ss->cftsets, node)
cgroup_addrm_files(cgrp, ss, set->cfts, true);
@@ -3954,8 +4076,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
for_each_subsys(root, ss) {
- struct cgroup_subsys_state *css = ss->create(cgrp);
+ struct cgroup_subsys_state *css;
+ css = ss->create(cgrp);
if (IS_ERR(css)) {
err = PTR_ERR(css);
goto err_destroy;
@@ -3969,6 +4092,15 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
/* At error, ->destroy() callback has to free assigned ID. */
if (clone_children(parent) && ss->post_clone)
ss->post_clone(cgrp);
+
+ if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
+ parent->parent) {
+ pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
+ current->comm, current->pid, ss->name);
+ if (!strcmp(ss->name, "memory"))
+ pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
+ ss->warned_broken_hierarchy = true;
+ }
}
list_add(&cgrp->sibling, &cgrp->parent->children);
@@ -3988,7 +4120,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
list_add_tail(&cgrp->allcg_node, &root->allcg_list);
- err = cgroup_populate_dir(cgrp);
+ err = cgroup_populate_dir(cgrp, true, root->subsys_mask);
/* If err < 0, we have a half-filled directory - oh well ;) */
mutex_unlock(&cgroup_mutex);
@@ -4321,8 +4453,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
* since cgroup_init_subsys will have already taken care of it.
*/
if (ss->module == NULL) {
- /* a few sanity checks */
- BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
+ /* a sanity check */
BUG_ON(subsys[ss->subsys_id] != ss);
return 0;
}
@@ -4330,24 +4461,8 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
/* init base cftset */
cgroup_init_cftsets(ss);
- /*
- * need to register a subsys id before anything else - for example,
- * init_cgroup_css needs it.
- */
mutex_lock(&cgroup_mutex);
- /* find the first empty slot in the array */
- for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
- if (subsys[i] == NULL)
- break;
- }
- if (i == CGROUP_SUBSYS_COUNT) {
- /* maximum number of subsystems already registered! */
- mutex_unlock(&cgroup_mutex);
- return -EBUSY;
- }
- /* assign ourselves the subsys_id */
- ss->subsys_id = i;
- subsys[i] = ss;
+ subsys[ss->subsys_id] = ss;
/*
* no ss->create seems to need anything important in the ss struct, so
@@ -4356,7 +4471,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
css = ss->create(dummytop);
if (IS_ERR(css)) {
/* failure case - need to deassign the subsys[] slot. */
- subsys[i] = NULL;
+ subsys[ss->subsys_id] = NULL;
mutex_unlock(&cgroup_mutex);
return PTR_ERR(css);
}
@@ -4372,7 +4487,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
if (ret) {
dummytop->subsys[ss->subsys_id] = NULL;
ss->destroy(dummytop);
- subsys[i] = NULL;
+ subsys[ss->subsys_id] = NULL;
mutex_unlock(&cgroup_mutex);
return ret;
}
@@ -4439,7 +4554,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
mutex_lock(&cgroup_mutex);
/* deassign the subsys_id */
- BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
subsys[ss->subsys_id] = NULL;
/* remove subsystem from rootnode's list of subsystems */
@@ -4502,10 +4616,13 @@ int __init cgroup_init_early(void)
for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
INIT_HLIST_HEAD(&css_set_table[i]);
- /* at bootup time, we don't worry about modular subsystems */
- for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
+ /* at bootup time, we don't worry about modular subsystems */
+ if (!ss || ss->module)
+ continue;
+
BUG_ON(!ss->name);
BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
BUG_ON(!ss->create);
@@ -4538,9 +4655,12 @@ int __init cgroup_init(void)
if (err)
return err;
- /* at bootup time, we don't worry about modular subsystems */
- for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
+
+ /* at bootup time, we don't worry about modular subsystems */
+ if (!ss || ss->module)
+ continue;
if (!ss->early_init)
cgroup_init_subsys(ss);
if (ss->use_id)
@@ -4735,13 +4855,16 @@ void cgroup_fork_callbacks(struct task_struct *child)
{
if (need_forkexit_callback) {
int i;
- /*
- * forkexit callbacks are only supported for builtin
- * subsystems, and the builtin section of the subsys array is
- * immutable, so we don't need to lock the subsys array here.
- */
- for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
+
+ /*
+ * forkexit callbacks are only supported for
+ * builtin subsystems.
+ */
+ if (!ss || ss->module)
+ continue;
+
if (ss->fork)
ss->fork(child);
}
@@ -4846,12 +4969,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
tsk->cgroups = &init_css_set;
if (run_callbacks && need_forkexit_callback) {
- /*
- * modular subsystems can't use callbacks, so no need to lock
- * the subsys array
- */
- for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
+
+ /* modular subsystems can't use callbacks */
+ if (!ss || ss->module)
+ continue;
+
if (ss->exit) {
struct cgroup *old_cgrp =
rcu_dereference_raw(cg->subsys[i])->cgroup;
@@ -5037,13 +5161,17 @@ static int __init cgroup_disable(char *str)
while ((token = strsep(&str, ",")) != NULL) {
if (!*token)
continue;
- /*
- * cgroup_disable, being at boot time, can't know about module
- * subsystems, so we don't worry about them.
- */
- for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
+ /*
+ * cgroup_disable, being at boot time, can't
+ * know about module subsystems, so we don't
+ * worry about them.
+ */
+ if (!ss || ss->module)
+ continue;
+
if (!strcmp(token, ss->name)) {
ss->disabled = 1;
printk(KERN_INFO "Disabling %s control group"
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 3649fc6b3eaa..b1724ce98981 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -373,4 +373,12 @@ struct cgroup_subsys freezer_subsys = {
.can_attach = freezer_can_attach,
.fork = freezer_fork,
.base_cftypes = files,
+
+ /*
+ * freezer subsys doesn't handle hierarchy at all. Frozen state
+ * should be inherited through the hierarchy - if a parent is
+ * frozen, all its children should be frozen. Fix it and remove
+ * the following.
+ */
+ .broken_hierarchy = true,
};
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f560598807c1..42bd331ee0ab 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -80,6 +80,10 @@ void put_online_cpus(void)
if (cpu_hotplug.active_writer == current)
return;
mutex_lock(&cpu_hotplug.lock);
+
+ if (WARN_ON(!cpu_hotplug.refcount))
+ cpu_hotplug.refcount++; /* try to fix things up */
+
if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
wake_up_process(cpu_hotplug.active_writer);
mutex_unlock(&cpu_hotplug.lock);
diff --git a/kernel/cred.c b/kernel/cred.c
index de728ac50d82..48cea3da6d05 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -799,9 +799,15 @@ static void dump_invalid_creds(const struct cred *cred, const char *label,
atomic_read(&cred->usage),
read_cred_subscribers(cred));
printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n",
- cred->uid, cred->euid, cred->suid, cred->fsuid);
+ from_kuid_munged(&init_user_ns, cred->uid),
+ from_kuid_munged(&init_user_ns, cred->euid),
+ from_kuid_munged(&init_user_ns, cred->suid),
+ from_kuid_munged(&init_user_ns, cred->fsuid));
printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n",
- cred->gid, cred->egid, cred->sgid, cred->fsgid);
+ from_kgid_munged(&init_user_ns, cred->gid),
+ from_kgid_munged(&init_user_ns, cred->egid),
+ from_kgid_munged(&init_user_ns, cred->sgid),
+ from_kgid_munged(&init_user_ns, cred->fsgid));
#ifdef CONFIG_SECURITY
printk(KERN_ERR "CRED: ->security is %p\n", cred->security);
if ((unsigned long) cred->security >= PAGE_SIZE &&
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0557f24c6bca..9a61738cefc8 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -672,6 +672,10 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
{
struct kgdb_state kgdb_var;
struct kgdb_state *ks = &kgdb_var;
+ int ret = 0;
+
+ if (arch_kgdb_ops.enable_nmi)
+ arch_kgdb_ops.enable_nmi(0);
ks->cpu = raw_smp_processor_id();
ks->ex_vector = evector;
@@ -681,13 +685,33 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
ks->linux_regs = regs;
if (kgdb_reenter_check(ks))
- return 0; /* Ouch, double exception ! */
+ goto out; /* Ouch, double exception ! */
if (kgdb_info[ks->cpu].enter_kgdb != 0)
- return 0;
+ goto out;
- return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
+ ret = kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
+out:
+ if (arch_kgdb_ops.enable_nmi)
+ arch_kgdb_ops.enable_nmi(1);
+ return ret;
}
+/*
+ * GDB places a breakpoint at this function to know dynamically
+ * loaded objects. It's not defined static so that only one instance with this
+ * name exists in the kernel.
+ */
+
+static int module_event(struct notifier_block *self, unsigned long val,
+ void *data)
+{
+ return 0;
+}
+
+static struct notifier_block dbg_module_load_nb = {
+ .notifier_call = module_event,
+};
+
int kgdb_nmicallback(int cpu, void *regs)
{
#ifdef CONFIG_SMP
@@ -816,6 +840,7 @@ static void kgdb_register_callbacks(void)
kgdb_arch_init();
if (!dbg_is_early)
kgdb_arch_late();
+ register_module_notifier(&dbg_module_load_nb);
register_reboot_notifier(&dbg_reboot_notifier);
atomic_notifier_chain_register(&panic_notifier_list,
&kgdb_panic_event_nb);
@@ -839,6 +864,7 @@ static void kgdb_unregister_callbacks(void)
if (kgdb_io_module_registered) {
kgdb_io_module_registered = 0;
unregister_reboot_notifier(&dbg_reboot_notifier);
+ unregister_module_notifier(&dbg_module_load_nb);
atomic_notifier_chain_unregister(&panic_notifier_list,
&kgdb_panic_event_nb);
kgdb_arch_exit();
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 07c9bbb94a0b..b03e0e814e43 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -129,6 +129,8 @@ kdb_bt(int argc, const char **argv)
}
/* Now the inactive tasks */
kdb_do_each_thread(g, p) {
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
if (task_curr(p))
continue;
if (kdb_bt1(p, mask, argcount, btaprompt))
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 0a69d2adc4f3..14ff4849262c 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -552,6 +552,7 @@ int vkdb_printf(const char *fmt, va_list ap)
{
int diag;
int linecount;
+ int colcount;
int logging, saved_loglevel = 0;
int saved_trap_printk;
int got_printf_lock = 0;
@@ -584,6 +585,10 @@ int vkdb_printf(const char *fmt, va_list ap)
if (diag || linecount <= 1)
linecount = 24;
+ diag = kdbgetintenv("COLUMNS", &colcount);
+ if (diag || colcount <= 1)
+ colcount = 80;
+
diag = kdbgetintenv("LOGGING", &logging);
if (diag)
logging = 0;
@@ -690,7 +695,7 @@ kdb_printit:
gdbstub_msg_write(kdb_buffer, retlen);
} else {
if (dbg_io_ops && !dbg_io_ops->is_console) {
- len = strlen(kdb_buffer);
+ len = retlen;
cp = kdb_buffer;
while (len--) {
dbg_io_ops->write_char(*cp);
@@ -709,11 +714,29 @@ kdb_printit:
printk(KERN_INFO "%s", kdb_buffer);
}
- if (KDB_STATE(PAGER) && strchr(kdb_buffer, '\n'))
- kdb_nextline++;
+ if (KDB_STATE(PAGER)) {
+ /*
+ * Check printed string to decide how to bump the
+ * kdb_nextline to control when the more prompt should
+ * show up.
+ */
+ int got = 0;
+ len = retlen;
+ while (len--) {
+ if (kdb_buffer[len] == '\n') {
+ kdb_nextline++;
+ got = 0;
+ } else if (kdb_buffer[len] == '\r') {
+ got = 0;
+ } else {
+ got++;
+ }
+ }
+ kdb_nextline += got / (colcount + 1);
+ }
/* check for having reached the LINES number of printed lines */
- if (kdb_nextline == linecount) {
+ if (kdb_nextline >= linecount) {
char buf1[16] = "";
/* Watch out for recursion here. Any routine that calls
@@ -765,7 +788,7 @@ kdb_printit:
kdb_grepping_flag = 0;
kdb_printf("\n");
} else if (buf1[0] == ' ') {
- kdb_printf("\n");
+ kdb_printf("\r");
suspend_grep = 1; /* for this recursion */
} else if (buf1[0] == '\n') {
kdb_nextline = linecount - 1;
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 31df1706b9a9..4d5f8d5612f3 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -21,6 +21,7 @@
#include <linux/smp.h>
#include <linux/utsname.h>
#include <linux/vmalloc.h>
+#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/init.h>
@@ -2100,6 +2101,8 @@ static int kdb_dmesg(int argc, const char **argv)
}
if (!lines--)
break;
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
kdb_printf("%.*s\n", (int)len - 1, buf);
}
@@ -2107,6 +2110,32 @@ static int kdb_dmesg(int argc, const char **argv)
return 0;
}
#endif /* CONFIG_PRINTK */
+
+/* Make sure we balance enable/disable calls, must disable first. */
+static atomic_t kdb_nmi_disabled;
+
+static int kdb_disable_nmi(int argc, const char *argv[])
+{
+ if (atomic_read(&kdb_nmi_disabled))
+ return 0;
+ atomic_set(&kdb_nmi_disabled, 1);
+ arch_kgdb_ops.enable_nmi(0);
+ return 0;
+}
+
+static int kdb_param_enable_nmi(const char *val, const struct kernel_param *kp)
+{
+ if (!atomic_add_unless(&kdb_nmi_disabled, -1, 0))
+ return -EINVAL;
+ arch_kgdb_ops.enable_nmi(1);
+ return 0;
+}
+
+static const struct kernel_param_ops kdb_param_ops_enable_nmi = {
+ .set = kdb_param_enable_nmi,
+};
+module_param_cb(enable_nmi, &kdb_param_ops_enable_nmi, NULL, 0600);
+
/*
* kdb_cpu - This function implements the 'cpu' command.
* cpu [<cpunum>]
@@ -2851,6 +2880,10 @@ static void __init kdb_inittab(void)
kdb_register_repeat("dmesg", kdb_dmesg, "[lines]",
"Display syslog buffer", 0, KDB_REPEAT_NONE);
#endif
+ if (arch_kgdb_ops.enable_nmi) {
+ kdb_register_repeat("disable_nmi", kdb_disable_nmi, "",
+ "Disable NMI entry to KDB", 0, KDB_REPEAT_NONE);
+ }
kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
"Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE);
kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>",
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fd15593c7f54..dbccf83c134d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -471,14 +471,13 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
{
struct perf_cgroup *cgrp;
struct cgroup_subsys_state *css;
- struct file *file;
- int ret = 0, fput_needed;
+ struct fd f = fdget(fd);
+ int ret = 0;
- file = fget_light(fd, &fput_needed);
- if (!file)
+ if (!f.file)
return -EBADF;
- css = cgroup_css_from_dir(file, perf_subsys_id);
+ css = cgroup_css_from_dir(f.file, perf_subsys_id);
if (IS_ERR(css)) {
ret = PTR_ERR(css);
goto out;
@@ -504,7 +503,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
ret = -EINVAL;
}
out:
- fput_light(file, fput_needed);
+ fdput(f);
return ret;
}
@@ -3237,21 +3236,18 @@ unlock:
static const struct file_operations perf_fops;
-static struct file *perf_fget_light(int fd, int *fput_needed)
+static inline int perf_fget_light(int fd, struct fd *p)
{
- struct file *file;
-
- file = fget_light(fd, fput_needed);
- if (!file)
- return ERR_PTR(-EBADF);
+ struct fd f = fdget(fd);
+ if (!f.file)
+ return -EBADF;
- if (file->f_op != &perf_fops) {
- fput_light(file, *fput_needed);
- *fput_needed = 0;
- return ERR_PTR(-EBADF);
+ if (f.file->f_op != &perf_fops) {
+ fdput(f);
+ return -EBADF;
}
-
- return file;
+ *p = f;
+ return 0;
}
static int perf_event_set_output(struct perf_event *event,
@@ -3283,22 +3279,19 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case PERF_EVENT_IOC_SET_OUTPUT:
{
- struct file *output_file = NULL;
- struct perf_event *output_event = NULL;
- int fput_needed = 0;
int ret;
-
if (arg != -1) {
- output_file = perf_fget_light(arg, &fput_needed);
- if (IS_ERR(output_file))
- return PTR_ERR(output_file);
- output_event = output_file->private_data;
+ struct perf_event *output_event;
+ struct fd output;
+ ret = perf_fget_light(arg, &output);
+ if (ret)
+ return ret;
+ output_event = output.file->private_data;
+ ret = perf_event_set_output(event, output_event);
+ fdput(output);
+ } else {
+ ret = perf_event_set_output(event, NULL);
}
-
- ret = perf_event_set_output(event, output_event);
- if (output_event)
- fput_light(output_file, fput_needed);
-
return ret;
}
@@ -3681,7 +3674,7 @@ unlock:
atomic_inc(&event->mmap_count);
mutex_unlock(&event->mmap_mutex);
- vma->vm_flags |= VM_RESERVED;
+ vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
vma->vm_ops = &perf_mmap_vmops;
return ret;
@@ -6446,12 +6439,11 @@ SYSCALL_DEFINE5(perf_event_open,
struct perf_event_attr attr;
struct perf_event_context *ctx;
struct file *event_file = NULL;
- struct file *group_file = NULL;
+ struct fd group = {NULL, 0};
struct task_struct *task = NULL;
struct pmu *pmu;
int event_fd;
int move_group = 0;
- int fput_needed = 0;
int err;
/* for future expandability... */
@@ -6481,17 +6473,15 @@ SYSCALL_DEFINE5(perf_event_open,
if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
return -EINVAL;
- event_fd = get_unused_fd_flags(O_RDWR);
+ event_fd = get_unused_fd();
if (event_fd < 0)
return event_fd;
if (group_fd != -1) {
- group_file = perf_fget_light(group_fd, &fput_needed);
- if (IS_ERR(group_file)) {
- err = PTR_ERR(group_file);
+ err = perf_fget_light(group_fd, &group);
+ if (err)
goto err_fd;
- }
- group_leader = group_file->private_data;
+ group_leader = group.file->private_data;
if (flags & PERF_FLAG_FD_OUTPUT)
output_event = group_leader;
if (flags & PERF_FLAG_FD_NO_GROUP)
@@ -6667,7 +6657,7 @@ SYSCALL_DEFINE5(perf_event_open,
* of the group leader will find the pointer to itself in
* perf_group_detach().
*/
- fput_light(group_file, fput_needed);
+ fdput(group);
fd_install(event_fd, event_file);
return event_fd;
@@ -6681,7 +6671,7 @@ err_task:
if (task)
put_task_struct(task);
err_group_fd:
- fput_light(group_file, fput_needed);
+ fdput(group);
err_fd:
put_unused_fd(event_fd);
return err;
@@ -7506,5 +7496,12 @@ struct cgroup_subsys perf_subsys = {
.destroy = perf_cgroup_destroy,
.exit = perf_cgroup_exit,
.attach = perf_cgroup_attach,
+
+ /*
+ * perf_event cgroup doesn't handle nesting correctly.
+ * ctx->nr_cgroups adjustments should be propagated through the
+ * cgroup hierarchy. Fix it and remove the following.
+ */
+ .broken_hierarchy = true,
};
#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 912ef48d28ab..5cc4e7e42e68 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -78,15 +78,23 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
*/
static atomic_t uprobe_events = ATOMIC_INIT(0);
+/* Have a copy of original instruction */
+#define UPROBE_COPY_INSN 0
+/* Dont run handlers when first register/ last unregister in progress*/
+#define UPROBE_RUN_HANDLER 1
+/* Can skip singlestep */
+#define UPROBE_SKIP_SSTEP 2
+
struct uprobe {
struct rb_node rb_node; /* node in the rb tree */
atomic_t ref;
struct rw_semaphore consumer_rwsem;
+ struct mutex copy_mutex; /* TODO: kill me and UPROBE_COPY_INSN */
struct list_head pending_list;
struct uprobe_consumer *consumers;
struct inode *inode; /* Also hold a ref to inode */
loff_t offset;
- int flags;
+ unsigned long flags;
struct arch_uprobe arch;
};
@@ -100,17 +108,12 @@ struct uprobe {
*/
static bool valid_vma(struct vm_area_struct *vma, bool is_register)
{
- if (!vma->vm_file)
- return false;
-
- if (!is_register)
- return true;
+ vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_SHARED;
- if ((vma->vm_flags & (VM_HUGETLB|VM_READ|VM_WRITE|VM_EXEC|VM_SHARED))
- == (VM_READ|VM_EXEC))
- return true;
+ if (is_register)
+ flags |= VM_WRITE;
- return false;
+ return vma->vm_file && (vma->vm_flags & flags) == VM_MAYEXEC;
}
static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
@@ -141,10 +144,14 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
spinlock_t *ptl;
pte_t *ptep;
int err;
+ /* For mmu_notifiers */
+ const unsigned long mmun_start = addr;
+ const unsigned long mmun_end = addr + PAGE_SIZE;
/* For try_to_free_swap() and munlock_vma_page() below */
lock_page(page);
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
err = -EAGAIN;
ptep = page_check_address(page, mm, addr, &ptl, 0);
if (!ptep)
@@ -173,6 +180,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
err = 0;
unlock:
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
unlock_page(page);
return err;
}
@@ -188,19 +196,44 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn)
return *insn == UPROBE_SWBP_INSN;
}
+static void copy_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode)
+{
+ void *kaddr = kmap_atomic(page);
+ memcpy(opcode, kaddr + (vaddr & ~PAGE_MASK), UPROBE_SWBP_INSN_SIZE);
+ kunmap_atomic(kaddr);
+}
+
+static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
+{
+ uprobe_opcode_t old_opcode;
+ bool is_swbp;
+
+ copy_opcode(page, vaddr, &old_opcode);
+ is_swbp = is_swbp_insn(&old_opcode);
+
+ if (is_swbp_insn(new_opcode)) {
+ if (is_swbp) /* register: already installed? */
+ return 0;
+ } else {
+ if (!is_swbp) /* unregister: was it changed by us? */
+ return 0;
+ }
+
+ return 1;
+}
+
/*
* NOTE:
* Expect the breakpoint instruction to be the smallest size instruction for
* the architecture. If an arch has variable length instruction and the
* breakpoint instruction is not of the smallest length instruction
- * supported by that architecture then we need to modify read_opcode /
+ * supported by that architecture then we need to modify is_swbp_at_addr and
* write_opcode accordingly. This would never be a problem for archs that
* have fixed length instructions.
*/
/*
* write_opcode - write the opcode at a given virtual address.
- * @auprobe: arch breakpointing information.
* @mm: the probed process address space.
* @vaddr: the virtual address to store the opcode.
* @opcode: opcode to be written at @vaddr.
@@ -211,8 +244,8 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn)
* For mm @mm, write the opcode at @vaddr.
* Return 0 (success) or a negative errno.
*/
-static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
- unsigned long vaddr, uprobe_opcode_t opcode)
+static int write_opcode(struct mm_struct *mm, unsigned long vaddr,
+ uprobe_opcode_t opcode)
{
struct page *old_page, *new_page;
void *vaddr_old, *vaddr_new;
@@ -221,10 +254,14 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
retry:
/* Read the page with vaddr into memory */
- ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma);
+ ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma);
if (ret <= 0)
return ret;
+ ret = verify_opcode(old_page, vaddr, &opcode);
+ if (ret <= 0)
+ goto put_old;
+
ret = -ENOMEM;
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
if (!new_page)
@@ -259,63 +296,6 @@ put_old:
}
/**
- * read_opcode - read the opcode at a given virtual address.
- * @mm: the probed process address space.
- * @vaddr: the virtual address to read the opcode.
- * @opcode: location to store the read opcode.
- *
- * Called with mm->mmap_sem held (for read and with a reference to
- * mm.
- *
- * For mm @mm, read the opcode at @vaddr and store it in @opcode.
- * Return 0 (success) or a negative errno.
- */
-static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t *opcode)
-{
- struct page *page;
- void *vaddr_new;
- int ret;
-
- ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
- if (ret <= 0)
- return ret;
-
- vaddr_new = kmap_atomic(page);
- vaddr &= ~PAGE_MASK;
- memcpy(opcode, vaddr_new + vaddr, UPROBE_SWBP_INSN_SIZE);
- kunmap_atomic(vaddr_new);
-
- put_page(page);
-
- return 0;
-}
-
-static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
-{
- uprobe_opcode_t opcode;
- int result;
-
- if (current->mm == mm) {
- pagefault_disable();
- result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
- sizeof(opcode));
- pagefault_enable();
-
- if (likely(result == 0))
- goto out;
- }
-
- result = read_opcode(mm, vaddr, &opcode);
- if (result)
- return result;
-out:
- if (is_swbp_insn(&opcode))
- return 1;
-
- return 0;
-}
-
-/**
* set_swbp - store breakpoint at a given address.
* @auprobe: arch specific probepoint information.
* @mm: the probed process address space.
@@ -326,18 +306,7 @@ out:
*/
int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
{
- int result;
- /*
- * See the comment near uprobes_hash().
- */
- result = is_swbp_at_addr(mm, vaddr);
- if (result == 1)
- return 0;
-
- if (result)
- return result;
-
- return write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN);
+ return write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
}
/**
@@ -352,16 +321,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned
int __weak
set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
{
- int result;
-
- result = is_swbp_at_addr(mm, vaddr);
- if (!result)
- return -EINVAL;
-
- if (result != 1)
- return result;
-
- return write_opcode(auprobe, mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
+ return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
}
static int match_uprobe(struct uprobe *l, struct uprobe *r)
@@ -468,7 +428,7 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
spin_unlock(&uprobes_treelock);
/* For now assume that the instruction need not be single-stepped */
- uprobe->flags |= UPROBE_SKIP_SSTEP;
+ __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
return u;
}
@@ -490,6 +450,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
uprobe->inode = igrab(inode);
uprobe->offset = offset;
init_rwsem(&uprobe->consumer_rwsem);
+ mutex_init(&uprobe->copy_mutex);
/* add to uprobes_tree, sorted on inode:offset */
cur_uprobe = insert_uprobe(uprobe);
@@ -510,7 +471,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
{
struct uprobe_consumer *uc;
- if (!(uprobe->flags & UPROBE_RUN_HANDLER))
+ if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags))
return;
down_read(&uprobe->consumer_rwsem);
@@ -616,29 +577,43 @@ static int copy_insn(struct uprobe *uprobe, struct file *filp)
return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset);
}
-/*
- * How mm->uprobes_state.count gets updated
- * uprobe_mmap() increments the count if
- * - it successfully adds a breakpoint.
- * - it cannot add a breakpoint, but sees that there is a underlying
- * breakpoint (via a is_swbp_at_addr()).
- *
- * uprobe_munmap() decrements the count if
- * - it sees a underlying breakpoint, (via is_swbp_at_addr)
- * (Subsequent uprobe_unregister wouldnt find the breakpoint
- * unless a uprobe_mmap kicks in, since the old vma would be
- * dropped just after uprobe_munmap.)
- *
- * uprobe_register increments the count if:
- * - it successfully adds a breakpoint.
- *
- * uprobe_unregister decrements the count if:
- * - it sees a underlying breakpoint and removes successfully.
- * (via is_swbp_at_addr)
- * (Subsequent uprobe_munmap wouldnt find the breakpoint
- * since there is no underlying breakpoint after the
- * breakpoint removal.)
- */
+static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
+ struct mm_struct *mm, unsigned long vaddr)
+{
+ int ret = 0;
+
+ if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
+ return ret;
+
+ mutex_lock(&uprobe->copy_mutex);
+ if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
+ goto out;
+
+ ret = copy_insn(uprobe, file);
+ if (ret)
+ goto out;
+
+ ret = -ENOTSUPP;
+ if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn))
+ goto out;
+
+ ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
+ if (ret)
+ goto out;
+
+ /* write_opcode() assumes we don't cross page boundary */
+ BUG_ON((uprobe->offset & ~PAGE_MASK) +
+ UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
+
+ smp_wmb(); /* pairs with rmb() in find_active_uprobe() */
+ set_bit(UPROBE_COPY_INSN, &uprobe->flags);
+
+ out:
+ mutex_unlock(&uprobe->copy_mutex);
+
+ return ret;
+}
+
static int
install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long vaddr)
@@ -656,24 +631,9 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
if (!uprobe->consumers)
return 0;
- if (!(uprobe->flags & UPROBE_COPY_INSN)) {
- ret = copy_insn(uprobe, vma->vm_file);
- if (ret)
- return ret;
-
- if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn))
- return -ENOTSUPP;
-
- ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
- if (ret)
- return ret;
-
- /* write_opcode() assumes we don't cross page boundary */
- BUG_ON((uprobe->offset & ~PAGE_MASK) +
- UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
-
- uprobe->flags |= UPROBE_COPY_INSN;
- }
+ ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
+ if (ret)
+ return ret;
/*
* set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(),
@@ -692,15 +652,15 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
return ret;
}
-static void
+static int
remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
{
/* can happen if uprobe_register() fails */
if (!test_bit(MMF_HAS_UPROBES, &mm->flags))
- return;
+ return 0;
set_bit(MMF_RECALC_UPROBES, &mm->flags);
- set_orig_insn(&uprobe->arch, mm, vaddr);
+ return set_orig_insn(&uprobe->arch, mm, vaddr);
}
/*
@@ -735,7 +695,6 @@ static struct map_info *
build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
{
unsigned long pgoff = offset >> PAGE_SHIFT;
- struct prio_tree_iter iter;
struct vm_area_struct *vma;
struct map_info *curr = NULL;
struct map_info *prev = NULL;
@@ -744,7 +703,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
again:
mutex_lock(&mapping->i_mmap_mutex);
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
if (!valid_vma(vma, is_register))
continue;
@@ -816,7 +775,7 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
struct mm_struct *mm = info->mm;
struct vm_area_struct *vma;
- if (err)
+ if (err && is_register)
goto free;
down_write(&mm->mmap_sem);
@@ -832,7 +791,7 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
if (is_register)
err = install_breakpoint(uprobe, mm, vma, info->vaddr);
else
- remove_breakpoint(uprobe, mm, info->vaddr);
+ err |= remove_breakpoint(uprobe, mm, info->vaddr);
unlock:
up_write(&mm->mmap_sem);
@@ -889,13 +848,15 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
mutex_lock(uprobes_hash(inode));
uprobe = alloc_uprobe(inode, offset);
- if (uprobe && !consumer_add(uprobe, uc)) {
+ if (!uprobe) {
+ ret = -ENOMEM;
+ } else if (!consumer_add(uprobe, uc)) {
ret = __uprobe_register(uprobe);
if (ret) {
uprobe->consumers = NULL;
__uprobe_unregister(uprobe);
} else {
- uprobe->flags |= UPROBE_RUN_HANDLER;
+ set_bit(UPROBE_RUN_HANDLER, &uprobe->flags);
}
}
@@ -928,7 +889,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
if (consumer_del(uprobe, uc)) {
if (!uprobe->consumers) {
__uprobe_unregister(uprobe);
- uprobe->flags &= ~UPROBE_RUN_HANDLER;
+ clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags);
}
}
@@ -1389,10 +1350,11 @@ bool uprobe_deny_signal(void)
*/
static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs)
{
- if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
- return true;
-
- uprobe->flags &= ~UPROBE_SKIP_SSTEP;
+ if (test_bit(UPROBE_SKIP_SSTEP, &uprobe->flags)) {
+ if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
+ return true;
+ clear_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
+ }
return false;
}
@@ -1415,6 +1377,30 @@ static void mmf_recalc_uprobes(struct mm_struct *mm)
clear_bit(MMF_HAS_UPROBES, &mm->flags);
}
+static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
+{
+ struct page *page;
+ uprobe_opcode_t opcode;
+ int result;
+
+ pagefault_disable();
+ result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
+ sizeof(opcode));
+ pagefault_enable();
+
+ if (likely(result == 0))
+ goto out;
+
+ result = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
+ if (result < 0)
+ return result;
+
+ copy_opcode(page, vaddr, &opcode);
+ put_page(page);
+ out:
+ return is_swbp_insn(&opcode);
+}
+
static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
{
struct mm_struct *mm = current->mm;
@@ -1485,38 +1471,41 @@ static void handle_swbp(struct pt_regs *regs)
}
return;
}
+ /*
+ * TODO: move copy_insn/etc into _register and remove this hack.
+ * After we hit the bp, _unregister + _register can install the
+ * new and not-yet-analyzed uprobe at the same address, restart.
+ */
+ smp_rmb(); /* pairs with wmb() in install_breakpoint() */
+ if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
+ goto restart;
utask = current->utask;
if (!utask) {
utask = add_utask();
/* Cannot allocate; re-execute the instruction. */
if (!utask)
- goto cleanup_ret;
+ goto restart;
}
- utask->active_uprobe = uprobe;
+
handler_chain(uprobe, regs);
- if (uprobe->flags & UPROBE_SKIP_SSTEP && can_skip_sstep(uprobe, regs))
- goto cleanup_ret;
+ if (can_skip_sstep(uprobe, regs))
+ goto out;
- utask->state = UTASK_SSTEP;
if (!pre_ssout(uprobe, regs, bp_vaddr)) {
arch_uprobe_enable_step(&uprobe->arch);
+ utask->active_uprobe = uprobe;
+ utask->state = UTASK_SSTEP;
return;
}
-cleanup_ret:
- if (utask) {
- utask->active_uprobe = NULL;
- utask->state = UTASK_RUNNING;
- }
- if (!(uprobe->flags & UPROBE_SKIP_SSTEP))
-
- /*
- * cannot singlestep; cannot skip instruction;
- * re-execute the instruction.
- */
- instruction_pointer_set(regs, bp_vaddr);
-
+restart:
+ /*
+ * cannot singlestep; cannot skip instruction;
+ * re-execute the instruction.
+ */
+ instruction_pointer_set(regs, bp_vaddr);
+out:
put_uprobe(uprobe);
}
@@ -1548,13 +1537,12 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
}
/*
- * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag. (and on
- * subsequent probe hits on the thread sets the state to UTASK_BP_HIT) and
- * allows the thread to return from interrupt.
+ * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and
+ * allows the thread to return from interrupt. After that handle_swbp()
+ * sets utask->active_uprobe.
*
- * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag and
- * also sets the state to UTASK_SSTEP_ACK and allows the thread to return from
- * interrupt.
+ * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag
+ * and allows the thread to return from interrupt.
*
* While returning to userspace, thread notices the TIF_UPROBE flag and calls
* uprobe_notify_resume().
@@ -1563,11 +1551,13 @@ void uprobe_notify_resume(struct pt_regs *regs)
{
struct uprobe_task *utask;
+ clear_thread_flag(TIF_UPROBE);
+
utask = current->utask;
- if (!utask || utask->state == UTASK_BP_HIT)
- handle_swbp(regs);
- else
+ if (utask && utask->active_uprobe)
handle_singlestep(utask, regs);
+ else
+ handle_swbp(regs);
}
/*
@@ -1576,17 +1566,10 @@ void uprobe_notify_resume(struct pt_regs *regs)
*/
int uprobe_pre_sstep_notifier(struct pt_regs *regs)
{
- struct uprobe_task *utask;
-
if (!current->mm || !test_bit(MMF_HAS_UPROBES, &current->mm->flags))
return 0;
- utask = current->utask;
- if (utask)
- utask->state = UTASK_BP_HIT;
-
set_thread_flag(TIF_UPROBE);
-
return 1;
}
diff --git a/kernel/exit.c b/kernel/exit.c
index f65345f9e5bb..346616c0092c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -457,108 +457,13 @@ void daemonize(const char *name, ...)
/* Become as one with the init task */
daemonize_fs_struct();
- exit_files(current);
- current->files = init_task.files;
- atomic_inc(&current->files->count);
+ daemonize_descriptors();
reparent_to_kthreadd();
}
EXPORT_SYMBOL(daemonize);
-static void close_files(struct files_struct * files)
-{
- int i, j;
- struct fdtable *fdt;
-
- j = 0;
-
- /*
- * It is safe to dereference the fd table without RCU or
- * ->file_lock because this is the last reference to the
- * files structure. But use RCU to shut RCU-lockdep up.
- */
- rcu_read_lock();
- fdt = files_fdtable(files);
- rcu_read_unlock();
- for (;;) {
- unsigned long set;
- i = j * BITS_PER_LONG;
- if (i >= fdt->max_fds)
- break;
- set = fdt->open_fds[j++];
- while (set) {
- if (set & 1) {
- struct file * file = xchg(&fdt->fd[i], NULL);
- if (file) {
- filp_close(file, files);
- cond_resched();
- }
- }
- i++;
- set >>= 1;
- }
- }
-}
-
-struct files_struct *get_files_struct(struct task_struct *task)
-{
- struct files_struct *files;
-
- task_lock(task);
- files = task->files;
- if (files)
- atomic_inc(&files->count);
- task_unlock(task);
-
- return files;
-}
-
-void put_files_struct(struct files_struct *files)
-{
- struct fdtable *fdt;
-
- if (atomic_dec_and_test(&files->count)) {
- close_files(files);
- /*
- * Free the fd and fdset arrays if we expanded them.
- * If the fdtable was embedded, pass files for freeing
- * at the end of the RCU grace period. Otherwise,
- * you can free files immediately.
- */
- rcu_read_lock();
- fdt = files_fdtable(files);
- if (fdt != &files->fdtab)
- kmem_cache_free(files_cachep, files);
- free_fdtable(fdt);
- rcu_read_unlock();
- }
-}
-
-void reset_files_struct(struct files_struct *files)
-{
- struct task_struct *tsk = current;
- struct files_struct *old;
-
- old = tsk->files;
- task_lock(tsk);
- tsk->files = files;
- task_unlock(tsk);
- put_files_struct(old);
-}
-
-void exit_files(struct task_struct *tsk)
-{
- struct files_struct * files = tsk->files;
-
- if (files) {
- task_lock(tsk);
- tsk->files = NULL;
- task_unlock(tsk);
- put_files_struct(files);
- }
-}
-
#ifdef CONFIG_MM_OWNER
/*
* A task is exiting. If it owned this mm, find a new owner for the mm.
@@ -1046,6 +951,9 @@ void do_exit(long code)
if (tsk->splice_pipe)
__free_pipe_info(tsk->splice_pipe);
+ if (tsk->task_frag.page)
+ put_page(tsk->task_frag.page);
+
validate_creds_for_do_exit(tsk);
preempt_disable();
diff --git a/kernel/fork.c b/kernel/fork.c
index 5a0e74d89a5a..8b20ab7d3aa2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -330,6 +330,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
tsk->btrace_seq = 0;
#endif
tsk->splice_pipe = NULL;
+ tsk->task_frag.page = NULL;
account_kernel_stack(ti, 1);
@@ -422,7 +423,12 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
mapping->i_mmap_writable++;
flush_dcache_mmap_lock(mapping);
/* insert tmp into the share list, just after mpnt */
- vma_prio_tree_add(tmp, mpnt);
+ if (unlikely(tmp->vm_flags & VM_NONLINEAR))
+ vma_nonlinear_insert(tmp,
+ &mapping->i_mmap_nonlinear);
+ else
+ vma_interval_tree_insert_after(tmp, mpnt,
+ &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
mutex_unlock(&mapping->i_mmap_mutex);
}
@@ -621,26 +627,6 @@ void mmput(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(mmput);
-/*
- * We added or removed a vma mapping the executable. The vmas are only mapped
- * during exec and are not mapped with the mmap system call.
- * Callers must hold down_write() on the mm's mmap_sem for these
- */
-void added_exe_file_vma(struct mm_struct *mm)
-{
- mm->num_exe_file_vmas++;
-}
-
-void removed_exe_file_vma(struct mm_struct *mm)
-{
- mm->num_exe_file_vmas--;
- if ((mm->num_exe_file_vmas == 0) && mm->exe_file) {
- fput(mm->exe_file);
- mm->exe_file = NULL;
- }
-
-}
-
void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
if (new_exe_file)
@@ -648,15 +634,13 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
if (mm->exe_file)
fput(mm->exe_file);
mm->exe_file = new_exe_file;
- mm->num_exe_file_vmas = 0;
}
struct file *get_mm_exe_file(struct mm_struct *mm)
{
struct file *exe_file;
- /* We need mmap_sem to protect against races with removal of
- * VM_EXECUTABLE vmas */
+ /* We need mmap_sem to protect against races with removal of exe_file */
down_read(&mm->mmap_sem);
exe_file = mm->exe_file;
if (exe_file)
@@ -1077,7 +1061,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
init_rwsem(&sig->group_rwsem);
#endif
- sig->oom_adj = current->signal->oom_adj;
sig->oom_score_adj = current->signal->oom_score_adj;
sig->oom_score_adj_min = current->signal->oom_score_adj_min;
@@ -1601,7 +1584,7 @@ long do_fork(unsigned long clone_flags,
* requested, no event is reported; otherwise, report if the event
* for the type of forking is enabled.
*/
- if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) {
+ if (!(clone_flags & CLONE_UNTRACED) && likely(user_mode(regs))) {
if (clone_flags & CLONE_VFORK)
trace = PTRACE_EVENT_VFORK;
else if ((clone_flags & CSIGNAL) != SIGCHLD)
@@ -1651,6 +1634,17 @@ long do_fork(unsigned long clone_flags,
return nr;
}
+#ifdef CONFIG_GENERIC_KERNEL_THREAD
+/*
+ * Create a kernel thread.
+ */
+pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
+{
+ return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, NULL,
+ (unsigned long)arg, NULL, NULL);
+}
+#endif
+
#ifndef ARCH_MIN_MMSTRUCT_ALIGN
#define ARCH_MIN_MMSTRUCT_ALIGN 0
#endif
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 49a77727db42..4e69e24d3d7d 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -148,7 +148,8 @@ static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain,
* @host_data: Controller private data pointer
*
* Allocates a legacy irq_domain if irq_base is positive or a linear
- * domain otherwise.
+ * domain otherwise. For the legacy domain, IRQ descriptors will also
+ * be allocated.
*
* This is intended to implement the expected behaviour for most
* interrupt controllers which is that a linear mapping should
@@ -162,11 +163,33 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
const struct irq_domain_ops *ops,
void *host_data)
{
- if (first_irq > 0)
- return irq_domain_add_legacy(of_node, size, first_irq, 0,
+ if (first_irq > 0) {
+ int irq_base;
+
+ if (IS_ENABLED(CONFIG_SPARSE_IRQ)) {
+ /*
+ * Set the descriptor allocator to search for a
+ * 1-to-1 mapping, such as irq_alloc_desc_at().
+ * Use of_node_to_nid() which is defined to
+ * numa_node_id() on platforms that have no custom
+ * implementation.
+ */
+ irq_base = irq_alloc_descs(first_irq, first_irq, size,
+ of_node_to_nid(of_node));
+ if (irq_base < 0) {
+ WARN(1, "Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
+ first_irq);
+ irq_base = first_irq;
+ }
+ } else
+ irq_base = first_irq;
+
+ return irq_domain_add_legacy(of_node, size, irq_base, 0,
ops, host_data);
- else
- return irq_domain_add_linear(of_node, size, ops, host_data);
+ }
+
+ /* A linear domain is the default */
+ return irq_domain_add_linear(of_node, size, ops, host_data);
}
/**
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 43049192b5ec..60f48fa0fd0d 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -118,6 +118,7 @@ void jump_label_rate_limit(struct static_key_deferred *key,
key->timeout = rl;
INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
}
+EXPORT_SYMBOL_GPL(jump_label_rate_limit);
static int addr_conflict(struct jump_entry *entry, void *start, void *end)
{
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 0668d58d6413..5e4bd7864c5d 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -21,7 +21,6 @@
#include <linux/hardirq.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
-#include <generated/utsrelease.h>
#include <linux/utsname.h>
#include <linux/numa.h>
#include <linux/suspend.h>
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 6f99aead66c6..1c317e386831 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -37,6 +37,7 @@
#include <linux/notifier.h>
#include <linux/suspend.h>
#include <linux/rwsem.h>
+#include <linux/ptrace.h>
#include <asm/uaccess.h>
#include <trace/events/module.h>
@@ -221,11 +222,13 @@ static int ____call_usermodehelper(void *data)
retval = kernel_execve(sub_info->path,
(const char *const *)sub_info->argv,
(const char *const *)sub_info->envp);
+ if (!retval)
+ return 0;
/* Exec failed? */
fail:
sub_info->retval = retval;
- return 0;
+ do_exit(0);
}
static int call_helper(void *data)
@@ -292,7 +295,7 @@ static int wait_for_helper(void *data)
}
umh_complete(sub_info);
- return 0;
+ do_exit(0);
}
/* This is run by khelper thread */
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 146a6fa96825..29fb60caecb5 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -16,6 +16,7 @@
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/freezer.h>
+#include <linux/ptrace.h>
#include <trace/events/sched.h>
static DEFINE_SPINLOCK(kthread_create_lock);
diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c
new file mode 100644
index 000000000000..4646eb2c3820
--- /dev/null
+++ b/kernel/modsign_pubkey.c
@@ -0,0 +1,113 @@
+/* Public keys for module signature verification
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/cred.h>
+#include <linux/err.h>
+#include <keys/asymmetric-type.h>
+#include "module-internal.h"
+
+struct key *modsign_keyring;
+
+extern __initdata const u8 modsign_certificate_list[];
+extern __initdata const u8 modsign_certificate_list_end[];
+asm(".section .init.data,\"aw\"\n"
+ "modsign_certificate_list:\n"
+ ".incbin \"signing_key.x509\"\n"
+ ".incbin \"extra_certificates\"\n"
+ "modsign_certificate_list_end:"
+ );
+
+/*
+ * We need to make sure ccache doesn't cache the .o file as it doesn't notice
+ * if modsign.pub changes.
+ */
+static __initdata const char annoy_ccache[] = __TIME__ "foo";
+
+/*
+ * Load the compiled-in keys
+ */
+static __init int module_verify_init(void)
+{
+ pr_notice("Initialise module verification\n");
+
+ modsign_keyring = key_alloc(&key_type_keyring, ".module_sign",
+ KUIDT_INIT(0), KGIDT_INIT(0),
+ current_cred(),
+ (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+ KEY_USR_VIEW | KEY_USR_READ,
+ KEY_ALLOC_NOT_IN_QUOTA);
+ if (IS_ERR(modsign_keyring))
+ panic("Can't allocate module signing keyring\n");
+
+ if (key_instantiate_and_link(modsign_keyring, NULL, 0, NULL, NULL) < 0)
+ panic("Can't instantiate module signing keyring\n");
+
+ return 0;
+}
+
+/*
+ * Must be initialised before we try and load the keys into the keyring.
+ */
+device_initcall(module_verify_init);
+
+/*
+ * Load the compiled-in keys
+ */
+static __init int load_module_signing_keys(void)
+{
+ key_ref_t key;
+ const u8 *p, *end;
+ size_t plen;
+
+ pr_notice("Loading module verification certificates\n");
+
+ end = modsign_certificate_list_end;
+ p = modsign_certificate_list;
+ while (p < end) {
+ /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
+ * than 256 bytes in size.
+ */
+ if (end - p < 4)
+ goto dodgy_cert;
+ if (p[0] != 0x30 &&
+ p[1] != 0x82)
+ goto dodgy_cert;
+ plen = (p[2] << 8) | p[3];
+ plen += 4;
+ if (plen > end - p)
+ goto dodgy_cert;
+
+ key = key_create_or_update(make_key_ref(modsign_keyring, 1),
+ "asymmetric",
+ NULL,
+ p,
+ plen,
+ (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+ KEY_USR_VIEW,
+ KEY_ALLOC_NOT_IN_QUOTA);
+ if (IS_ERR(key))
+ pr_err("MODSIGN: Problem loading in-kernel X.509 certificate (%ld)\n",
+ PTR_ERR(key));
+ else
+ pr_notice("MODSIGN: Loaded cert '%s'\n",
+ key_ref_to_ptr(key)->description);
+ p += plen;
+ }
+
+ return 0;
+
+dodgy_cert:
+ pr_err("MODSIGN: Problem parsing in-kernel X.509 certificate list\n");
+ return 0;
+}
+late_initcall(load_module_signing_keys);
diff --git a/kernel/module-internal.h b/kernel/module-internal.h
new file mode 100644
index 000000000000..24f9247b7d02
--- /dev/null
+++ b/kernel/module-internal.h
@@ -0,0 +1,14 @@
+/* Module internals
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+extern struct key *modsign_keyring;
+
+extern int mod_verify_sig(const void *mod, unsigned long *_modlen);
diff --git a/kernel/module.c b/kernel/module.c
index 4edbd9c11aca..6085f5ef88ea 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -58,6 +58,8 @@
#include <linux/jump_label.h>
#include <linux/pfn.h>
#include <linux/bsearch.h>
+#include <linux/fips.h>
+#include "module-internal.h"
#define CREATE_TRACE_POINTS
#include <trace/events/module.h>
@@ -102,6 +104,43 @@ static LIST_HEAD(modules);
struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
#endif /* CONFIG_KGDB_KDB */
+#ifdef CONFIG_MODULE_SIG
+#ifdef CONFIG_MODULE_SIG_FORCE
+static bool sig_enforce = true;
+#else
+static bool sig_enforce = false;
+
+static int param_set_bool_enable_only(const char *val,
+ const struct kernel_param *kp)
+{
+ int err;
+ bool test;
+ struct kernel_param dummy_kp = *kp;
+
+ dummy_kp.arg = &test;
+
+ err = param_set_bool(val, &dummy_kp);
+ if (err)
+ return err;
+
+ /* Don't let them unset it once it's set! */
+ if (!test && sig_enforce)
+ return -EROFS;
+
+ if (test)
+ sig_enforce = true;
+ return 0;
+}
+
+static const struct kernel_param_ops param_ops_bool_enable_only = {
+ .set = param_set_bool_enable_only,
+ .get = param_get_bool,
+};
+#define param_check_bool_enable_only param_check_bool
+
+module_param(sig_enforce, bool_enable_only, 0644);
+#endif /* !CONFIG_MODULE_SIG_FORCE */
+#endif /* CONFIG_MODULE_SIG */
/* Block module loading/unloading? */
int modules_disabled = 0;
@@ -136,6 +175,7 @@ struct load_info {
unsigned long symoffs, stroffs;
struct _ddebug *debug;
unsigned int num_debug;
+ bool sig_ok;
struct {
unsigned int sym, str, mod, vers, info, pcpu;
} index;
@@ -1949,26 +1989,6 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
return ret;
}
-int __weak apply_relocate(Elf_Shdr *sechdrs,
- const char *strtab,
- unsigned int symindex,
- unsigned int relsec,
- struct module *me)
-{
- pr_err("module %s: REL relocation unsupported\n", me->name);
- return -ENOEXEC;
-}
-
-int __weak apply_relocate_add(Elf_Shdr *sechdrs,
- const char *strtab,
- unsigned int symindex,
- unsigned int relsec,
- struct module *me)
-{
- pr_err("module %s: RELA relocation unsupported\n", me->name);
- return -ENOEXEC;
-}
-
static int apply_relocations(struct module *mod, const struct load_info *info)
{
unsigned int i;
@@ -2399,7 +2419,44 @@ static inline void kmemleak_load_module(const struct module *mod,
}
#endif
-/* Sets info->hdr and info->len. */
+#ifdef CONFIG_MODULE_SIG
+static int module_sig_check(struct load_info *info,
+ const void *mod, unsigned long *_len)
+{
+ int err = -ENOKEY;
+ unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
+ unsigned long len = *_len;
+
+ if (len > markerlen &&
+ memcmp(mod + len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
+ /* We truncate the module to discard the signature */
+ *_len -= markerlen;
+ err = mod_verify_sig(mod, _len);
+ }
+
+ if (!err) {
+ info->sig_ok = true;
+ return 0;
+ }
+
+ /* Not having a signature is only an error if we're strict. */
+ if (err < 0 && fips_enabled)
+ panic("Module verification failed with error %d in FIPS mode\n",
+ err);
+ if (err == -ENOKEY && !sig_enforce)
+ err = 0;
+
+ return err;
+}
+#else /* !CONFIG_MODULE_SIG */
+static int module_sig_check(struct load_info *info,
+ void *mod, unsigned long *len)
+{
+ return 0;
+}
+#endif /* !CONFIG_MODULE_SIG */
+
+/* Sets info->hdr, info->len and info->sig_ok. */
static int copy_and_check(struct load_info *info,
const void __user *umod, unsigned long len,
const char __user *uargs)
@@ -2419,6 +2476,10 @@ static int copy_and_check(struct load_info *info,
goto free_hdr;
}
+ err = module_sig_check(info, hdr, &len);
+ if (err)
+ goto free_hdr;
+
/* Sanity checks against insmoding binaries or wrong arch,
weird elf version */
if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0
@@ -2730,6 +2791,10 @@ static int check_module_license_and_versions(struct module *mod)
if (strcmp(mod->name, "driverloader") == 0)
add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
+ /* lve claims to be GPL but upstream won't provide source */
+ if (strcmp(mod->name, "lve") == 0)
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
+
#ifdef CONFIG_MODVERSIONS
if ((mod->num_syms && !mod->crcs)
|| (mod->num_gpl_syms && !mod->gpl_crcs)
@@ -2861,6 +2926,20 @@ static int post_relocation(struct module *mod, const struct load_info *info)
return module_finalize(info->hdr, info->sechdrs, mod);
}
+/* Is this module of this name done loading? No locks held. */
+static bool finished_loading(const char *name)
+{
+ struct module *mod;
+ bool ret;
+
+ mutex_lock(&module_mutex);
+ mod = find_module(name);
+ ret = !mod || mod->state != MODULE_STATE_COMING;
+ mutex_unlock(&module_mutex);
+
+ return ret;
+}
+
/* Allocate and load the module: note that size of section 0 is always
zero, and we rely on this for optional sections. */
static struct module *load_module(void __user *umod,
@@ -2868,7 +2947,7 @@ static struct module *load_module(void __user *umod,
const char __user *uargs)
{
struct load_info info = { NULL, };
- struct module *mod;
+ struct module *mod, *old;
long err;
pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n",
@@ -2886,6 +2965,12 @@ static struct module *load_module(void __user *umod,
goto free_copy;
}
+#ifdef CONFIG_MODULE_SIG
+ mod->sig_ok = info.sig_ok;
+ if (!mod->sig_ok)
+ add_taint_module(mod, TAINT_FORCED_MODULE);
+#endif
+
/* Now module is in final location, initialize linked lists, etc. */
err = module_unload_init(mod);
if (err)
@@ -2934,8 +3019,18 @@ static struct module *load_module(void __user *umod,
* function to insert in a way safe to concurrent readers.
* The mutex protects against concurrent writers.
*/
+again:
mutex_lock(&module_mutex);
- if (find_module(mod->name)) {
+ if ((old = find_module(mod->name)) != NULL) {
+ if (old->state == MODULE_STATE_COMING) {
+ /* Wait in case it fails to load. */
+ mutex_unlock(&module_mutex);
+ err = wait_event_interruptible(module_wq,
+ finished_loading(mod->name));
+ if (err)
+ goto free_arch_cleanup;
+ goto again;
+ }
err = -EEXIST;
goto unlock;
}
@@ -2975,7 +3070,7 @@ static struct module *load_module(void __user *umod,
/* Unlink carefully: kallsyms could be walking list. */
list_del_rcu(&mod->list);
module_bug_cleanup(mod);
-
+ wake_up_all(&module_wq);
ddebug:
dynamic_debug_remove(info.debug);
unlock:
@@ -3050,7 +3145,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_GOING, mod);
free_module(mod);
- wake_up(&module_wq);
+ wake_up_all(&module_wq);
return ret;
}
if (ret > 0) {
@@ -3062,9 +3157,8 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
dump_stack();
}
- /* Now it's a first class citizen! Wake up anyone waiting for it. */
+ /* Now it's a first class citizen! */
mod->state = MODULE_STATE_LIVE;
- wake_up(&module_wq);
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_LIVE, mod);
@@ -3087,6 +3181,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
mod->init_ro_size = 0;
mod->init_text_size = 0;
mutex_unlock(&module_mutex);
+ wake_up_all(&module_wq);
return 0;
}
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
new file mode 100644
index 000000000000..d492a23df99c
--- /dev/null
+++ b/kernel/module_signing.c
@@ -0,0 +1,249 @@
+/* Module signature checker
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <crypto/public_key.h>
+#include <crypto/hash.h>
+#include <keys/asymmetric-type.h>
+#include "module-internal.h"
+
+/*
+ * Module signature information block.
+ *
+ * The constituents of the signature section are, in order:
+ *
+ * - Signer's name
+ * - Key identifier
+ * - Signature data
+ * - Information block
+ */
+struct module_signature {
+ enum pkey_algo algo : 8; /* Public-key crypto algorithm */
+ enum pkey_hash_algo hash : 8; /* Digest algorithm */
+ enum pkey_id_type id_type : 8; /* Key identifier type */
+ u8 signer_len; /* Length of signer's name */
+ u8 key_id_len; /* Length of key identifier */
+ u8 __pad[3];
+ __be32 sig_len; /* Length of signature data */
+};
+
+/*
+ * Digest the module contents.
+ */
+static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash,
+ const void *mod,
+ unsigned long modlen)
+{
+ struct public_key_signature *pks;
+ struct crypto_shash *tfm;
+ struct shash_desc *desc;
+ size_t digest_size, desc_size;
+ int ret;
+
+ pr_devel("==>%s()\n", __func__);
+
+ /* Allocate the hashing algorithm we're going to need and find out how
+ * big the hash operational data will be.
+ */
+ tfm = crypto_alloc_shash(pkey_hash_algo[hash], 0, 0);
+ if (IS_ERR(tfm))
+ return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm);
+
+ desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
+ digest_size = crypto_shash_digestsize(tfm);
+
+ /* We allocate the hash operational data storage on the end of our
+ * context data and the digest output buffer on the end of that.
+ */
+ ret = -ENOMEM;
+ pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL);
+ if (!pks)
+ goto error_no_pks;
+
+ pks->pkey_hash_algo = hash;
+ pks->digest = (u8 *)pks + sizeof(*pks) + desc_size;
+ pks->digest_size = digest_size;
+
+ desc = (void *)pks + sizeof(*pks);
+ desc->tfm = tfm;
+ desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ ret = crypto_shash_init(desc);
+ if (ret < 0)
+ goto error;
+
+ ret = crypto_shash_finup(desc, mod, modlen, pks->digest);
+ if (ret < 0)
+ goto error;
+
+ crypto_free_shash(tfm);
+ pr_devel("<==%s() = ok\n", __func__);
+ return pks;
+
+error:
+ kfree(pks);
+error_no_pks:
+ crypto_free_shash(tfm);
+ pr_devel("<==%s() = %d\n", __func__, ret);
+ return ERR_PTR(ret);
+}
+
+/*
+ * Extract an MPI array from the signature data. This represents the actual
+ * signature. Each raw MPI is prefaced by a BE 2-byte value indicating the
+ * size of the MPI in bytes.
+ *
+ * RSA signatures only have one MPI, so currently we only read one.
+ */
+static int mod_extract_mpi_array(struct public_key_signature *pks,
+ const void *data, size_t len)
+{
+ size_t nbytes;
+ MPI mpi;
+
+ if (len < 3)
+ return -EBADMSG;
+ nbytes = ((const u8 *)data)[0] << 8 | ((const u8 *)data)[1];
+ data += 2;
+ len -= 2;
+ if (len != nbytes)
+ return -EBADMSG;
+
+ mpi = mpi_read_raw_data(data, nbytes);
+ if (!mpi)
+ return -ENOMEM;
+ pks->mpi[0] = mpi;
+ pks->nr_mpi = 1;
+ return 0;
+}
+
+/*
+ * Request an asymmetric key.
+ */
+static struct key *request_asymmetric_key(const char *signer, size_t signer_len,
+ const u8 *key_id, size_t key_id_len)
+{
+ key_ref_t key;
+ size_t i;
+ char *id, *q;
+
+ pr_devel("==>%s(,%zu,,%zu)\n", __func__, signer_len, key_id_len);
+
+ /* Construct an identifier. */
+ id = kmalloc(signer_len + 2 + key_id_len * 2 + 1, GFP_KERNEL);
+ if (!id)
+ return ERR_PTR(-ENOKEY);
+
+ memcpy(id, signer, signer_len);
+
+ q = id + signer_len;
+ *q++ = ':';
+ *q++ = ' ';
+ for (i = 0; i < key_id_len; i++) {
+ *q++ = hex_asc[*key_id >> 4];
+ *q++ = hex_asc[*key_id++ & 0x0f];
+ }
+
+ *q = 0;
+
+ pr_debug("Look up: \"%s\"\n", id);
+
+ key = keyring_search(make_key_ref(modsign_keyring, 1),
+ &key_type_asymmetric, id);
+ if (IS_ERR(key))
+ pr_warn("Request for unknown module key '%s' err %ld\n",
+ id, PTR_ERR(key));
+ kfree(id);
+
+ if (IS_ERR(key)) {
+ switch (PTR_ERR(key)) {
+ /* Hide some search errors */
+ case -EACCES:
+ case -ENOTDIR:
+ case -EAGAIN:
+ return ERR_PTR(-ENOKEY);
+ default:
+ return ERR_CAST(key);
+ }
+ }
+
+ pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key_ref_to_ptr(key)));
+ return key_ref_to_ptr(key);
+}
+
+/*
+ * Verify the signature on a module.
+ */
+int mod_verify_sig(const void *mod, unsigned long *_modlen)
+{
+ struct public_key_signature *pks;
+ struct module_signature ms;
+ struct key *key;
+ const void *sig;
+ size_t modlen = *_modlen, sig_len;
+ int ret;
+
+ pr_devel("==>%s(,%lu)\n", __func__, modlen);
+
+ if (modlen <= sizeof(ms))
+ return -EBADMSG;
+
+ memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms));
+ modlen -= sizeof(ms);
+
+ sig_len = be32_to_cpu(ms.sig_len);
+ if (sig_len >= modlen)
+ return -EBADMSG;
+ modlen -= sig_len;
+ if ((size_t)ms.signer_len + ms.key_id_len >= modlen)
+ return -EBADMSG;
+ modlen -= (size_t)ms.signer_len + ms.key_id_len;
+
+ *_modlen = modlen;
+ sig = mod + modlen;
+
+ /* For the moment, only support RSA and X.509 identifiers */
+ if (ms.algo != PKEY_ALGO_RSA ||
+ ms.id_type != PKEY_ID_X509)
+ return -ENOPKG;
+
+ if (ms.hash >= PKEY_HASH__LAST ||
+ !pkey_hash_algo[ms.hash])
+ return -ENOPKG;
+
+ key = request_asymmetric_key(sig, ms.signer_len,
+ sig + ms.signer_len, ms.key_id_len);
+ if (IS_ERR(key))
+ return PTR_ERR(key);
+
+ pks = mod_make_digest(ms.hash, mod, modlen);
+ if (IS_ERR(pks)) {
+ ret = PTR_ERR(pks);
+ goto error_put_key;
+ }
+
+ ret = mod_extract_mpi_array(pks, sig + ms.signer_len + ms.key_id_len,
+ sig_len);
+ if (ret < 0)
+ goto error_free_pks;
+
+ ret = verify_signature(key, pks);
+ pr_devel("verify_signature() = %d\n", ret);
+
+error_free_pks:
+ mpi_free(pks->rsa.s);
+ kfree(pks);
+error_put_key:
+ key_put(key);
+ pr_devel("<==%s() = %d\n", __func__, ret);
+ return ret;
+}
diff --git a/kernel/pid.c b/kernel/pid.c
index e86b291ad834..aebd4f5aaf41 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -479,6 +479,7 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
}
return nr;
}
+EXPORT_SYMBOL_GPL(pid_nr_ns);
pid_t pid_vnr(struct pid *pid)
{
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 6144bab8fd8e..eb00be205811 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -16,6 +16,7 @@
#include <linux/slab.h>
#include <linux/proc_fs.h>
#include <linux/reboot.h>
+#include <linux/export.h>
#define BITS_PER_PAGE (PAGE_SIZE*8)
@@ -132,18 +133,26 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old
return create_pid_namespace(old_ns);
}
-void free_pid_ns(struct kref *kref)
+static void free_pid_ns(struct kref *kref)
{
- struct pid_namespace *ns, *parent;
+ struct pid_namespace *ns;
ns = container_of(kref, struct pid_namespace, kref);
-
- parent = ns->parent;
destroy_pid_namespace(ns);
+}
- if (parent != NULL)
- put_pid_ns(parent);
+void put_pid_ns(struct pid_namespace *ns)
+{
+ struct pid_namespace *parent;
+
+ while (ns != &init_pid_ns) {
+ parent = ns->parent;
+ if (!kref_put(&ns->kref, free_pid_ns))
+ break;
+ ns = parent;
+ }
}
+EXPORT_SYMBOL_GPL(put_pid_ns);
void zap_pid_ns_processes(struct pid_namespace *pid_ns)
{
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index a70518c9d82f..5dfdc9ea180b 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -263,6 +263,10 @@ config PM_GENERIC_DOMAINS
bool
depends on PM
+config PM_GENERIC_DOMAINS_SLEEP
+ def_bool y
+ depends on PM_SLEEP && PM_GENERIC_DOMAINS
+
config PM_GENERIC_DOMAINS_RUNTIME
def_bool y
depends on PM_RUNTIME && PM_GENERIC_DOMAINS
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index d52359374e85..68197a4e8fc9 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -37,7 +37,7 @@ static struct sysrq_key_op sysrq_poweroff_op = {
.enable_mask = SYSRQ_ENABLE_BOOT,
};
-static int pm_sysrq_init(void)
+static int __init pm_sysrq_init(void)
{
register_sysrq_key('o', &sysrq_poweroff_op);
return 0;
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 19db29f67558..87da817f9e13 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -79,7 +79,7 @@ static int try_to_freeze_tasks(bool user_only)
/*
* We need to retry, but first give the freezing tasks some
- * time to enter the regrigerator.
+ * time to enter the refrigerator.
*/
msleep(10);
}
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 6a031e684026..846bd42c7ed1 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -139,6 +139,7 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c)
default:
/* runtime check for not using enum */
BUG();
+ return PM_QOS_DEFAULT_VALUE;
}
}
diff --git a/kernel/printk.c b/kernel/printk.c
index 66a2ea37b576..2d607f4d1797 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1890,7 +1890,6 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self,
switch (action) {
case CPU_ONLINE:
case CPU_DEAD:
- case CPU_DYING:
case CPU_DOWN_FAILED:
case CPU_UP_CANCELED:
console_lock();
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index a232bb59d93f..1f5e55dda955 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -180,7 +180,8 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
return has_ns_capability(current, ns, CAP_SYS_PTRACE);
}
-int __ptrace_may_access(struct task_struct *task, unsigned int mode)
+/* Returns 0 on success, -errno on denial. */
+static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
{
const struct cred *cred = current_cred(), *tcred;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 4fb2376ddf06..74df86bd9204 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -74,6 +74,7 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
.orphan_nxttail = &sname##_state.orphan_nxtlist, \
.orphan_donetail = &sname##_state.orphan_donelist, \
.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
+ .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
.name = #sname, \
}
@@ -1197,7 +1198,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
raw_spin_unlock_irq(&rnp->lock);
/* Exclude any concurrent CPU-hotplug operations. */
- get_online_cpus();
+ mutex_lock(&rsp->onoff_mutex);
/*
* Set the quiescent-state-needed bits in all the rcu_node
@@ -1234,7 +1235,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
cond_resched();
}
- put_online_cpus();
+ mutex_unlock(&rsp->onoff_mutex);
return 1;
}
@@ -1700,6 +1701,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
/* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
/* Exclude any attempts to start a new grace period. */
+ mutex_lock(&rsp->onoff_mutex);
raw_spin_lock_irqsave(&rsp->onofflock, flags);
/* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
@@ -1744,6 +1746,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
init_callback_list(rdp);
/* Disallow further callbacks on this CPU. */
rdp->nxttail[RCU_NEXT_TAIL] = NULL;
+ mutex_unlock(&rsp->onoff_mutex);
}
#else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -2648,6 +2651,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rcu_get_root(rsp);
+ /* Exclude new grace periods. */
+ mutex_lock(&rsp->onoff_mutex);
+
/* Set up local state, ensuring consistent view of global state. */
raw_spin_lock_irqsave(&rnp->lock, flags);
rdp->beenonline = 1; /* We have now been online. */
@@ -2662,14 +2668,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
rcu_prepare_for_idle_init(cpu);
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- /*
- * A new grace period might start here. If so, we won't be part
- * of it, but that is OK, as we are currently in a quiescent state.
- */
-
- /* Exclude any attempts to start a new GP on large systems. */
- raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
-
/* Add CPU to rcu_node bitmasks. */
rnp = rdp->mynode;
mask = rdp->grpmask;
@@ -2693,8 +2691,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
rnp = rnp->parent;
} while (rnp != NULL && !(rnp->qsmaskinit & mask));
+ local_irq_restore(flags);
- raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
+ mutex_unlock(&rsp->onoff_mutex);
}
static void __cpuinit rcu_prepare_cpu(int cpu)
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 5faf05d68326..a240f032848e 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -394,11 +394,17 @@ struct rcu_state {
struct rcu_head **orphan_donetail; /* Tail of above. */
long qlen_lazy; /* Number of lazy callbacks. */
long qlen; /* Total number of callbacks. */
+ /* End of fields guarded by onofflock. */
+
+ struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */
+
struct mutex barrier_mutex; /* Guards barrier fields. */
atomic_t barrier_cpu_count; /* # CPUs waiting on. */
struct completion barrier_completion; /* Wake at barrier end. */
unsigned long n_barrier_done; /* ++ at start and end of */
/* _rcu_barrier(). */
+ /* End of fields guarded by barrier_mutex. */
+
unsigned long jiffies_force_qs; /* Time at which to invoke */
/* force_quiescent_state(). */
unsigned long n_force_qs; /* Number of calls to */
diff --git a/kernel/resource.c b/kernel/resource.c
index 34d45886ee84..73f35d4b30b9 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -763,6 +763,7 @@ static void __init __reserve_region_with_split(struct resource *root,
struct resource *parent = root;
struct resource *conflict;
struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC);
+ struct resource *next_res = NULL;
if (!res)
return;
@@ -772,21 +773,46 @@ static void __init __reserve_region_with_split(struct resource *root,
res->end = end;
res->flags = IORESOURCE_BUSY;
- conflict = __request_resource(parent, res);
- if (!conflict)
- return;
+ while (1) {
- /* failed, split and try again */
- kfree(res);
+ conflict = __request_resource(parent, res);
+ if (!conflict) {
+ if (!next_res)
+ break;
+ res = next_res;
+ next_res = NULL;
+ continue;
+ }
- /* conflict covered whole area */
- if (conflict->start <= start && conflict->end >= end)
- return;
+ /* conflict covered whole area */
+ if (conflict->start <= res->start &&
+ conflict->end >= res->end) {
+ kfree(res);
+ WARN_ON(next_res);
+ break;
+ }
+
+ /* failed, split and try again */
+ if (conflict->start > res->start) {
+ end = res->end;
+ res->end = conflict->start - 1;
+ if (conflict->end < end) {
+ next_res = kzalloc(sizeof(*next_res),
+ GFP_ATOMIC);
+ if (!next_res) {
+ kfree(res);
+ break;
+ }
+ next_res->name = name;
+ next_res->start = conflict->end + 1;
+ next_res->end = end;
+ next_res->flags = IORESOURCE_BUSY;
+ }
+ } else {
+ res->start = conflict->end + 1;
+ }
+ }
- if (conflict->start > start)
- __reserve_region_with_split(root, start, conflict->start-1, name);
- if (conflict->end < end)
- __reserve_region_with_split(root, conflict->end+1, end, name);
}
void __init reserve_region_with_split(struct resource *root,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c17747236438..2d8927fda712 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -505,7 +505,7 @@ static inline void init_hrtick(void)
#ifdef CONFIG_SMP
#ifndef tsk_is_polling
-#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
+#define tsk_is_polling(t) 0
#endif
void resched_task(struct task_struct *p)
@@ -6122,6 +6122,17 @@ static void sched_init_numa(void)
* numbers.
*/
+ /*
+ * Here, we should temporarily reset sched_domains_numa_levels to 0.
+ * If it fails to allocate memory for array sched_domains_numa_masks[][],
+ * the array will contain less then 'level' members. This could be
+ * dangerous when we use it to iterate array sched_domains_numa_masks[][]
+ * in other functions.
+ *
+ * We reset it to 'level' at the end of this function.
+ */
+ sched_domains_numa_levels = 0;
+
sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
if (!sched_domains_numa_masks)
return;
@@ -6176,11 +6187,68 @@ static void sched_init_numa(void)
}
sched_domain_topology = tl;
+
+ sched_domains_numa_levels = level;
+}
+
+static void sched_domains_numa_masks_set(int cpu)
+{
+ int i, j;
+ int node = cpu_to_node(cpu);
+
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ for (j = 0; j < nr_node_ids; j++) {
+ if (node_distance(j, node) <= sched_domains_numa_distance[i])
+ cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
+ }
+ }
+}
+
+static void sched_domains_numa_masks_clear(int cpu)
+{
+ int i, j;
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ for (j = 0; j < nr_node_ids; j++)
+ cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
+ }
+}
+
+/*
+ * Update sched_domains_numa_masks[level][node] array when new cpus
+ * are onlined.
+ */
+static int sched_domains_numa_masks_update(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ int cpu = (long)hcpu;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_ONLINE:
+ sched_domains_numa_masks_set(cpu);
+ break;
+
+ case CPU_DEAD:
+ sched_domains_numa_masks_clear(cpu);
+ break;
+
+ default:
+ return NOTIFY_DONE;
+ }
+
+ return NOTIFY_OK;
}
#else
static inline void sched_init_numa(void)
{
}
+
+static int sched_domains_numa_masks_update(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ return 0;
+}
#endif /* CONFIG_NUMA */
static int __sdt_alloc(const struct cpumask *cpu_map)
@@ -6629,6 +6697,7 @@ void __init sched_init_smp(void)
mutex_unlock(&sched_domains_mutex);
put_online_cpus();
+ hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
diff --git a/kernel/signal.c b/kernel/signal.c
index 2c681f11b7d2..0af8868525d6 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -17,6 +17,7 @@
#include <linux/fs.h>
#include <linux/tty.h>
#include <linux/binfmts.h>
+#include <linux/coredump.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ptrace.h>
@@ -2359,7 +2360,7 @@ relock:
* first and our do_group_exit call below will use
* that value and ignore the one we pass it.
*/
- do_coredump(info->si_signo, info->si_signo, regs);
+ do_coredump(info, regs);
}
/*
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 2095be3318d5..97c465ebd844 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -379,7 +379,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
rcu_batch_queue(&sp->batch_queue, head);
if (!sp->running) {
sp->running = true;
- queue_delayed_work(system_nrt_wq, &sp->work, 0);
+ schedule_delayed_work(&sp->work, 0);
}
spin_unlock_irqrestore(&sp->queue_lock, flags);
}
@@ -631,7 +631,7 @@ static void srcu_reschedule(struct srcu_struct *sp)
}
if (pending)
- queue_delayed_work(system_nrt_wq, &sp->work, SRCU_INTERVAL);
+ schedule_delayed_work(&sp->work, SRCU_INTERVAL);
}
/*
diff --git a/kernel/sys.c b/kernel/sys.c
index 241507f23eca..e6e0ece5f6a0 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -368,6 +368,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
void kernel_restart(char *cmd)
{
kernel_restart_prepare(cmd);
+ disable_nonboot_cpus();
if (!cmd)
printk(KERN_EMERG "Restarting system.\n");
else
@@ -1264,15 +1265,16 @@ DECLARE_RWSEM(uts_sem);
* Work around broken programs that cannot handle "Linux 3.0".
* Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40
*/
-static int override_release(char __user *release, int len)
+static int override_release(char __user *release, size_t len)
{
int ret = 0;
- char buf[65];
if (current->personality & UNAME26) {
- char *rest = UTS_RELEASE;
+ const char *rest = UTS_RELEASE;
+ char buf[65] = { 0 };
int ndots = 0;
unsigned v;
+ size_t copy;
while (*rest) {
if (*rest == '.' && ++ndots >= 3)
@@ -1282,8 +1284,9 @@ static int override_release(char __user *release, int len)
rest++;
}
v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40;
- snprintf(buf, len, "2.6.%u%s", v, rest);
- ret = copy_to_user(release, buf, len);
+ copy = clamp_t(size_t, len, 1, sizeof(buf));
+ copy = scnprintf(buf, copy, "2.6.%u%s", v, rest);
+ ret = copy_to_user(release, buf, copy + 1);
}
return ret;
}
@@ -1788,15 +1791,15 @@ SYSCALL_DEFINE1(umask, int, mask)
#ifdef CONFIG_CHECKPOINT_RESTORE
static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
{
- struct file *exe_file;
+ struct fd exe;
struct dentry *dentry;
int err;
- exe_file = fget(fd);
- if (!exe_file)
+ exe = fdget(fd);
+ if (!exe.file)
return -EBADF;
- dentry = exe_file->f_path.dentry;
+ dentry = exe.file->f_path.dentry;
/*
* Because the original mm->exe_file points to executable file, make
@@ -1805,7 +1808,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
*/
err = -EACCES;
if (!S_ISREG(dentry->d_inode->i_mode) ||
- exe_file->f_path.mnt->mnt_flags & MNT_NOEXEC)
+ exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC)
goto exit;
err = inode_permission(dentry->d_inode, MAY_EXEC);
@@ -1839,12 +1842,12 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
goto exit_unlock;
err = 0;
- set_mm_exe_file(mm, exe_file);
+ set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */
exit_unlock:
up_write(&mm->mmap_sem);
exit:
- fput(exe_file);
+ fdput(exe);
return err;
}
@@ -2204,7 +2207,7 @@ static int __orderly_poweroff(void)
return -ENOMEM;
}
- ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT,
+ ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC,
NULL, argv_cleanup, NULL);
if (ret == -ENOMEM)
argv_free(argv);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 81c7b1a1a307..26f65eaa01f9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -97,10 +97,12 @@
extern int sysctl_overcommit_memory;
extern int sysctl_overcommit_ratio;
extern int max_threads;
-extern int core_uses_pid;
extern int suid_dumpable;
+#ifdef CONFIG_COREDUMP
+extern int core_uses_pid;
extern char core_pattern[];
extern unsigned int core_pipe_limit;
+#endif
extern int pid_max;
extern int min_free_kbytes;
extern int pid_max_min, pid_max_max;
@@ -177,8 +179,10 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
+#ifdef CONFIG_COREDUMP
static int proc_dostring_coredump(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
+#endif
#ifdef CONFIG_MAGIC_SYSRQ
/* Note: sysrq code uses it's own private copy */
@@ -404,6 +408,7 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#ifdef CONFIG_COREDUMP
{
.procname = "core_uses_pid",
.data = &core_uses_pid,
@@ -425,6 +430,7 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#endif
#ifdef CONFIG_PROC_SYSCTL
{
.procname = "tainted",
@@ -1543,8 +1549,7 @@ static struct ctl_table fs_table[] = {
};
static struct ctl_table debug_table[] = {
-#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \
- defined(CONFIG_S390) || defined(CONFIG_TILE)
+#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE
{
.procname = "exception-trace",
.data = &show_unhandled_signals,
@@ -2036,12 +2041,14 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
static void validate_coredump_safety(void)
{
+#ifdef CONFIG_COREDUMP
if (suid_dumpable == SUID_DUMPABLE_SAFE &&
core_pattern[0] != '/' && core_pattern[0] != '|') {
printk(KERN_WARNING "Unsafe core_pattern used with "\
"suid_dumpable=2. Pipe handler or fully qualified "\
"core dump path required.\n");
}
+#endif
}
static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
@@ -2053,6 +2060,7 @@ static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
return error;
}
+#ifdef CONFIG_COREDUMP
static int proc_dostring_coredump(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -2061,6 +2069,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
validate_coredump_safety();
return error;
}
+#endif
static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
void __user *buffer,
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index d0a32796550f..145bb4d3bd4d 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -27,6 +27,7 @@
#include <linux/cgroup.h>
#include <linux/fs.h>
#include <linux/file.h>
+#include <linux/pid_namespace.h>
#include <net/genetlink.h>
#include <linux/atomic.h>
@@ -174,7 +175,9 @@ static void send_cpu_listeners(struct sk_buff *skb,
up_write(&listeners->sem);
}
-static void fill_stats(struct task_struct *tsk, struct taskstats *stats)
+static void fill_stats(struct user_namespace *user_ns,
+ struct pid_namespace *pid_ns,
+ struct task_struct *tsk, struct taskstats *stats)
{
memset(stats, 0, sizeof(*stats));
/*
@@ -190,7 +193,7 @@ static void fill_stats(struct task_struct *tsk, struct taskstats *stats)
stats->version = TASKSTATS_VERSION;
stats->nvcsw = tsk->nvcsw;
stats->nivcsw = tsk->nivcsw;
- bacct_add_tsk(stats, tsk);
+ bacct_add_tsk(user_ns, pid_ns, stats, tsk);
/* fill in extended acct fields */
xacct_add_tsk(stats, tsk);
@@ -207,7 +210,7 @@ static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
rcu_read_unlock();
if (!tsk)
return -ESRCH;
- fill_stats(tsk, stats);
+ fill_stats(current_user_ns(), task_active_pid_ns(current), tsk, stats);
put_task_struct(tsk);
return 0;
}
@@ -291,6 +294,12 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
if (!cpumask_subset(mask, cpu_possible_mask))
return -EINVAL;
+ if (current_user_ns() != &init_user_ns)
+ return -EINVAL;
+
+ if (task_active_pid_ns(current) != &init_pid_ns)
+ return -EINVAL;
+
if (isadd == REGISTER) {
for_each_cpu(cpu, mask) {
s = kmalloc_node(sizeof(struct listener),
@@ -415,16 +424,15 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
struct nlattr *na;
size_t size;
u32 fd;
- struct file *file;
- int fput_needed;
+ struct fd f;
na = info->attrs[CGROUPSTATS_CMD_ATTR_FD];
if (!na)
return -EINVAL;
fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]);
- file = fget_light(fd, &fput_needed);
- if (!file)
+ f = fdget(fd);
+ if (!f.file)
return 0;
size = nla_total_size(sizeof(struct cgroupstats));
@@ -437,6 +445,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
sizeof(struct cgroupstats));
if (na == NULL) {
+ nlmsg_free(rep_skb);
rc = -EMSGSIZE;
goto err;
}
@@ -444,7 +453,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
stats = nla_data(na);
memset(stats, 0, sizeof(*stats));
- rc = cgroupstats_build(stats, file->f_dentry);
+ rc = cgroupstats_build(stats, f.file->f_dentry);
if (rc < 0) {
nlmsg_free(rep_skb);
goto err;
@@ -453,7 +462,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
rc = send_reply(rep_skb, info);
err:
- fput_light(file, fput_needed);
+ fdput(f);
return rc;
}
@@ -467,7 +476,7 @@ static int cmd_attr_register_cpumask(struct genl_info *info)
rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);
if (rc < 0)
goto out;
- rc = add_del_listener(info->snd_pid, mask, REGISTER);
+ rc = add_del_listener(info->snd_portid, mask, REGISTER);
out:
free_cpumask_var(mask);
return rc;
@@ -483,7 +492,7 @@ static int cmd_attr_deregister_cpumask(struct genl_info *info)
rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);
if (rc < 0)
goto out;
- rc = add_del_listener(info->snd_pid, mask, DEREGISTER);
+ rc = add_del_listener(info->snd_portid, mask, DEREGISTER);
out:
free_cpumask_var(mask);
return rc;
@@ -631,11 +640,12 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
if (rc < 0)
return;
- stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid);
+ stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID,
+ task_pid_nr_ns(tsk, &init_pid_ns));
if (!stats)
goto err;
- fill_stats(tsk, stats);
+ fill_stats(&init_user_ns, &init_pid_ns, tsk, stats);
/*
* Doesn't matter if tsk is the leader or the last group member leaving
@@ -643,7 +653,8 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
if (!is_thread_group || !group_dead)
goto send;
- stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid);
+ stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID,
+ task_tgid_nr_ns(tsk, &init_pid_ns));
if (!stats)
goto err;
diff --git a/kernel/time.c b/kernel/time.c
index ba744cf80696..d226c6a3fd28 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -30,7 +30,7 @@
#include <linux/export.h>
#include <linux/timex.h>
#include <linux/capability.h>
-#include <linux/clocksource.h>
+#include <linux/timekeeper_internal.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/security.h>
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index fd42bd452b75..8601f0db1261 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -16,6 +16,10 @@ config ARCH_CLOCKSOURCE_DATA
config GENERIC_TIME_VSYSCALL
bool
+# Timekeeping vsyscall support
+config GENERIC_TIME_VSYSCALL_OLD
+ bool
+
# ktime_t scalar 64bit nsec representation
config KTIME_SCALAR
bool
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index aa27d391bfc8..f11d83b12949 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -37,7 +37,6 @@
static struct alarm_base {
spinlock_t lock;
struct timerqueue_head timerqueue;
- struct hrtimer timer;
ktime_t (*gettime)(void);
clockid_t base_clockid;
} alarm_bases[ALARM_NUMTYPE];
@@ -46,6 +45,8 @@ static struct alarm_base {
static ktime_t freezer_delta;
static DEFINE_SPINLOCK(freezer_delta_lock);
+static struct wakeup_source *ws;
+
#ifdef CONFIG_RTC_CLASS
/* rtc timer and device for setting alarm wakeups at suspend */
static struct rtc_timer rtctimer;
@@ -130,50 +131,35 @@ static inline void alarmtimer_rtc_timer_init(void) { }
* @base: pointer to the base where the timer is being run
* @alarm: pointer to alarm being enqueued.
*
- * Adds alarm to a alarm_base timerqueue and if necessary sets
- * an hrtimer to run.
+ * Adds alarm to a alarm_base timerqueue
*
* Must hold base->lock when calling.
*/
static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm)
{
+ if (alarm->state & ALARMTIMER_STATE_ENQUEUED)
+ timerqueue_del(&base->timerqueue, &alarm->node);
+
timerqueue_add(&base->timerqueue, &alarm->node);
alarm->state |= ALARMTIMER_STATE_ENQUEUED;
-
- if (&alarm->node == timerqueue_getnext(&base->timerqueue)) {
- hrtimer_try_to_cancel(&base->timer);
- hrtimer_start(&base->timer, alarm->node.expires,
- HRTIMER_MODE_ABS);
- }
}
/**
- * alarmtimer_remove - Removes an alarm timer from an alarm_base timerqueue
+ * alarmtimer_dequeue - Removes an alarm timer from an alarm_base timerqueue
* @base: pointer to the base where the timer is running
* @alarm: pointer to alarm being removed
*
- * Removes alarm to a alarm_base timerqueue and if necessary sets
- * a new timer to run.
+ * Removes alarm to a alarm_base timerqueue
*
* Must hold base->lock when calling.
*/
-static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm)
+static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm)
{
- struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue);
-
if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED))
return;
timerqueue_del(&base->timerqueue, &alarm->node);
alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
-
- if (next == &alarm->node) {
- hrtimer_try_to_cancel(&base->timer);
- next = timerqueue_getnext(&base->timerqueue);
- if (!next)
- return;
- hrtimer_start(&base->timer, next->expires, HRTIMER_MODE_ABS);
- }
}
@@ -188,42 +174,23 @@ static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm)
*/
static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
{
- struct alarm_base *base = container_of(timer, struct alarm_base, timer);
- struct timerqueue_node *next;
+ struct alarm *alarm = container_of(timer, struct alarm, timer);
+ struct alarm_base *base = &alarm_bases[alarm->type];
unsigned long flags;
- ktime_t now;
int ret = HRTIMER_NORESTART;
int restart = ALARMTIMER_NORESTART;
spin_lock_irqsave(&base->lock, flags);
- now = base->gettime();
- while ((next = timerqueue_getnext(&base->timerqueue))) {
- struct alarm *alarm;
- ktime_t expired = next->expires;
-
- if (expired.tv64 > now.tv64)
- break;
-
- alarm = container_of(next, struct alarm, node);
-
- timerqueue_del(&base->timerqueue, &alarm->node);
- alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
-
- alarm->state |= ALARMTIMER_STATE_CALLBACK;
- spin_unlock_irqrestore(&base->lock, flags);
- if (alarm->function)
- restart = alarm->function(alarm, now);
- spin_lock_irqsave(&base->lock, flags);
- alarm->state &= ~ALARMTIMER_STATE_CALLBACK;
+ alarmtimer_dequeue(base, alarm);
+ spin_unlock_irqrestore(&base->lock, flags);
- if (restart != ALARMTIMER_NORESTART) {
- timerqueue_add(&base->timerqueue, &alarm->node);
- alarm->state |= ALARMTIMER_STATE_ENQUEUED;
- }
- }
+ if (alarm->function)
+ restart = alarm->function(alarm, base->gettime());
- if (next) {
- hrtimer_set_expires(&base->timer, next->expires);
+ spin_lock_irqsave(&base->lock, flags);
+ if (restart != ALARMTIMER_NORESTART) {
+ hrtimer_set_expires(&alarm->timer, alarm->node.expires);
+ alarmtimer_enqueue(base, alarm);
ret = HRTIMER_RESTART;
}
spin_unlock_irqrestore(&base->lock, flags);
@@ -250,6 +217,7 @@ static int alarmtimer_suspend(struct device *dev)
unsigned long flags;
struct rtc_device *rtc;
int i;
+ int ret;
spin_lock_irqsave(&freezer_delta_lock, flags);
min = freezer_delta;
@@ -279,8 +247,10 @@ static int alarmtimer_suspend(struct device *dev)
if (min.tv64 == 0)
return 0;
- /* XXX - Should we enforce a minimum sleep time? */
- WARN_ON(min.tv64 < NSEC_PER_SEC);
+ if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) {
+ __pm_wakeup_event(ws, 2 * MSEC_PER_SEC);
+ return -EBUSY;
+ }
/* Setup an rtc timer to fire that far in the future */
rtc_timer_cancel(rtc, &rtctimer);
@@ -288,9 +258,11 @@ static int alarmtimer_suspend(struct device *dev)
now = rtc_tm_to_ktime(tm);
now = ktime_add(now, min);
- rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0));
-
- return 0;
+ /* Set alarm, if in the past reject suspend briefly to handle */
+ ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0));
+ if (ret < 0)
+ __pm_wakeup_event(ws, MSEC_PER_SEC);
+ return ret;
}
#else
static int alarmtimer_suspend(struct device *dev)
@@ -324,6 +296,9 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
enum alarmtimer_restart (*function)(struct alarm *, ktime_t))
{
timerqueue_init(&alarm->node);
+ hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid,
+ HRTIMER_MODE_ABS);
+ alarm->timer.function = alarmtimer_fired;
alarm->function = function;
alarm->type = type;
alarm->state = ALARMTIMER_STATE_INACTIVE;
@@ -334,17 +309,19 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
* @alarm: ptr to alarm to set
* @start: time to run the alarm
*/
-void alarm_start(struct alarm *alarm, ktime_t start)
+int alarm_start(struct alarm *alarm, ktime_t start)
{
struct alarm_base *base = &alarm_bases[alarm->type];
unsigned long flags;
+ int ret;
spin_lock_irqsave(&base->lock, flags);
- if (alarmtimer_active(alarm))
- alarmtimer_remove(base, alarm);
alarm->node.expires = start;
alarmtimer_enqueue(base, alarm);
+ ret = hrtimer_start(&alarm->timer, alarm->node.expires,
+ HRTIMER_MODE_ABS);
spin_unlock_irqrestore(&base->lock, flags);
+ return ret;
}
/**
@@ -358,18 +335,12 @@ int alarm_try_to_cancel(struct alarm *alarm)
{
struct alarm_base *base = &alarm_bases[alarm->type];
unsigned long flags;
- int ret = -1;
- spin_lock_irqsave(&base->lock, flags);
-
- if (alarmtimer_callback_running(alarm))
- goto out;
+ int ret;
- if (alarmtimer_is_queued(alarm)) {
- alarmtimer_remove(base, alarm);
- ret = 1;
- } else
- ret = 0;
-out:
+ spin_lock_irqsave(&base->lock, flags);
+ ret = hrtimer_try_to_cancel(&alarm->timer);
+ if (ret >= 0)
+ alarmtimer_dequeue(base, alarm);
spin_unlock_irqrestore(&base->lock, flags);
return ret;
}
@@ -802,10 +773,6 @@ static int __init alarmtimer_init(void)
for (i = 0; i < ALARM_NUMTYPE; i++) {
timerqueue_init_head(&alarm_bases[i].timerqueue);
spin_lock_init(&alarm_bases[i].lock);
- hrtimer_init(&alarm_bases[i].timer,
- alarm_bases[i].base_clockid,
- HRTIMER_MODE_ABS);
- alarm_bases[i].timer.function = alarmtimer_fired;
}
error = alarmtimer_rtc_interface_setup();
@@ -821,6 +788,7 @@ static int __init alarmtimer_init(void)
error = PTR_ERR(pdev);
goto out_drv;
}
+ ws = wakeup_source_register("alarmtimer");
return 0;
out_drv:
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 7e1ce012a851..30b6de0d977c 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -397,6 +397,30 @@ void clockevents_exchange_device(struct clock_event_device *old,
local_irq_restore(flags);
}
+/**
+ * clockevents_suspend - suspend clock devices
+ */
+void clockevents_suspend(void)
+{
+ struct clock_event_device *dev;
+
+ list_for_each_entry_reverse(dev, &clockevent_devices, list)
+ if (dev->suspend)
+ dev->suspend(dev);
+}
+
+/**
+ * clockevents_resume - resume clock devices
+ */
+void clockevents_resume(void)
+{
+ struct clock_event_device *dev;
+
+ list_for_each_entry(dev, &clockevent_devices, list)
+ if (dev->resume)
+ dev->resume(dev);
+}
+
#ifdef CONFIG_GENERIC_CLOCKEVENTS
/**
* clockevents_notify - notification about relevant events
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 46da0537c10b..6629bf7b5285 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -37,7 +37,7 @@
* requested HZ value. It is also not recommended
* for "tick-less" systems.
*/
-#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/SHIFTED_HZ))
+#define NSEC_PER_JIFFY ((NSEC_PER_SEC+HZ/2)/HZ)
/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier
* conversion, the .shift value could be zero. However
@@ -95,3 +95,33 @@ struct clocksource * __init __weak clocksource_default_clock(void)
{
return &clocksource_jiffies;
}
+
+struct clocksource refined_jiffies;
+
+int register_refined_jiffies(long cycles_per_second)
+{
+ u64 nsec_per_tick, shift_hz;
+ long cycles_per_tick;
+
+
+
+ refined_jiffies = clocksource_jiffies;
+ refined_jiffies.name = "refined-jiffies";
+ refined_jiffies.rating++;
+
+ /* Calc cycles per tick */
+ cycles_per_tick = (cycles_per_second + HZ/2)/HZ;
+ /* shift_hz stores hz<<8 for extra accuracy */
+ shift_hz = (u64)cycles_per_second << 8;
+ shift_hz += cycles_per_tick/2;
+ do_div(shift_hz, cycles_per_tick);
+ /* Calculate nsec_per_tick using shift_hz */
+ nsec_per_tick = (u64)NSEC_PER_SEC << 8;
+ nsec_per_tick += (u32)shift_hz/2;
+ do_div(nsec_per_tick, (u32)shift_hz);
+
+ refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT;
+
+ clocksource_register(&refined_jiffies);
+ return 0;
+}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f423bdd035c2..a40260885265 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -835,7 +835,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
*/
if (ts->tick_stopped) {
touch_softlockup_watchdog();
- if (idle_cpu(cpu))
+ if (is_idle_task(current))
ts->idle_jiffies++;
}
update_process_times(user_mode(regs));
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index d3b91e75cecd..e424970bb562 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -8,6 +8,7 @@
*
*/
+#include <linux/timekeeper_internal.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
@@ -21,61 +22,6 @@
#include <linux/tick.h>
#include <linux/stop_machine.h>
-/* Structure holding internal timekeeping values. */
-struct timekeeper {
- /* Current clocksource used for timekeeping. */
- struct clocksource *clock;
- /* NTP adjusted clock multiplier */
- u32 mult;
- /* The shift value of the current clocksource. */
- u32 shift;
- /* Number of clock cycles in one NTP interval. */
- cycle_t cycle_interval;
- /* Number of clock shifted nano seconds in one NTP interval. */
- u64 xtime_interval;
- /* shifted nano seconds left over when rounding cycle_interval */
- s64 xtime_remainder;
- /* Raw nano seconds accumulated per NTP interval. */
- u32 raw_interval;
-
- /* Current CLOCK_REALTIME time in seconds */
- u64 xtime_sec;
- /* Clock shifted nano seconds */
- u64 xtime_nsec;
-
- /* Difference between accumulated time and NTP time in ntp
- * shifted nano seconds. */
- s64 ntp_error;
- /* Shift conversion between clock shifted nano seconds and
- * ntp shifted nano seconds. */
- u32 ntp_error_shift;
-
- /*
- * wall_to_monotonic is what we need to add to xtime (or xtime corrected
- * for sub jiffie times) to get to monotonic time. Monotonic is pegged
- * at zero at system boot time, so wall_to_monotonic will be negative,
- * however, we will ALWAYS keep the tv_nsec part positive so we can use
- * the usual normalization.
- *
- * wall_to_monotonic is moved after resume from suspend for the
- * monotonic time not to jump. We need to add total_sleep_time to
- * wall_to_monotonic to get the real boot based time offset.
- *
- * - wall_to_monotonic is no longer the boot time, getboottime must be
- * used instead.
- */
- struct timespec wall_to_monotonic;
- /* Offset clock monotonic -> clock realtime */
- ktime_t offs_real;
- /* time spent in suspend */
- struct timespec total_sleep_time;
- /* Offset clock monotonic -> clock boottime */
- ktime_t offs_boot;
- /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */
- struct timespec raw_time;
- /* Seqlock for all timekeeper values */
- seqlock_t lock;
-};
static struct timekeeper timekeeper;
@@ -96,15 +42,6 @@ static inline void tk_normalize_xtime(struct timekeeper *tk)
}
}
-static struct timespec tk_xtime(struct timekeeper *tk)
-{
- struct timespec ts;
-
- ts.tv_sec = tk->xtime_sec;
- ts.tv_nsec = (long)(tk->xtime_nsec >> tk->shift);
- return ts;
-}
-
static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts)
{
tk->xtime_sec = ts->tv_sec;
@@ -246,14 +183,11 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
/* must hold write on timekeeper.lock */
static void timekeeping_update(struct timekeeper *tk, bool clearntp)
{
- struct timespec xt;
-
if (clearntp) {
tk->ntp_error = 0;
ntp_clear();
}
- xt = tk_xtime(tk);
- update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult);
+ update_vsyscall(tk);
}
/**
@@ -776,6 +710,7 @@ static void timekeeping_resume(void)
read_persistent_clock(&ts);
+ clockevents_resume();
clocksource_resume();
write_seqlock_irqsave(&tk->lock, flags);
@@ -835,6 +770,7 @@ static int timekeeping_suspend(void)
clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
clocksource_suspend();
+ clockevents_suspend();
return 0;
}
@@ -1111,7 +1047,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
accumulate_nsecs_to_secs(tk);
/* Accumulate raw time */
- raw_nsecs = tk->raw_interval << shift;
+ raw_nsecs = (u64)tk->raw_interval << shift;
raw_nsecs += tk->raw_time.tv_nsec;
if (raw_nsecs >= NSEC_PER_SEC) {
u64 raw_secs = raw_nsecs;
@@ -1128,6 +1064,33 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
return offset;
}
+#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
+static inline void old_vsyscall_fixup(struct timekeeper *tk)
+{
+ s64 remainder;
+
+ /*
+ * Store only full nanoseconds into xtime_nsec after rounding
+ * it up and add the remainder to the error difference.
+ * XXX - This is necessary to avoid small 1ns inconsistnecies caused
+ * by truncating the remainder in vsyscalls. However, it causes
+ * additional work to be done in timekeeping_adjust(). Once
+ * the vsyscall implementations are converted to use xtime_nsec
+ * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
+ * users are removed, this can be killed.
+ */
+ remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1);
+ tk->xtime_nsec -= remainder;
+ tk->xtime_nsec += 1ULL << tk->shift;
+ tk->ntp_error += remainder << tk->ntp_error_shift;
+
+}
+#else
+#define old_vsyscall_fixup(tk)
+#endif
+
+
+
/**
* update_wall_time - Uses the current clocksource to increment the wall time
*
@@ -1139,7 +1102,6 @@ static void update_wall_time(void)
cycle_t offset;
int shift = 0, maxshift;
unsigned long flags;
- s64 remainder;
write_seqlock_irqsave(&tk->lock, flags);
@@ -1181,20 +1143,11 @@ static void update_wall_time(void)
/* correct the clock when NTP error is too big */
timekeeping_adjust(tk, offset);
-
/*
- * Store only full nanoseconds into xtime_nsec after rounding
- * it up and add the remainder to the error difference.
- * XXX - This is necessary to avoid small 1ns inconsistnecies caused
- * by truncating the remainder in vsyscalls. However, it causes
- * additional work to be done in timekeeping_adjust(). Once
- * the vsyscall implementations are converted to use xtime_nsec
- * (shifted nanoseconds), this can be killed.
- */
- remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1);
- tk->xtime_nsec -= remainder;
- tk->xtime_nsec += 1ULL << tk->shift;
- tk->ntp_error += remainder << tk->ntp_error_shift;
+ * XXX This can be killed once everyone converts
+ * to the new update_vsyscall.
+ */
+ old_vsyscall_fixup(tk);
/*
* Finally, make sure that after the rounding
diff --git a/kernel/timer.c b/kernel/timer.c
index d5de1b2292aa..367d00858482 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -63,6 +63,7 @@ EXPORT_SYMBOL(jiffies_64);
#define TVR_SIZE (1 << TVR_BITS)
#define TVN_MASK (TVN_SIZE - 1)
#define TVR_MASK (TVR_SIZE - 1)
+#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1))
struct tvec {
struct list_head vec[TVN_SIZE];
@@ -359,11 +360,12 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer)
vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
} else {
int i;
- /* If the timeout is larger than 0xffffffff on 64-bit
- * architectures then we use the maximum timeout:
+ /* If the timeout is larger than MAX_TVAL (on 64-bit
+ * architectures or with CONFIG_BASE_SMALL=1) then we
+ * use the maximum timeout.
*/
- if (idx > 0xffffffffUL) {
- idx = 0xffffffffUL;
+ if (idx > MAX_TVAL) {
+ idx = MAX_TVAL;
expires = idx + base->timer_jiffies;
}
i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1ec5c1dab629..31e4f55773f1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2061,7 +2061,8 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
seq_puts(m, "# -----------------\n");
seq_printf(m, "# | task: %.16s-%d "
"(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n",
- data->comm, data->pid, data->uid, data->nice,
+ data->comm, data->pid,
+ from_kuid_munged(seq_user_ns(m), data->uid), data->nice,
data->policy, data->rt_priority);
seq_puts(m, "# -----------------\n");
@@ -4199,12 +4200,6 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
buf->private = 0;
}
-static int buffer_pipe_buf_steal(struct pipe_inode_info *pipe,
- struct pipe_buffer *buf)
-{
- return 1;
-}
-
static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
@@ -4220,7 +4215,7 @@ static const struct pipe_buf_operations buffer_pipe_buf_ops = {
.unmap = generic_pipe_buf_unmap,
.confirm = generic_pipe_buf_confirm,
.release = buffer_pipe_buf_release,
- .steal = buffer_pipe_buf_steal,
+ .steal = generic_pipe_buf_steal,
.get = buffer_pipe_buf_get,
};
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 63a2da0b9a6e..c15f528c1af4 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -147,7 +147,7 @@ struct trace_array_cpu {
unsigned long skipped_entries;
cycle_t preempt_timestamp;
pid_t pid;
- uid_t uid;
+ kuid_t uid;
char comm[TASK_COMM_LEN];
};
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 483162a9f908..507a7a9630bf 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -13,7 +13,6 @@
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
-#include <linux/pstore.h>
#include <linux/fs.h>
#include "trace.h"
@@ -76,10 +75,9 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip,
preempt_enable_notrace();
}
-/* Our two options */
+/* Our option */
enum {
TRACE_FUNC_OPT_STACK = 0x1,
- TRACE_FUNC_OPT_PSTORE = 0x2,
};
static struct tracer_flags func_flags;
@@ -109,12 +107,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1)) {
- /*
- * So far tracing doesn't support multiple buffers, so
- * we make an explicit call for now.
- */
- if (unlikely(func_flags.val & TRACE_FUNC_OPT_PSTORE))
- pstore_ftrace_call(ip, parent_ip);
pc = preempt_count();
trace_function(tr, ip, parent_ip, flags, pc);
}
@@ -181,9 +173,6 @@ static struct tracer_opt func_opts[] = {
#ifdef CONFIG_STACKTRACE
{ TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
#endif
-#ifdef CONFIG_PSTORE_FTRACE
- { TRACER_OPT(func_pstore, TRACE_FUNC_OPT_PSTORE) },
-#endif
{ } /* Always set a last empty entry */
};
@@ -236,8 +225,6 @@ static int func_set_flag(u32 old_flags, u32 bit, int set)
}
break;
- case TRACE_FUNC_OPT_PSTORE:
- break;
default:
return -EINVAL;
}
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 23b4d784ebdd..625df0b44690 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -26,7 +26,9 @@
/*
* fill in basic accounting fields
*/
-void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
+void bacct_add_tsk(struct user_namespace *user_ns,
+ struct pid_namespace *pid_ns,
+ struct taskstats *stats, struct task_struct *tsk)
{
const struct cred *tcred;
struct timespec uptime, ts;
@@ -55,13 +57,13 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
stats->ac_flag |= AXSIG;
stats->ac_nice = task_nice(tsk);
stats->ac_sched = tsk->policy;
- stats->ac_pid = tsk->pid;
+ stats->ac_pid = task_pid_nr_ns(tsk, pid_ns);
rcu_read_lock();
tcred = __task_cred(tsk);
- stats->ac_uid = tcred->uid;
- stats->ac_gid = tcred->gid;
+ stats->ac_uid = from_kuid_munged(user_ns, tcred->uid);
+ stats->ac_gid = from_kgid_munged(user_ns, tcred->gid);
stats->ac_ppid = pid_alive(tsk) ?
- rcu_dereference(tsk->real_parent)->tgid : 0;
+ task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0;
rcu_read_unlock();
stats->ac_utime = cputime_to_usecs(tsk->utime);
stats->ac_stime = cputime_to_usecs(tsk->stime);
diff --git a/kernel/user.c b/kernel/user.c
index b815fefbe76f..750acffbe9ec 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -38,6 +38,14 @@ struct user_namespace init_user_ns = {
.count = 4294967295U,
},
},
+ .projid_map = {
+ .nr_extents = 1,
+ .extent[0] = {
+ .first = 0,
+ .lower_first = 0,
+ .count = 4294967295U,
+ },
+ },
.kref = {
.refcount = ATOMIC_INIT(3),
},
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 86602316422d..456a6b9fba34 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -19,6 +19,7 @@
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/ctype.h>
+#include <linux/projid.h>
static struct kmem_cache *user_ns_cachep __read_mostly;
@@ -295,6 +296,75 @@ gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid)
}
EXPORT_SYMBOL(from_kgid_munged);
+/**
+ * make_kprojid - Map a user-namespace projid pair into a kprojid.
+ * @ns: User namespace that the projid is in
+ * @projid: Project identifier
+ *
+ * Maps a user-namespace uid pair into a kernel internal kuid,
+ * and returns that kuid.
+ *
+ * When there is no mapping defined for the user-namespace projid
+ * pair INVALID_PROJID is returned. Callers are expected to test
+ * for and handle handle INVALID_PROJID being returned. INVALID_PROJID
+ * may be tested for using projid_valid().
+ */
+kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid)
+{
+ /* Map the uid to a global kernel uid */
+ return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid));
+}
+EXPORT_SYMBOL(make_kprojid);
+
+/**
+ * from_kprojid - Create a projid from a kprojid user-namespace pair.
+ * @targ: The user namespace we want a projid in.
+ * @kprojid: The kernel internal project identifier to start with.
+ *
+ * Map @kprojid into the user-namespace specified by @targ and
+ * return the resulting projid.
+ *
+ * There is always a mapping into the initial user_namespace.
+ *
+ * If @kprojid has no mapping in @targ (projid_t)-1 is returned.
+ */
+projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid)
+{
+ /* Map the uid from a global kernel uid */
+ return map_id_up(&targ->projid_map, __kprojid_val(kprojid));
+}
+EXPORT_SYMBOL(from_kprojid);
+
+/**
+ * from_kprojid_munged - Create a projiid from a kprojid user-namespace pair.
+ * @targ: The user namespace we want a projid in.
+ * @kprojid: The kernel internal projid to start with.
+ *
+ * Map @kprojid into the user-namespace specified by @targ and
+ * return the resulting projid.
+ *
+ * There is always a mapping into the initial user_namespace.
+ *
+ * Unlike from_kprojid from_kprojid_munged never fails and always
+ * returns a valid projid. This makes from_kprojid_munged
+ * appropriate for use in syscalls like stat and where
+ * failing the system call and failing to provide a valid projid are
+ * not an options.
+ *
+ * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned.
+ */
+projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid)
+{
+ projid_t projid;
+ projid = from_kprojid(targ, kprojid);
+
+ if (projid == (projid_t) -1)
+ projid = OVERFLOW_PROJID;
+ return projid;
+}
+EXPORT_SYMBOL(from_kprojid_munged);
+
+
static int uid_m_show(struct seq_file *seq, void *v)
{
struct user_namespace *ns = seq->private;
@@ -337,6 +407,27 @@ static int gid_m_show(struct seq_file *seq, void *v)
return 0;
}
+static int projid_m_show(struct seq_file *seq, void *v)
+{
+ struct user_namespace *ns = seq->private;
+ struct uid_gid_extent *extent = v;
+ struct user_namespace *lower_ns;
+ projid_t lower;
+
+ lower_ns = seq_user_ns(seq);
+ if ((lower_ns == ns) && lower_ns->parent)
+ lower_ns = lower_ns->parent;
+
+ lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first));
+
+ seq_printf(seq, "%10u %10u %10u\n",
+ extent->first,
+ lower,
+ extent->count);
+
+ return 0;
+}
+
static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map)
{
struct uid_gid_extent *extent = NULL;
@@ -362,6 +453,13 @@ static void *gid_m_start(struct seq_file *seq, loff_t *ppos)
return m_start(seq, ppos, &ns->gid_map);
}
+static void *projid_m_start(struct seq_file *seq, loff_t *ppos)
+{
+ struct user_namespace *ns = seq->private;
+
+ return m_start(seq, ppos, &ns->projid_map);
+}
+
static void *m_next(struct seq_file *seq, void *v, loff_t *pos)
{
(*pos)++;
@@ -387,6 +485,13 @@ struct seq_operations proc_gid_seq_operations = {
.show = gid_m_show,
};
+struct seq_operations proc_projid_seq_operations = {
+ .start = projid_m_start,
+ .stop = m_stop,
+ .next = m_next,
+ .show = projid_m_show,
+};
+
static DEFINE_MUTEX(id_map_mutex);
static ssize_t map_write(struct file *file, const char __user *buf,
@@ -434,7 +539,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
/* Require the appropriate privilege CAP_SETUID or CAP_SETGID
* over the user namespace in order to set the id mapping.
*/
- if (!ns_capable(ns, cap_setid))
+ if (cap_valid(cap_setid) && !ns_capable(ns, cap_setid))
goto out;
/* Get a buffer */
@@ -584,9 +689,30 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz
&ns->gid_map, &ns->parent->gid_map);
}
+ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos)
+{
+ struct seq_file *seq = file->private_data;
+ struct user_namespace *ns = seq->private;
+ struct user_namespace *seq_ns = seq_user_ns(seq);
+
+ if (!ns->parent)
+ return -EPERM;
+
+ if ((seq_ns != ns) && (seq_ns != ns->parent))
+ return -EPERM;
+
+ /* Anyone can set any valid project id no capability needed */
+ return map_write(file, buf, size, ppos, -1,
+ &ns->projid_map, &ns->parent->projid_map);
+}
+
static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
struct uid_gid_map *new_map)
{
+ /* Allow anyone to set a mapping that doesn't require privilege */
+ if (!cap_valid(cap_setid))
+ return true;
+
/* Allow the specified ids if we have the appropriate capability
* (CAP_SETUID or CAP_SETGID) over the parent user namespace.
*/
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 3c5a79e2134c..d951daa0ca9a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -58,7 +58,7 @@ enum {
* be executing on any CPU. The gcwq behaves as an unbound one.
*
* Note that DISASSOCIATED can be flipped only while holding
- * managership of all pools on the gcwq to avoid changing binding
+ * assoc_mutex of all pools on the gcwq to avoid changing binding
* state while create_worker() is in progress.
*/
GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */
@@ -73,11 +73,10 @@ enum {
WORKER_DIE = 1 << 1, /* die die die */
WORKER_IDLE = 1 << 2, /* is idle */
WORKER_PREP = 1 << 3, /* preparing to run works */
- WORKER_REBIND = 1 << 5, /* mom is home, come back */
WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */
WORKER_UNBOUND = 1 << 7, /* worker is unbound */
- WORKER_NOT_RUNNING = WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND |
+ WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND |
WORKER_CPU_INTENSIVE,
NR_WORKER_POOLS = 2, /* # worker pools per gcwq */
@@ -126,7 +125,6 @@ enum {
struct global_cwq;
struct worker_pool;
-struct idle_rebind;
/*
* The poor guys doing the actual heavy lifting. All on-duty workers
@@ -150,7 +148,6 @@ struct worker {
int id; /* I: worker id */
/* for rebinding worker to CPU */
- struct idle_rebind *idle_rebind; /* L: for idle worker */
struct work_struct rebind_work; /* L: for busy worker */
};
@@ -160,13 +157,15 @@ struct worker_pool {
struct list_head worklist; /* L: list of pending works */
int nr_workers; /* L: total number of workers */
+
+ /* nr_idle includes the ones off idle_list for rebinding */
int nr_idle; /* L: currently idle ones */
struct list_head idle_list; /* X: list of idle workers */
struct timer_list idle_timer; /* L: worker idle timeout */
struct timer_list mayday_timer; /* L: SOS timer for workers */
- struct mutex manager_mutex; /* mutex manager should hold */
+ struct mutex assoc_mutex; /* protect GCWQ_DISASSOCIATED */
struct ida worker_ida; /* L: for worker IDs */
};
@@ -184,9 +183,8 @@ struct global_cwq {
struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE];
/* L: hash of busy workers */
- struct worker_pool pools[2]; /* normal and highpri pools */
-
- wait_queue_head_t rebind_hold; /* rebind hold wait */
+ struct worker_pool pools[NR_WORKER_POOLS];
+ /* normal and highpri pools */
} ____cacheline_aligned_in_smp;
/*
@@ -269,17 +267,15 @@ struct workqueue_struct {
};
struct workqueue_struct *system_wq __read_mostly;
-struct workqueue_struct *system_long_wq __read_mostly;
-struct workqueue_struct *system_nrt_wq __read_mostly;
-struct workqueue_struct *system_unbound_wq __read_mostly;
-struct workqueue_struct *system_freezable_wq __read_mostly;
-struct workqueue_struct *system_nrt_freezable_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_wq);
+struct workqueue_struct *system_highpri_wq __read_mostly;
+EXPORT_SYMBOL_GPL(system_highpri_wq);
+struct workqueue_struct *system_long_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_long_wq);
-EXPORT_SYMBOL_GPL(system_nrt_wq);
+struct workqueue_struct *system_unbound_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_unbound_wq);
+struct workqueue_struct *system_freezable_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_freezable_wq);
-EXPORT_SYMBOL_GPL(system_nrt_freezable_wq);
#define CREATE_TRACE_POINTS
#include <trace/events/workqueue.h>
@@ -534,18 +530,24 @@ static int work_next_color(int color)
}
/*
- * A work's data points to the cwq with WORK_STRUCT_CWQ set while the
- * work is on queue. Once execution starts, WORK_STRUCT_CWQ is
- * cleared and the work data contains the cpu number it was last on.
+ * While queued, %WORK_STRUCT_CWQ is set and non flag bits of a work's data
+ * contain the pointer to the queued cwq. Once execution starts, the flag
+ * is cleared and the high bits contain OFFQ flags and CPU number.
*
- * set_work_{cwq|cpu}() and clear_work_data() can be used to set the
- * cwq, cpu or clear work->data. These functions should only be
- * called while the work is owned - ie. while the PENDING bit is set.
+ * set_work_cwq(), set_work_cpu_and_clear_pending(), mark_work_canceling()
+ * and clear_work_data() can be used to set the cwq, cpu or clear
+ * work->data. These functions should only be called while the work is
+ * owned - ie. while the PENDING bit is set.
*
- * get_work_[g]cwq() can be used to obtain the gcwq or cwq
- * corresponding to a work. gcwq is available once the work has been
- * queued anywhere after initialization. cwq is available only from
- * queueing until execution starts.
+ * get_work_[g]cwq() can be used to obtain the gcwq or cwq corresponding to
+ * a work. gcwq is available once the work has been queued anywhere after
+ * initialization until it is sync canceled. cwq is available only while
+ * the work item is queued.
+ *
+ * %WORK_OFFQ_CANCELING is used to mark a work item which is being
+ * canceled. While being canceled, a work item may have its PENDING set
+ * but stay off timer and worklist for arbitrarily long and nobody should
+ * try to steal the PENDING bit.
*/
static inline void set_work_data(struct work_struct *work, unsigned long data,
unsigned long flags)
@@ -562,13 +564,22 @@ static void set_work_cwq(struct work_struct *work,
WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags);
}
-static void set_work_cpu(struct work_struct *work, unsigned int cpu)
+static void set_work_cpu_and_clear_pending(struct work_struct *work,
+ unsigned int cpu)
{
- set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING);
+ /*
+ * The following wmb is paired with the implied mb in
+ * test_and_set_bit(PENDING) and ensures all updates to @work made
+ * here are visible to and precede any updates by the next PENDING
+ * owner.
+ */
+ smp_wmb();
+ set_work_data(work, (unsigned long)cpu << WORK_OFFQ_CPU_SHIFT, 0);
}
static void clear_work_data(struct work_struct *work)
{
+ smp_wmb(); /* see set_work_cpu_and_clear_pending() */
set_work_data(work, WORK_STRUCT_NO_CPU, 0);
}
@@ -591,7 +602,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
return ((struct cpu_workqueue_struct *)
(data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq;
- cpu = data >> WORK_STRUCT_FLAG_BITS;
+ cpu = data >> WORK_OFFQ_CPU_SHIFT;
if (cpu == WORK_CPU_NONE)
return NULL;
@@ -599,6 +610,22 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
return get_gcwq(cpu);
}
+static void mark_work_canceling(struct work_struct *work)
+{
+ struct global_cwq *gcwq = get_work_gcwq(work);
+ unsigned long cpu = gcwq ? gcwq->cpu : WORK_CPU_NONE;
+
+ set_work_data(work, (cpu << WORK_OFFQ_CPU_SHIFT) | WORK_OFFQ_CANCELING,
+ WORK_STRUCT_PENDING);
+}
+
+static bool work_is_canceling(struct work_struct *work)
+{
+ unsigned long data = atomic_long_read(&work->data);
+
+ return !(data & WORK_STRUCT_CWQ) && (data & WORK_OFFQ_CANCELING);
+}
+
/*
* Policy functions. These define the policies on how the global worker
* pools are managed. Unless noted otherwise, these functions assume that
@@ -657,6 +684,13 @@ static bool too_many_workers(struct worker_pool *pool)
int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
int nr_busy = pool->nr_workers - nr_idle;
+ /*
+ * nr_idle and idle_list may disagree if idle rebinding is in
+ * progress. Never return %true if idle_list is empty.
+ */
+ if (list_empty(&pool->idle_list))
+ return false;
+
return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
}
@@ -903,6 +937,206 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
}
/**
+ * move_linked_works - move linked works to a list
+ * @work: start of series of works to be scheduled
+ * @head: target list to append @work to
+ * @nextp: out paramter for nested worklist walking
+ *
+ * Schedule linked works starting from @work to @head. Work series to
+ * be scheduled starts at @work and includes any consecutive work with
+ * WORK_STRUCT_LINKED set in its predecessor.
+ *
+ * If @nextp is not NULL, it's updated to point to the next work of
+ * the last scheduled work. This allows move_linked_works() to be
+ * nested inside outer list_for_each_entry_safe().
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
+ */
+static void move_linked_works(struct work_struct *work, struct list_head *head,
+ struct work_struct **nextp)
+{
+ struct work_struct *n;
+
+ /*
+ * Linked worklist will always end before the end of the list,
+ * use NULL for list head.
+ */
+ list_for_each_entry_safe_from(work, n, NULL, entry) {
+ list_move_tail(&work->entry, head);
+ if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
+ break;
+ }
+
+ /*
+ * If we're already inside safe list traversal and have moved
+ * multiple works to the scheduled queue, the next position
+ * needs to be updated.
+ */
+ if (nextp)
+ *nextp = n;
+}
+
+static void cwq_activate_delayed_work(struct work_struct *work)
+{
+ struct cpu_workqueue_struct *cwq = get_work_cwq(work);
+
+ trace_workqueue_activate_work(work);
+ move_linked_works(work, &cwq->pool->worklist, NULL);
+ __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
+ cwq->nr_active++;
+}
+
+static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
+{
+ struct work_struct *work = list_first_entry(&cwq->delayed_works,
+ struct work_struct, entry);
+
+ cwq_activate_delayed_work(work);
+}
+
+/**
+ * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
+ * @cwq: cwq of interest
+ * @color: color of work which left the queue
+ *
+ * A work either has completed or is removed from pending queue,
+ * decrement nr_in_flight of its cwq and handle workqueue flushing.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
+ */
+static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color)
+{
+ /* ignore uncolored works */
+ if (color == WORK_NO_COLOR)
+ return;
+
+ cwq->nr_in_flight[color]--;
+
+ cwq->nr_active--;
+ if (!list_empty(&cwq->delayed_works)) {
+ /* one down, submit a delayed one */
+ if (cwq->nr_active < cwq->max_active)
+ cwq_activate_first_delayed(cwq);
+ }
+
+ /* is flush in progress and are we at the flushing tip? */
+ if (likely(cwq->flush_color != color))
+ return;
+
+ /* are there still in-flight works? */
+ if (cwq->nr_in_flight[color])
+ return;
+
+ /* this cwq is done, clear flush_color */
+ cwq->flush_color = -1;
+
+ /*
+ * If this was the last cwq, wake up the first flusher. It
+ * will handle the rest.
+ */
+ if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
+ complete(&cwq->wq->first_flusher->done);
+}
+
+/**
+ * try_to_grab_pending - steal work item from worklist and disable irq
+ * @work: work item to steal
+ * @is_dwork: @work is a delayed_work
+ * @flags: place to store irq state
+ *
+ * Try to grab PENDING bit of @work. This function can handle @work in any
+ * stable state - idle, on timer or on worklist. Return values are
+ *
+ * 1 if @work was pending and we successfully stole PENDING
+ * 0 if @work was idle and we claimed PENDING
+ * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry
+ * -ENOENT if someone else is canceling @work, this state may persist
+ * for arbitrarily long
+ *
+ * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting
+ * interrupted while holding PENDING and @work off queue, irq must be
+ * disabled on entry. This, combined with delayed_work->timer being
+ * irqsafe, ensures that we return -EAGAIN for finite short period of time.
+ *
+ * On successful return, >= 0, irq is disabled and the caller is
+ * responsible for releasing it using local_irq_restore(*@flags).
+ *
+ * This function is safe to call from any context including IRQ handler.
+ */
+static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
+ unsigned long *flags)
+{
+ struct global_cwq *gcwq;
+
+ local_irq_save(*flags);
+
+ /* try to steal the timer if it exists */
+ if (is_dwork) {
+ struct delayed_work *dwork = to_delayed_work(work);
+
+ /*
+ * dwork->timer is irqsafe. If del_timer() fails, it's
+ * guaranteed that the timer is not queued anywhere and not
+ * running on the local CPU.
+ */
+ if (likely(del_timer(&dwork->timer)))
+ return 1;
+ }
+
+ /* try to claim PENDING the normal way */
+ if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
+ return 0;
+
+ /*
+ * The queueing is in progress, or it is already queued. Try to
+ * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
+ */
+ gcwq = get_work_gcwq(work);
+ if (!gcwq)
+ goto fail;
+
+ spin_lock(&gcwq->lock);
+ if (!list_empty(&work->entry)) {
+ /*
+ * This work is queued, but perhaps we locked the wrong gcwq.
+ * In that case we must see the new value after rmb(), see
+ * insert_work()->wmb().
+ */
+ smp_rmb();
+ if (gcwq == get_work_gcwq(work)) {
+ debug_work_deactivate(work);
+
+ /*
+ * A delayed work item cannot be grabbed directly
+ * because it might have linked NO_COLOR work items
+ * which, if left on the delayed_list, will confuse
+ * cwq->nr_active management later on and cause
+ * stall. Make sure the work item is activated
+ * before grabbing.
+ */
+ if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
+ cwq_activate_delayed_work(work);
+
+ list_del_init(&work->entry);
+ cwq_dec_nr_in_flight(get_work_cwq(work),
+ get_work_color(work));
+
+ spin_unlock(&gcwq->lock);
+ return 1;
+ }
+ }
+ spin_unlock(&gcwq->lock);
+fail:
+ local_irq_restore(*flags);
+ if (work_is_canceling(work))
+ return -ENOENT;
+ cpu_relax();
+ return -EAGAIN;
+}
+
+/**
* insert_work - insert a work into gcwq
* @cwq: cwq @work belongs to
* @work: work to insert
@@ -982,7 +1216,15 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
struct cpu_workqueue_struct *cwq;
struct list_head *worklist;
unsigned int work_flags;
- unsigned long flags;
+ unsigned int req_cpu = cpu;
+
+ /*
+ * While a work item is PENDING && off queue, a task trying to
+ * steal the PENDING will busy-loop waiting for it to either get
+ * queued or lose PENDING. Grabbing PENDING and queueing should
+ * happen with IRQ disabled.
+ */
+ WARN_ON_ONCE(!irqs_disabled());
debug_work_activate(work);
@@ -995,21 +1237,22 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
if (!(wq->flags & WQ_UNBOUND)) {
struct global_cwq *last_gcwq;
- if (unlikely(cpu == WORK_CPU_UNBOUND))
+ if (cpu == WORK_CPU_UNBOUND)
cpu = raw_smp_processor_id();
/*
- * It's multi cpu. If @wq is non-reentrant and @work
- * was previously on a different cpu, it might still
- * be running there, in which case the work needs to
- * be queued on that cpu to guarantee non-reentrance.
+ * It's multi cpu. If @work was previously on a different
+ * cpu, it might still be running there, in which case the
+ * work needs to be queued on that cpu to guarantee
+ * non-reentrancy.
*/
gcwq = get_gcwq(cpu);
- if (wq->flags & WQ_NON_REENTRANT &&
- (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) {
+ last_gcwq = get_work_gcwq(work);
+
+ if (last_gcwq && last_gcwq != gcwq) {
struct worker *worker;
- spin_lock_irqsave(&last_gcwq->lock, flags);
+ spin_lock(&last_gcwq->lock);
worker = find_worker_executing_work(last_gcwq, work);
@@ -1017,22 +1260,23 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
gcwq = last_gcwq;
else {
/* meh... not running there, queue here */
- spin_unlock_irqrestore(&last_gcwq->lock, flags);
- spin_lock_irqsave(&gcwq->lock, flags);
+ spin_unlock(&last_gcwq->lock);
+ spin_lock(&gcwq->lock);
}
- } else
- spin_lock_irqsave(&gcwq->lock, flags);
+ } else {
+ spin_lock(&gcwq->lock);
+ }
} else {
gcwq = get_gcwq(WORK_CPU_UNBOUND);
- spin_lock_irqsave(&gcwq->lock, flags);
+ spin_lock(&gcwq->lock);
}
/* gcwq determined, get cwq and queue */
cwq = get_cwq(gcwq->cpu, wq);
- trace_workqueue_queue_work(cpu, cwq, work);
+ trace_workqueue_queue_work(req_cpu, cwq, work);
if (WARN_ON(!list_empty(&work->entry))) {
- spin_unlock_irqrestore(&gcwq->lock, flags);
+ spin_unlock(&gcwq->lock);
return;
}
@@ -1050,79 +1294,110 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
insert_work(cwq, work, worklist, work_flags);
- spin_unlock_irqrestore(&gcwq->lock, flags);
+ spin_unlock(&gcwq->lock);
}
/**
- * queue_work - queue work on a workqueue
+ * queue_work_on - queue work on specific cpu
+ * @cpu: CPU number to execute work on
* @wq: workqueue to use
* @work: work to queue
*
- * Returns 0 if @work was already on a queue, non-zero otherwise.
+ * Returns %false if @work was already on a queue, %true otherwise.
*
- * We queue the work to the CPU on which it was submitted, but if the CPU dies
- * it can be processed by another CPU.
+ * We queue the work to a specific CPU, the caller must ensure it
+ * can't go away.
*/
-int queue_work(struct workqueue_struct *wq, struct work_struct *work)
+bool queue_work_on(int cpu, struct workqueue_struct *wq,
+ struct work_struct *work)
{
- int ret;
+ bool ret = false;
+ unsigned long flags;
- ret = queue_work_on(get_cpu(), wq, work);
- put_cpu();
+ local_irq_save(flags);
+
+ if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
+ __queue_work(cpu, wq, work);
+ ret = true;
+ }
+ local_irq_restore(flags);
return ret;
}
-EXPORT_SYMBOL_GPL(queue_work);
+EXPORT_SYMBOL_GPL(queue_work_on);
/**
- * queue_work_on - queue work on specific cpu
- * @cpu: CPU number to execute work on
+ * queue_work - queue work on a workqueue
* @wq: workqueue to use
* @work: work to queue
*
- * Returns 0 if @work was already on a queue, non-zero otherwise.
+ * Returns %false if @work was already on a queue, %true otherwise.
*
- * We queue the work to a specific CPU, the caller must ensure it
- * can't go away.
+ * We queue the work to the CPU on which it was submitted, but if the CPU dies
+ * it can be processed by another CPU.
*/
-int
-queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
+bool queue_work(struct workqueue_struct *wq, struct work_struct *work)
{
- int ret = 0;
-
- if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
- __queue_work(cpu, wq, work);
- ret = 1;
- }
- return ret;
+ return queue_work_on(WORK_CPU_UNBOUND, wq, work);
}
-EXPORT_SYMBOL_GPL(queue_work_on);
+EXPORT_SYMBOL_GPL(queue_work);
-static void delayed_work_timer_fn(unsigned long __data)
+void delayed_work_timer_fn(unsigned long __data)
{
struct delayed_work *dwork = (struct delayed_work *)__data;
struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
- __queue_work(smp_processor_id(), cwq->wq, &dwork->work);
+ /* should have been called from irqsafe timer with irq already off */
+ __queue_work(dwork->cpu, cwq->wq, &dwork->work);
}
+EXPORT_SYMBOL_GPL(delayed_work_timer_fn);
-/**
- * queue_delayed_work - queue work on a workqueue after delay
- * @wq: workqueue to use
- * @dwork: delayable work to queue
- * @delay: number of jiffies to wait before queueing
- *
- * Returns 0 if @work was already on a queue, non-zero otherwise.
- */
-int queue_delayed_work(struct workqueue_struct *wq,
- struct delayed_work *dwork, unsigned long delay)
+static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
+ struct delayed_work *dwork, unsigned long delay)
{
- if (delay == 0)
- return queue_work(wq, &dwork->work);
+ struct timer_list *timer = &dwork->timer;
+ struct work_struct *work = &dwork->work;
+ unsigned int lcpu;
+
+ WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
+ timer->data != (unsigned long)dwork);
+ BUG_ON(timer_pending(timer));
+ BUG_ON(!list_empty(&work->entry));
+
+ timer_stats_timer_set_start_info(&dwork->timer);
+
+ /*
+ * This stores cwq for the moment, for the timer_fn. Note that the
+ * work's gcwq is preserved to allow reentrance detection for
+ * delayed works.
+ */
+ if (!(wq->flags & WQ_UNBOUND)) {
+ struct global_cwq *gcwq = get_work_gcwq(work);
- return queue_delayed_work_on(-1, wq, dwork, delay);
+ /*
+ * If we cannot get the last gcwq from @work directly,
+ * select the last CPU such that it avoids unnecessarily
+ * triggering non-reentrancy check in __queue_work().
+ */
+ lcpu = cpu;
+ if (gcwq)
+ lcpu = gcwq->cpu;
+ if (lcpu == WORK_CPU_UNBOUND)
+ lcpu = raw_smp_processor_id();
+ } else {
+ lcpu = WORK_CPU_UNBOUND;
+ }
+
+ set_work_cwq(work, get_cwq(lcpu, wq), 0);
+
+ dwork->cpu = cpu;
+ timer->expires = jiffies + delay;
+
+ if (unlikely(cpu != WORK_CPU_UNBOUND))
+ add_timer_on(timer, cpu);
+ else
+ add_timer(timer);
}
-EXPORT_SYMBOL_GPL(queue_delayed_work);
/**
* queue_delayed_work_on - queue work on specific CPU after delay
@@ -1131,53 +1406,100 @@ EXPORT_SYMBOL_GPL(queue_delayed_work);
* @dwork: work to queue
* @delay: number of jiffies to wait before queueing
*
- * Returns 0 if @work was already on a queue, non-zero otherwise.
+ * Returns %false if @work was already on a queue, %true otherwise. If
+ * @delay is zero and @dwork is idle, it will be scheduled for immediate
+ * execution.
*/
-int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
- struct delayed_work *dwork, unsigned long delay)
+bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
+ struct delayed_work *dwork, unsigned long delay)
{
- int ret = 0;
- struct timer_list *timer = &dwork->timer;
struct work_struct *work = &dwork->work;
+ bool ret = false;
+ unsigned long flags;
- if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
- unsigned int lcpu;
+ if (!delay)
+ return queue_work_on(cpu, wq, &dwork->work);
- BUG_ON(timer_pending(timer));
- BUG_ON(!list_empty(&work->entry));
+ /* read the comment in __queue_work() */
+ local_irq_save(flags);
- timer_stats_timer_set_start_info(&dwork->timer);
+ if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
+ __queue_delayed_work(cpu, wq, dwork, delay);
+ ret = true;
+ }
- /*
- * This stores cwq for the moment, for the timer_fn.
- * Note that the work's gcwq is preserved to allow
- * reentrance detection for delayed works.
- */
- if (!(wq->flags & WQ_UNBOUND)) {
- struct global_cwq *gcwq = get_work_gcwq(work);
+ local_irq_restore(flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(queue_delayed_work_on);
- if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND)
- lcpu = gcwq->cpu;
- else
- lcpu = raw_smp_processor_id();
- } else
- lcpu = WORK_CPU_UNBOUND;
+/**
+ * queue_delayed_work - queue work on a workqueue after delay
+ * @wq: workqueue to use
+ * @dwork: delayable work to queue
+ * @delay: number of jiffies to wait before queueing
+ *
+ * Equivalent to queue_delayed_work_on() but tries to use the local CPU.
+ */
+bool queue_delayed_work(struct workqueue_struct *wq,
+ struct delayed_work *dwork, unsigned long delay)
+{
+ return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
+}
+EXPORT_SYMBOL_GPL(queue_delayed_work);
- set_work_cwq(work, get_cwq(lcpu, wq), 0);
+/**
+ * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU
+ * @cpu: CPU number to execute work on
+ * @wq: workqueue to use
+ * @dwork: work to queue
+ * @delay: number of jiffies to wait before queueing
+ *
+ * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise,
+ * modify @dwork's timer so that it expires after @delay. If @delay is
+ * zero, @work is guaranteed to be scheduled immediately regardless of its
+ * current state.
+ *
+ * Returns %false if @dwork was idle and queued, %true if @dwork was
+ * pending and its timer was modified.
+ *
+ * This function is safe to call from any context including IRQ handler.
+ * See try_to_grab_pending() for details.
+ */
+bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
+ struct delayed_work *dwork, unsigned long delay)
+{
+ unsigned long flags;
+ int ret;
- timer->expires = jiffies + delay;
- timer->data = (unsigned long)dwork;
- timer->function = delayed_work_timer_fn;
+ do {
+ ret = try_to_grab_pending(&dwork->work, true, &flags);
+ } while (unlikely(ret == -EAGAIN));
- if (unlikely(cpu >= 0))
- add_timer_on(timer, cpu);
- else
- add_timer(timer);
- ret = 1;
+ if (likely(ret >= 0)) {
+ __queue_delayed_work(cpu, wq, dwork, delay);
+ local_irq_restore(flags);
}
+
+ /* -ENOENT from try_to_grab_pending() becomes %true */
return ret;
}
-EXPORT_SYMBOL_GPL(queue_delayed_work_on);
+EXPORT_SYMBOL_GPL(mod_delayed_work_on);
+
+/**
+ * mod_delayed_work - modify delay of or queue a delayed work
+ * @wq: workqueue to use
+ * @dwork: work to queue
+ * @delay: number of jiffies to wait before queueing
+ *
+ * mod_delayed_work_on() on local CPU.
+ */
+bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork,
+ unsigned long delay)
+{
+ return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
+}
+EXPORT_SYMBOL_GPL(mod_delayed_work);
/**
* worker_enter_idle - enter idle state
@@ -1305,37 +1627,21 @@ __acquires(&gcwq->lock)
}
}
-struct idle_rebind {
- int cnt; /* # workers to be rebound */
- struct completion done; /* all workers rebound */
-};
-
/*
- * Rebind an idle @worker to its CPU. During CPU onlining, this has to
- * happen synchronously for idle workers. worker_thread() will test
- * %WORKER_REBIND before leaving idle and call this function.
+ * Rebind an idle @worker to its CPU. worker_thread() will test
+ * list_empty(@worker->entry) before leaving idle and call this function.
*/
static void idle_worker_rebind(struct worker *worker)
{
struct global_cwq *gcwq = worker->pool->gcwq;
- /* CPU must be online at this point */
- WARN_ON(!worker_maybe_bind_and_lock(worker));
- if (!--worker->idle_rebind->cnt)
- complete(&worker->idle_rebind->done);
- spin_unlock_irq(&worker->pool->gcwq->lock);
+ /* CPU may go down again inbetween, clear UNBOUND only on success */
+ if (worker_maybe_bind_and_lock(worker))
+ worker_clr_flags(worker, WORKER_UNBOUND);
- /* we did our part, wait for rebind_workers() to finish up */
- wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND));
-
- /*
- * rebind_workers() shouldn't finish until all workers passed the
- * above WORKER_REBIND wait. Tell it when done.
- */
- spin_lock_irq(&worker->pool->gcwq->lock);
- if (!--worker->idle_rebind->cnt)
- complete(&worker->idle_rebind->done);
- spin_unlock_irq(&worker->pool->gcwq->lock);
+ /* rebind complete, become available again */
+ list_add(&worker->entry, &worker->pool->idle_list);
+ spin_unlock_irq(&gcwq->lock);
}
/*
@@ -1349,16 +1655,8 @@ static void busy_worker_rebind_fn(struct work_struct *work)
struct worker *worker = container_of(work, struct worker, rebind_work);
struct global_cwq *gcwq = worker->pool->gcwq;
- worker_maybe_bind_and_lock(worker);
-
- /*
- * %WORKER_REBIND must be cleared even if the above binding failed;
- * otherwise, we may confuse the next CPU_UP cycle or oops / get
- * stuck by calling idle_worker_rebind() prematurely. If CPU went
- * down again inbetween, %WORKER_UNBOUND would be set, so clearing
- * %WORKER_REBIND is always safe.
- */
- worker_clr_flags(worker, WORKER_REBIND);
+ if (worker_maybe_bind_and_lock(worker))
+ worker_clr_flags(worker, WORKER_UNBOUND);
spin_unlock_irq(&gcwq->lock);
}
@@ -1370,123 +1668,74 @@ static void busy_worker_rebind_fn(struct work_struct *work)
* @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding
* is different for idle and busy ones.
*
- * The idle ones should be rebound synchronously and idle rebinding should
- * be complete before any worker starts executing work items with
- * concurrency management enabled; otherwise, scheduler may oops trying to
- * wake up non-local idle worker from wq_worker_sleeping().
+ * Idle ones will be removed from the idle_list and woken up. They will
+ * add themselves back after completing rebind. This ensures that the
+ * idle_list doesn't contain any unbound workers when re-bound busy workers
+ * try to perform local wake-ups for concurrency management.
*
- * This is achieved by repeatedly requesting rebinding until all idle
- * workers are known to have been rebound under @gcwq->lock and holding all
- * idle workers from becoming busy until idle rebinding is complete.
+ * Busy workers can rebind after they finish their current work items.
+ * Queueing the rebind work item at the head of the scheduled list is
+ * enough. Note that nr_running will be properly bumped as busy workers
+ * rebind.
*
- * Once idle workers are rebound, busy workers can be rebound as they
- * finish executing their current work items. Queueing the rebind work at
- * the head of their scheduled lists is enough. Note that nr_running will
- * be properbly bumped as busy workers rebind.
- *
- * On return, all workers are guaranteed to either be bound or have rebind
- * work item scheduled.
+ * On return, all non-manager workers are scheduled for rebind - see
+ * manage_workers() for the manager special case. Any idle worker
+ * including the manager will not appear on @idle_list until rebind is
+ * complete, making local wake-ups safe.
*/
static void rebind_workers(struct global_cwq *gcwq)
- __releases(&gcwq->lock) __acquires(&gcwq->lock)
{
- struct idle_rebind idle_rebind;
struct worker_pool *pool;
- struct worker *worker;
+ struct worker *worker, *n;
struct hlist_node *pos;
int i;
lockdep_assert_held(&gcwq->lock);
for_each_worker_pool(pool, gcwq)
- lockdep_assert_held(&pool->manager_mutex);
+ lockdep_assert_held(&pool->assoc_mutex);
- /*
- * Rebind idle workers. Interlocked both ways. We wait for
- * workers to rebind via @idle_rebind.done. Workers will wait for
- * us to finish up by watching %WORKER_REBIND.
- */
- init_completion(&idle_rebind.done);
-retry:
- idle_rebind.cnt = 1;
- INIT_COMPLETION(idle_rebind.done);
-
- /* set REBIND and kick idle ones, we'll wait for these later */
+ /* dequeue and kick idle ones */
for_each_worker_pool(pool, gcwq) {
- list_for_each_entry(worker, &pool->idle_list, entry) {
- unsigned long worker_flags = worker->flags;
-
- if (worker->flags & WORKER_REBIND)
- continue;
-
- /* morph UNBOUND to REBIND atomically */
- worker_flags &= ~WORKER_UNBOUND;
- worker_flags |= WORKER_REBIND;
- ACCESS_ONCE(worker->flags) = worker_flags;
-
- idle_rebind.cnt++;
- worker->idle_rebind = &idle_rebind;
+ list_for_each_entry_safe(worker, n, &pool->idle_list, entry) {
+ /*
+ * idle workers should be off @pool->idle_list
+ * until rebind is complete to avoid receiving
+ * premature local wake-ups.
+ */
+ list_del_init(&worker->entry);
- /* worker_thread() will call idle_worker_rebind() */
+ /*
+ * worker_thread() will see the above dequeuing
+ * and call idle_worker_rebind().
+ */
wake_up_process(worker->task);
}
}
- if (--idle_rebind.cnt) {
- spin_unlock_irq(&gcwq->lock);
- wait_for_completion(&idle_rebind.done);
- spin_lock_irq(&gcwq->lock);
- /* busy ones might have become idle while waiting, retry */
- goto retry;
- }
-
- /* all idle workers are rebound, rebind busy workers */
+ /* rebind busy workers */
for_each_busy_worker(worker, i, pos, gcwq) {
struct work_struct *rebind_work = &worker->rebind_work;
- unsigned long worker_flags = worker->flags;
-
- /* morph UNBOUND to REBIND atomically */
- worker_flags &= ~WORKER_UNBOUND;
- worker_flags |= WORKER_REBIND;
- ACCESS_ONCE(worker->flags) = worker_flags;
+ struct workqueue_struct *wq;
if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
work_data_bits(rebind_work)))
continue;
- /* wq doesn't matter, use the default one */
debug_work_activate(rebind_work);
- insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
- worker->scheduled.next,
- work_color_to_flags(WORK_NO_COLOR));
- }
-
- /*
- * All idle workers are rebound and waiting for %WORKER_REBIND to
- * be cleared inside idle_worker_rebind(). Clear and release.
- * Clearing %WORKER_REBIND from this foreign context is safe
- * because these workers are still guaranteed to be idle.
- *
- * We need to make sure all idle workers passed WORKER_REBIND wait
- * in idle_worker_rebind() before returning; otherwise, workers can
- * get stuck at the wait if hotplug cycle repeats.
- */
- idle_rebind.cnt = 1;
- INIT_COMPLETION(idle_rebind.done);
-
- for_each_worker_pool(pool, gcwq) {
- list_for_each_entry(worker, &pool->idle_list, entry) {
- worker->flags &= ~WORKER_REBIND;
- idle_rebind.cnt++;
- }
- }
- wake_up_all(&gcwq->rebind_hold);
+ /*
+ * wq doesn't really matter but let's keep @worker->pool
+ * and @cwq->pool consistent for sanity.
+ */
+ if (worker_pool_pri(worker->pool))
+ wq = system_highpri_wq;
+ else
+ wq = system_wq;
- if (--idle_rebind.cnt) {
- spin_unlock_irq(&gcwq->lock);
- wait_for_completion(&idle_rebind.done);
- spin_lock_irq(&gcwq->lock);
+ insert_work(get_cwq(gcwq->cpu, wq), rebind_work,
+ worker->scheduled.next,
+ work_color_to_flags(WORK_NO_COLOR));
}
}
@@ -1844,22 +2093,22 @@ static bool manage_workers(struct worker *worker)
* grab %POOL_MANAGING_WORKERS to achieve this because that can
* lead to idle worker depletion (all become busy thinking someone
* else is managing) which in turn can result in deadlock under
- * extreme circumstances. Use @pool->manager_mutex to synchronize
+ * extreme circumstances. Use @pool->assoc_mutex to synchronize
* manager against CPU hotplug.
*
- * manager_mutex would always be free unless CPU hotplug is in
+ * assoc_mutex would always be free unless CPU hotplug is in
* progress. trylock first without dropping @gcwq->lock.
*/
- if (unlikely(!mutex_trylock(&pool->manager_mutex))) {
+ if (unlikely(!mutex_trylock(&pool->assoc_mutex))) {
spin_unlock_irq(&pool->gcwq->lock);
- mutex_lock(&pool->manager_mutex);
+ mutex_lock(&pool->assoc_mutex);
/*
* CPU hotplug could have happened while we were waiting
- * for manager_mutex. Hotplug itself can't handle us
+ * for assoc_mutex. Hotplug itself can't handle us
* because manager isn't either on idle or busy list, and
* @gcwq's state and ours could have deviated.
*
- * As hotplug is now excluded via manager_mutex, we can
+ * As hotplug is now excluded via assoc_mutex, we can
* simply try to bind. It will succeed or fail depending
* on @gcwq's current state. Try it and adjust
* %WORKER_UNBOUND accordingly.
@@ -1882,112 +2131,11 @@ static bool manage_workers(struct worker *worker)
ret |= maybe_create_worker(pool);
pool->flags &= ~POOL_MANAGING_WORKERS;
- mutex_unlock(&pool->manager_mutex);
+ mutex_unlock(&pool->assoc_mutex);
return ret;
}
/**
- * move_linked_works - move linked works to a list
- * @work: start of series of works to be scheduled
- * @head: target list to append @work to
- * @nextp: out paramter for nested worklist walking
- *
- * Schedule linked works starting from @work to @head. Work series to
- * be scheduled starts at @work and includes any consecutive work with
- * WORK_STRUCT_LINKED set in its predecessor.
- *
- * If @nextp is not NULL, it's updated to point to the next work of
- * the last scheduled work. This allows move_linked_works() to be
- * nested inside outer list_for_each_entry_safe().
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock).
- */
-static void move_linked_works(struct work_struct *work, struct list_head *head,
- struct work_struct **nextp)
-{
- struct work_struct *n;
-
- /*
- * Linked worklist will always end before the end of the list,
- * use NULL for list head.
- */
- list_for_each_entry_safe_from(work, n, NULL, entry) {
- list_move_tail(&work->entry, head);
- if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
- break;
- }
-
- /*
- * If we're already inside safe list traversal and have moved
- * multiple works to the scheduled queue, the next position
- * needs to be updated.
- */
- if (nextp)
- *nextp = n;
-}
-
-static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
-{
- struct work_struct *work = list_first_entry(&cwq->delayed_works,
- struct work_struct, entry);
-
- trace_workqueue_activate_work(work);
- move_linked_works(work, &cwq->pool->worklist, NULL);
- __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
- cwq->nr_active++;
-}
-
-/**
- * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
- * @cwq: cwq of interest
- * @color: color of work which left the queue
- * @delayed: for a delayed work
- *
- * A work either has completed or is removed from pending queue,
- * decrement nr_in_flight of its cwq and handle workqueue flushing.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock).
- */
-static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color,
- bool delayed)
-{
- /* ignore uncolored works */
- if (color == WORK_NO_COLOR)
- return;
-
- cwq->nr_in_flight[color]--;
-
- if (!delayed) {
- cwq->nr_active--;
- if (!list_empty(&cwq->delayed_works)) {
- /* one down, submit a delayed one */
- if (cwq->nr_active < cwq->max_active)
- cwq_activate_first_delayed(cwq);
- }
- }
-
- /* is flush in progress and are we at the flushing tip? */
- if (likely(cwq->flush_color != color))
- return;
-
- /* are there still in-flight works? */
- if (cwq->nr_in_flight[color])
- return;
-
- /* this cwq is done, clear flush_color */
- cwq->flush_color = -1;
-
- /*
- * If this was the last cwq, wake up the first flusher. It
- * will handle the rest.
- */
- if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
- complete(&cwq->wq->first_flusher->done);
-}
-
-/**
* process_one_work - process single work
* @worker: self
* @work: work to process
@@ -2030,7 +2178,7 @@ __acquires(&gcwq->lock)
* necessary to avoid spurious warnings from rescuers servicing the
* unbound or a disassociated gcwq.
*/
- WARN_ON_ONCE(!(worker->flags & (WORKER_UNBOUND | WORKER_REBIND)) &&
+ WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) &&
!(gcwq->flags & GCWQ_DISASSOCIATED) &&
raw_smp_processor_id() != gcwq->cpu);
@@ -2046,15 +2194,13 @@ __acquires(&gcwq->lock)
return;
}
- /* claim and process */
+ /* claim and dequeue */
debug_work_deactivate(work);
hlist_add_head(&worker->hentry, bwh);
worker->current_work = work;
worker->current_cwq = cwq;
work_color = get_work_color(work);
- /* record the current cpu number in the work data and dequeue */
- set_work_cpu(work, gcwq->cpu);
list_del_init(&work->entry);
/*
@@ -2071,9 +2217,16 @@ __acquires(&gcwq->lock)
if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
wake_up_worker(pool);
+ /*
+ * Record the last CPU and clear PENDING which should be the last
+ * update to @work. Also, do this inside @gcwq->lock so that
+ * PENDING and queued state changes happen together while IRQ is
+ * disabled.
+ */
+ set_work_cpu_and_clear_pending(work, gcwq->cpu);
+
spin_unlock_irq(&gcwq->lock);
- work_clear_pending(work);
lock_map_acquire_read(&cwq->wq->lockdep_map);
lock_map_acquire(&lockdep_map);
trace_workqueue_execute_start(work);
@@ -2087,11 +2240,9 @@ __acquires(&gcwq->lock)
lock_map_release(&cwq->wq->lockdep_map);
if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
- printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
- "%s/0x%08x/%d\n",
- current->comm, preempt_count(), task_pid_nr(current));
- printk(KERN_ERR " last function: ");
- print_symbol("%s\n", (unsigned long)f);
+ pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
+ " last function: %pf\n",
+ current->comm, preempt_count(), task_pid_nr(current), f);
debug_show_held_locks(current);
dump_stack();
}
@@ -2106,7 +2257,7 @@ __acquires(&gcwq->lock)
hlist_del_init(&worker->hentry);
worker->current_work = NULL;
worker->current_cwq = NULL;
- cwq_dec_nr_in_flight(cwq, work_color, false);
+ cwq_dec_nr_in_flight(cwq, work_color);
}
/**
@@ -2151,18 +2302,17 @@ static int worker_thread(void *__worker)
woke_up:
spin_lock_irq(&gcwq->lock);
- /*
- * DIE can be set only while idle and REBIND set while busy has
- * @worker->rebind_work scheduled. Checking here is enough.
- */
- if (unlikely(worker->flags & (WORKER_REBIND | WORKER_DIE))) {
+ /* we are off idle list if destruction or rebind is requested */
+ if (unlikely(list_empty(&worker->entry))) {
spin_unlock_irq(&gcwq->lock);
+ /* if DIE is set, destruction is requested */
if (worker->flags & WORKER_DIE) {
worker->task->flags &= ~PF_WQ_WORKER;
return 0;
}
+ /* otherwise, rebind */
idle_worker_rebind(worker);
goto woke_up;
}
@@ -2645,8 +2795,8 @@ reflush:
if (++flush_cnt == 10 ||
(flush_cnt % 100 == 0 && flush_cnt <= 1000))
- pr_warning("workqueue %s: flush on destruction isn't complete after %u tries\n",
- wq->name, flush_cnt);
+ pr_warn("workqueue %s: flush on destruction isn't complete after %u tries\n",
+ wq->name, flush_cnt);
goto reflush;
}
@@ -2657,8 +2807,7 @@ reflush:
}
EXPORT_SYMBOL_GPL(drain_workqueue);
-static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
- bool wait_executing)
+static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
{
struct worker *worker = NULL;
struct global_cwq *gcwq;
@@ -2680,13 +2829,12 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
cwq = get_work_cwq(work);
if (unlikely(!cwq || gcwq != cwq->pool->gcwq))
goto already_gone;
- } else if (wait_executing) {
+ } else {
worker = find_worker_executing_work(gcwq, work);
if (!worker)
goto already_gone;
cwq = worker->current_cwq;
- } else
- goto already_gone;
+ }
insert_wq_barrier(cwq, barr, work, worker);
spin_unlock_irq(&gcwq->lock);
@@ -2713,15 +2861,8 @@ already_gone:
* flush_work - wait for a work to finish executing the last queueing instance
* @work: the work to flush
*
- * Wait until @work has finished execution. This function considers
- * only the last queueing instance of @work. If @work has been
- * enqueued across different CPUs on a non-reentrant workqueue or on
- * multiple workqueues, @work might still be executing on return on
- * some of the CPUs from earlier queueing.
- *
- * If @work was queued only on a non-reentrant, ordered or unbound
- * workqueue, @work is guaranteed to be idle on return if it hasn't
- * been requeued since flush started.
+ * Wait until @work has finished execution. @work is guaranteed to be idle
+ * on return if it hasn't been requeued since flush started.
*
* RETURNS:
* %true if flush_work() waited for the work to finish execution,
@@ -2734,140 +2875,36 @@ bool flush_work(struct work_struct *work)
lock_map_acquire(&work->lockdep_map);
lock_map_release(&work->lockdep_map);
- if (start_flush_work(work, &barr, true)) {
+ if (start_flush_work(work, &barr)) {
wait_for_completion(&barr.done);
destroy_work_on_stack(&barr.work);
return true;
- } else
- return false;
-}
-EXPORT_SYMBOL_GPL(flush_work);
-
-static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work)
-{
- struct wq_barrier barr;
- struct worker *worker;
-
- spin_lock_irq(&gcwq->lock);
-
- worker = find_worker_executing_work(gcwq, work);
- if (unlikely(worker))
- insert_wq_barrier(worker->current_cwq, &barr, work, worker);
-
- spin_unlock_irq(&gcwq->lock);
-
- if (unlikely(worker)) {
- wait_for_completion(&barr.done);
- destroy_work_on_stack(&barr.work);
- return true;
- } else
+ } else {
return false;
-}
-
-static bool wait_on_work(struct work_struct *work)
-{
- bool ret = false;
- int cpu;
-
- might_sleep();
-
- lock_map_acquire(&work->lockdep_map);
- lock_map_release(&work->lockdep_map);
-
- for_each_gcwq_cpu(cpu)
- ret |= wait_on_cpu_work(get_gcwq(cpu), work);
- return ret;
-}
-
-/**
- * flush_work_sync - wait until a work has finished execution
- * @work: the work to flush
- *
- * Wait until @work has finished execution. On return, it's
- * guaranteed that all queueing instances of @work which happened
- * before this function is called are finished. In other words, if
- * @work hasn't been requeued since this function was called, @work is
- * guaranteed to be idle on return.
- *
- * RETURNS:
- * %true if flush_work_sync() waited for the work to finish execution,
- * %false if it was already idle.
- */
-bool flush_work_sync(struct work_struct *work)
-{
- struct wq_barrier barr;
- bool pending, waited;
-
- /* we'll wait for executions separately, queue barr only if pending */
- pending = start_flush_work(work, &barr, false);
-
- /* wait for executions to finish */
- waited = wait_on_work(work);
-
- /* wait for the pending one */
- if (pending) {
- wait_for_completion(&barr.done);
- destroy_work_on_stack(&barr.work);
}
-
- return pending || waited;
-}
-EXPORT_SYMBOL_GPL(flush_work_sync);
-
-/*
- * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
- * so this work can't be re-armed in any way.
- */
-static int try_to_grab_pending(struct work_struct *work)
-{
- struct global_cwq *gcwq;
- int ret = -1;
-
- if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
- return 0;
-
- /*
- * The queueing is in progress, or it is already queued. Try to
- * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
- */
- gcwq = get_work_gcwq(work);
- if (!gcwq)
- return ret;
-
- spin_lock_irq(&gcwq->lock);
- if (!list_empty(&work->entry)) {
- /*
- * This work is queued, but perhaps we locked the wrong gcwq.
- * In that case we must see the new value after rmb(), see
- * insert_work()->wmb().
- */
- smp_rmb();
- if (gcwq == get_work_gcwq(work)) {
- debug_work_deactivate(work);
- list_del_init(&work->entry);
- cwq_dec_nr_in_flight(get_work_cwq(work),
- get_work_color(work),
- *work_data_bits(work) & WORK_STRUCT_DELAYED);
- ret = 1;
- }
- }
- spin_unlock_irq(&gcwq->lock);
-
- return ret;
}
+EXPORT_SYMBOL_GPL(flush_work);
-static bool __cancel_work_timer(struct work_struct *work,
- struct timer_list* timer)
+static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
{
+ unsigned long flags;
int ret;
do {
- ret = (timer && likely(del_timer(timer)));
- if (!ret)
- ret = try_to_grab_pending(work);
- wait_on_work(work);
+ ret = try_to_grab_pending(work, is_dwork, &flags);
+ /*
+ * If someone else is canceling, wait for the same event it
+ * would be waiting for before retrying.
+ */
+ if (unlikely(ret == -ENOENT))
+ flush_work(work);
} while (unlikely(ret < 0));
+ /* tell other tasks trying to grab @work to back off */
+ mark_work_canceling(work);
+ local_irq_restore(flags);
+
+ flush_work(work);
clear_work_data(work);
return ret;
}
@@ -2892,7 +2929,7 @@ static bool __cancel_work_timer(struct work_struct *work,
*/
bool cancel_work_sync(struct work_struct *work)
{
- return __cancel_work_timer(work, NULL);
+ return __cancel_work_timer(work, false);
}
EXPORT_SYMBOL_GPL(cancel_work_sync);
@@ -2910,33 +2947,44 @@ EXPORT_SYMBOL_GPL(cancel_work_sync);
*/
bool flush_delayed_work(struct delayed_work *dwork)
{
+ local_irq_disable();
if (del_timer_sync(&dwork->timer))
- __queue_work(raw_smp_processor_id(),
+ __queue_work(dwork->cpu,
get_work_cwq(&dwork->work)->wq, &dwork->work);
+ local_irq_enable();
return flush_work(&dwork->work);
}
EXPORT_SYMBOL(flush_delayed_work);
/**
- * flush_delayed_work_sync - wait for a dwork to finish
- * @dwork: the delayed work to flush
+ * cancel_delayed_work - cancel a delayed work
+ * @dwork: delayed_work to cancel
*
- * Delayed timer is cancelled and the pending work is queued for
- * execution immediately. Other than timer handling, its behavior
- * is identical to flush_work_sync().
+ * Kill off a pending delayed_work. Returns %true if @dwork was pending
+ * and canceled; %false if wasn't pending. Note that the work callback
+ * function may still be running on return, unless it returns %true and the
+ * work doesn't re-arm itself. Explicitly flush or use
+ * cancel_delayed_work_sync() to wait on it.
*
- * RETURNS:
- * %true if flush_work_sync() waited for the work to finish execution,
- * %false if it was already idle.
+ * This function is safe to call from any context including IRQ handler.
*/
-bool flush_delayed_work_sync(struct delayed_work *dwork)
+bool cancel_delayed_work(struct delayed_work *dwork)
{
- if (del_timer_sync(&dwork->timer))
- __queue_work(raw_smp_processor_id(),
- get_work_cwq(&dwork->work)->wq, &dwork->work);
- return flush_work_sync(&dwork->work);
+ unsigned long flags;
+ int ret;
+
+ do {
+ ret = try_to_grab_pending(&dwork->work, true, &flags);
+ } while (unlikely(ret == -EAGAIN));
+
+ if (unlikely(ret < 0))
+ return false;
+
+ set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work));
+ local_irq_restore(flags);
+ return true;
}
-EXPORT_SYMBOL(flush_delayed_work_sync);
+EXPORT_SYMBOL(cancel_delayed_work);
/**
* cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
@@ -2949,54 +2997,39 @@ EXPORT_SYMBOL(flush_delayed_work_sync);
*/
bool cancel_delayed_work_sync(struct delayed_work *dwork)
{
- return __cancel_work_timer(&dwork->work, &dwork->timer);
+ return __cancel_work_timer(&dwork->work, true);
}
EXPORT_SYMBOL(cancel_delayed_work_sync);
/**
- * schedule_work - put work task in global workqueue
- * @work: job to be done
- *
- * Returns zero if @work was already on the kernel-global workqueue and
- * non-zero otherwise.
- *
- * This puts a job in the kernel-global workqueue if it was not already
- * queued and leaves it in the same position on the kernel-global
- * workqueue otherwise.
- */
-int schedule_work(struct work_struct *work)
-{
- return queue_work(system_wq, work);
-}
-EXPORT_SYMBOL(schedule_work);
-
-/*
* schedule_work_on - put work task on a specific cpu
* @cpu: cpu to put the work task on
* @work: job to be done
*
* This puts a job on a specific cpu
*/
-int schedule_work_on(int cpu, struct work_struct *work)
+bool schedule_work_on(int cpu, struct work_struct *work)
{
return queue_work_on(cpu, system_wq, work);
}
EXPORT_SYMBOL(schedule_work_on);
/**
- * schedule_delayed_work - put work task in global workqueue after delay
- * @dwork: job to be done
- * @delay: number of jiffies to wait or 0 for immediate execution
+ * schedule_work - put work task in global workqueue
+ * @work: job to be done
*
- * After waiting for a given time this puts a job in the kernel-global
- * workqueue.
+ * Returns %false if @work was already on the kernel-global workqueue and
+ * %true otherwise.
+ *
+ * This puts a job in the kernel-global workqueue if it was not already
+ * queued and leaves it in the same position on the kernel-global
+ * workqueue otherwise.
*/
-int schedule_delayed_work(struct delayed_work *dwork,
- unsigned long delay)
+bool schedule_work(struct work_struct *work)
{
- return queue_delayed_work(system_wq, dwork, delay);
+ return queue_work(system_wq, work);
}
-EXPORT_SYMBOL(schedule_delayed_work);
+EXPORT_SYMBOL(schedule_work);
/**
* schedule_delayed_work_on - queue work in global workqueue on CPU after delay
@@ -3007,14 +3040,28 @@ EXPORT_SYMBOL(schedule_delayed_work);
* After waiting for a given time this puts a job in the kernel-global
* workqueue on the specified CPU.
*/
-int schedule_delayed_work_on(int cpu,
- struct delayed_work *dwork, unsigned long delay)
+bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
+ unsigned long delay)
{
return queue_delayed_work_on(cpu, system_wq, dwork, delay);
}
EXPORT_SYMBOL(schedule_delayed_work_on);
/**
+ * schedule_delayed_work - put work task in global workqueue after delay
+ * @dwork: job to be done
+ * @delay: number of jiffies to wait or 0 for immediate execution
+ *
+ * After waiting for a given time this puts a job in the kernel-global
+ * workqueue.
+ */
+bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay)
+{
+ return queue_delayed_work(system_wq, dwork, delay);
+}
+EXPORT_SYMBOL(schedule_delayed_work);
+
+/**
* schedule_on_each_cpu - execute a function synchronously on each online CPU
* @func: the function to call
*
@@ -3161,9 +3208,8 @@ static int wq_clamp_max_active(int max_active, unsigned int flags,
int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;
if (max_active < 1 || max_active > lim)
- printk(KERN_WARNING "workqueue: max_active %d requested for %s "
- "is out of range, clamping between %d and %d\n",
- max_active, name, 1, lim);
+ pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n",
+ max_active, name, 1, lim);
return clamp_val(max_active, 1, lim);
}
@@ -3319,6 +3365,26 @@ void destroy_workqueue(struct workqueue_struct *wq)
EXPORT_SYMBOL_GPL(destroy_workqueue);
/**
+ * cwq_set_max_active - adjust max_active of a cwq
+ * @cwq: target cpu_workqueue_struct
+ * @max_active: new max_active value.
+ *
+ * Set @cwq->max_active to @max_active and activate delayed works if
+ * increased.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
+ */
+static void cwq_set_max_active(struct cpu_workqueue_struct *cwq, int max_active)
+{
+ cwq->max_active = max_active;
+
+ while (!list_empty(&cwq->delayed_works) &&
+ cwq->nr_active < cwq->max_active)
+ cwq_activate_first_delayed(cwq);
+}
+
+/**
* workqueue_set_max_active - adjust max_active of a workqueue
* @wq: target workqueue
* @max_active: new max_active value.
@@ -3345,7 +3411,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
if (!(wq->flags & WQ_FREEZABLE) ||
!(gcwq->flags & GCWQ_FREEZING))
- get_cwq(gcwq->cpu, wq)->max_active = max_active;
+ cwq_set_max_active(get_cwq(gcwq->cpu, wq), max_active);
spin_unlock_irq(&gcwq->lock);
}
@@ -3440,23 +3506,23 @@ EXPORT_SYMBOL_GPL(work_busy);
*/
/* claim manager positions of all pools */
-static void gcwq_claim_management_and_lock(struct global_cwq *gcwq)
+static void gcwq_claim_assoc_and_lock(struct global_cwq *gcwq)
{
struct worker_pool *pool;
for_each_worker_pool(pool, gcwq)
- mutex_lock_nested(&pool->manager_mutex, pool - gcwq->pools);
+ mutex_lock_nested(&pool->assoc_mutex, pool - gcwq->pools);
spin_lock_irq(&gcwq->lock);
}
/* release manager positions */
-static void gcwq_release_management_and_unlock(struct global_cwq *gcwq)
+static void gcwq_release_assoc_and_unlock(struct global_cwq *gcwq)
{
struct worker_pool *pool;
spin_unlock_irq(&gcwq->lock);
for_each_worker_pool(pool, gcwq)
- mutex_unlock(&pool->manager_mutex);
+ mutex_unlock(&pool->assoc_mutex);
}
static void gcwq_unbind_fn(struct work_struct *work)
@@ -3469,7 +3535,7 @@ static void gcwq_unbind_fn(struct work_struct *work)
BUG_ON(gcwq->cpu != smp_processor_id());
- gcwq_claim_management_and_lock(gcwq);
+ gcwq_claim_assoc_and_lock(gcwq);
/*
* We've claimed all manager positions. Make all workers unbound
@@ -3486,7 +3552,7 @@ static void gcwq_unbind_fn(struct work_struct *work)
gcwq->flags |= GCWQ_DISASSOCIATED;
- gcwq_release_management_and_unlock(gcwq);
+ gcwq_release_assoc_and_unlock(gcwq);
/*
* Call schedule() so that we cross rq->lock and thus can guarantee
@@ -3514,7 +3580,7 @@ static void gcwq_unbind_fn(struct work_struct *work)
* Workqueues should be brought up before normal priority CPU notifiers.
* This will be registered high priority CPU notifier.
*/
-static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
+static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
@@ -3542,10 +3608,10 @@ static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
case CPU_DOWN_FAILED:
case CPU_ONLINE:
- gcwq_claim_management_and_lock(gcwq);
+ gcwq_claim_assoc_and_lock(gcwq);
gcwq->flags &= ~GCWQ_DISASSOCIATED;
rebind_workers(gcwq);
- gcwq_release_management_and_unlock(gcwq);
+ gcwq_release_assoc_and_unlock(gcwq);
break;
}
return NOTIFY_OK;
@@ -3555,7 +3621,7 @@ static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
* Workqueues should be brought down after normal priority CPU notifiers.
* This will be registered as low priority CPU notifier.
*/
-static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
+static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
@@ -3566,7 +3632,7 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
case CPU_DOWN_PREPARE:
/* unbinding should happen on the local CPU */
INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn);
- schedule_work_on(cpu, &unbind_work);
+ queue_work_on(cpu, system_highpri_wq, &unbind_work);
flush_work(&unbind_work);
break;
}
@@ -3735,11 +3801,7 @@ void thaw_workqueues(void)
continue;
/* restore max_active and repopulate worklist */
- cwq->max_active = wq->saved_max_active;
-
- while (!list_empty(&cwq->delayed_works) &&
- cwq->nr_active < cwq->max_active)
- cwq_activate_first_delayed(cwq);
+ cwq_set_max_active(cwq, wq->saved_max_active);
}
for_each_worker_pool(pool, gcwq)
@@ -3759,8 +3821,12 @@ static int __init init_workqueues(void)
unsigned int cpu;
int i;
+ /* make sure we have enough bits for OFFQ CPU number */
+ BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_CPU_SHIFT)) <
+ WORK_CPU_LAST);
+
cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
- cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
+ hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
/* initialize gcwqs */
for_each_gcwq_cpu(cpu) {
@@ -3786,11 +3852,9 @@ static int __init init_workqueues(void)
setup_timer(&pool->mayday_timer, gcwq_mayday_timeout,
(unsigned long)pool);
- mutex_init(&pool->manager_mutex);
+ mutex_init(&pool->assoc_mutex);
ida_init(&pool->worker_ida);
}
-
- init_waitqueue_head(&gcwq->rebind_hold);
}
/* create the initial worker */
@@ -3813,17 +3877,14 @@ static int __init init_workqueues(void)
}
system_wq = alloc_workqueue("events", 0, 0);
+ system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
system_long_wq = alloc_workqueue("events_long", 0, 0);
- system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
WQ_UNBOUND_MAX_ACTIVE);
system_freezable_wq = alloc_workqueue("events_freezable",
WQ_FREEZABLE, 0);
- system_nrt_freezable_wq = alloc_workqueue("events_nrt_freezable",
- WQ_NON_REENTRANT | WQ_FREEZABLE, 0);
- BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq ||
- !system_unbound_wq || !system_freezable_wq ||
- !system_nrt_freezable_wq);
+ BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
+ !system_unbound_wq || !system_freezable_wq);
return 0;
}
early_initcall(init_workqueues);