diff options
Diffstat (limited to 'kernel')
74 files changed, 3795 insertions, 2476 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 2251882daf53..44511d100eaa 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -87,6 +87,9 @@ config ARCH_INLINE_WRITE_UNLOCK_IRQ config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE bool +config UNINLINE_SPIN_UNLOCK + bool + # # lock_* functions are inlined when: # - DEBUG_SPINLOCK=n and GENERIC_LOCKBREAK=n and ARCH_INLINE_*LOCK=y @@ -103,100 +106,120 @@ config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE # - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y # +if !DEBUG_SPINLOCK + config INLINE_SPIN_TRYLOCK - def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK + def_bool y + depends on ARCH_INLINE_SPIN_TRYLOCK config INLINE_SPIN_TRYLOCK_BH - def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK_BH + def_bool y + depends on ARCH_INLINE_SPIN_TRYLOCK_BH config INLINE_SPIN_LOCK - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK config INLINE_SPIN_LOCK_BH - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ - ARCH_INLINE_SPIN_LOCK_BH + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_BH config INLINE_SPIN_LOCK_IRQ - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ - ARCH_INLINE_SPIN_LOCK_IRQ + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_IRQ config INLINE_SPIN_LOCK_IRQSAVE - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ - ARCH_INLINE_SPIN_LOCK_IRQSAVE - -config UNINLINE_SPIN_UNLOCK - bool + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_IRQSAVE config INLINE_SPIN_UNLOCK_BH - def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH + def_bool y + depends on ARCH_INLINE_SPIN_UNLOCK_BH config INLINE_SPIN_UNLOCK_IRQ - def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH) + def_bool y + depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH config INLINE_SPIN_UNLOCK_IRQRESTORE - def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE + def_bool y + 
depends on ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE config INLINE_READ_TRYLOCK - def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_TRYLOCK + def_bool y + depends on ARCH_INLINE_READ_TRYLOCK config INLINE_READ_LOCK - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK config INLINE_READ_LOCK_BH - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ - ARCH_INLINE_READ_LOCK_BH + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_BH config INLINE_READ_LOCK_IRQ - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ - ARCH_INLINE_READ_LOCK_IRQ + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_IRQ config INLINE_READ_LOCK_IRQSAVE - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ - ARCH_INLINE_READ_LOCK_IRQSAVE + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_IRQSAVE config INLINE_READ_UNLOCK - def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK) + def_bool y + depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK config INLINE_READ_UNLOCK_BH - def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_BH + def_bool y + depends on ARCH_INLINE_READ_UNLOCK_BH config INLINE_READ_UNLOCK_IRQ - def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK_BH) + def_bool y + depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_BH config INLINE_READ_UNLOCK_IRQRESTORE - def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_IRQRESTORE + def_bool y + depends on ARCH_INLINE_READ_UNLOCK_IRQRESTORE config INLINE_WRITE_TRYLOCK - def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_TRYLOCK + def_bool y + depends on ARCH_INLINE_WRITE_TRYLOCK config INLINE_WRITE_LOCK - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK config INLINE_WRITE_LOCK_BH - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ - ARCH_INLINE_WRITE_LOCK_BH + def_bool y + depends on !GENERIC_LOCKBREAK && 
ARCH_INLINE_WRITE_LOCK_BH config INLINE_WRITE_LOCK_IRQ - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ - ARCH_INLINE_WRITE_LOCK_IRQ + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_IRQ config INLINE_WRITE_LOCK_IRQSAVE - def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ - ARCH_INLINE_WRITE_LOCK_IRQSAVE + def_bool y + depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_IRQSAVE config INLINE_WRITE_UNLOCK - def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK) + def_bool y + depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK config INLINE_WRITE_UNLOCK_BH - def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_BH + def_bool y + depends on ARCH_INLINE_WRITE_UNLOCK_BH config INLINE_WRITE_UNLOCK_IRQ - def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH) + def_bool y + depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH config INLINE_WRITE_UNLOCK_IRQRESTORE - def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE + def_bool y + depends on ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE + +endif config MUTEX_SPIN_ON_OWNER - def_bool SMP && !DEBUG_MUTEXES + def_bool y + depends on SMP && !DEBUG_MUTEXES diff --git a/kernel/Makefile b/kernel/Makefile index e5602d32acb3..5404911eaee9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -97,7 +97,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ -obj-$(CONFIG_X86_DS) += trace/ +obj-$(CONFIG_TRACE_CLOCK) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_IRQ_WORK) += irq_work.o diff --git a/kernel/acct.c b/kernel/acct.c index 02e6167a53b0..6cd7529c9e6a 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -507,8 +507,8 @@ static void do_acct_process(struct bsd_acct_struct *acct, do_div(elapsed, AHZ); ac.ac_btime = get_seconds() - elapsed; /* we really need to bite the bullet and change layout */ - ac.ac_uid = 
orig_cred->uid; - ac.ac_gid = orig_cred->gid; + ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); + ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); #if ACCT_VERSION==2 ac.ac_ahz = AHZ; #endif diff --git a/kernel/audit.c b/kernel/audit.c index ea3b7b6191c7..40414e9143db 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -61,6 +61,7 @@ #include <linux/netlink.h> #include <linux/freezer.h> #include <linux/tty.h> +#include <linux/pid_namespace.h> #include "audit.h" @@ -87,11 +88,11 @@ static int audit_failure = AUDIT_FAIL_PRINTK; /* * If audit records are to be written to the netlink socket, audit_pid - * contains the pid of the auditd process and audit_nlk_pid contains - * the pid to use to send netlink messages to that process. + * contains the pid of the auditd process and audit_nlk_portid contains + * the portid to use to send netlink messages to that process. */ int audit_pid; -static int audit_nlk_pid; +static int audit_nlk_portid; /* If audit_rate_limit is non-zero, limit the rate of sending audit records * to that number per second. This prevents DoS attacks, but results in @@ -104,7 +105,7 @@ static int audit_backlog_wait_time = 60 * HZ; static int audit_backlog_wait_overflow = 0; /* The identity of the user shutting down the audit system. 
*/ -uid_t audit_sig_uid = -1; +kuid_t audit_sig_uid = INVALID_UID; pid_t audit_sig_pid = -1; u32 audit_sig_sid = 0; @@ -264,7 +265,7 @@ void audit_log_lost(const char *message) } static int audit_log_config_change(char *function_name, int new, int old, - uid_t loginuid, u32 sessionid, u32 sid, + kuid_t loginuid, u32 sessionid, u32 sid, int allow_changes) { struct audit_buffer *ab; @@ -272,7 +273,7 @@ static int audit_log_config_change(char *function_name, int new, int old, ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new, - old, loginuid, sessionid); + old, from_kuid(&init_user_ns, loginuid), sessionid); if (sid) { char *ctx = NULL; u32 len; @@ -292,7 +293,7 @@ static int audit_log_config_change(char *function_name, int new, int old, } static int audit_do_config_change(char *function_name, int *to_change, - int new, uid_t loginuid, u32 sessionid, + int new, kuid_t loginuid, u32 sessionid, u32 sid) { int allow_changes, rc = 0, old = *to_change; @@ -319,21 +320,21 @@ static int audit_do_config_change(char *function_name, int *to_change, return rc; } -static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sessionid, +static int audit_set_rate_limit(int limit, kuid_t loginuid, u32 sessionid, u32 sid) { return audit_do_config_change("audit_rate_limit", &audit_rate_limit, limit, loginuid, sessionid, sid); } -static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sessionid, +static int audit_set_backlog_limit(int limit, kuid_t loginuid, u32 sessionid, u32 sid) { return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, limit, loginuid, sessionid, sid); } -static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid) +static int audit_set_enabled(int state, kuid_t loginuid, u32 sessionid, u32 sid) { int rc; if (state < AUDIT_OFF || state > AUDIT_LOCKED) @@ -348,7 +349,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 
sessionid, u32 sid) return rc; } -static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid) +static int audit_set_failure(int state, kuid_t loginuid, u32 sessionid, u32 sid) { if (state != AUDIT_FAIL_SILENT && state != AUDIT_FAIL_PRINTK @@ -401,7 +402,7 @@ static void kauditd_send_skb(struct sk_buff *skb) int err; /* take a reference in case we can't send it and we want to hold it */ skb_get(skb); - err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); + err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); if (err < 0) { BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); @@ -467,24 +468,6 @@ static int kauditd_thread(void *dummy) return 0; } -static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid) -{ - struct task_struct *tsk; - int err; - - rcu_read_lock(); - tsk = find_task_by_vpid(pid); - if (!tsk) { - rcu_read_unlock(); - return -ESRCH; - } - get_task_struct(tsk); - rcu_read_unlock(); - err = tty_audit_push_task(tsk, loginuid, sessionid); - put_task_struct(tsk); - return err; -} - int audit_send_list(void *_dest) { struct audit_netlink_list *dest = _dest; @@ -588,6 +571,11 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) { int err = 0; + /* Only support the initial namespaces for now. 
*/ + if ((current_user_ns() != &init_user_ns) || + (task_active_pid_ns(current) != &init_pid_ns)) + return -EPERM; + switch (msg_type) { case AUDIT_GET: case AUDIT_LIST: @@ -619,8 +607,7 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) } static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, - u32 pid, u32 uid, uid_t auid, u32 ses, - u32 sid) + kuid_t auid, u32 ses, u32 sid) { int rc = 0; char *ctx = NULL; @@ -633,7 +620,9 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", - pid, uid, auid, ses); + task_tgid_vnr(current), + from_kuid(&init_user_ns, current_uid()), + from_kuid(&init_user_ns, auid), ses); if (sid) { rc = security_secid_to_secctx(sid, &ctx, &len); if (rc) @@ -649,13 +638,13 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { - u32 uid, pid, seq, sid; + u32 seq, sid; void *data; struct audit_status *status_get, status_set; int err; struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; - uid_t loginuid; /* loginuid of sender */ + kuid_t loginuid; /* loginuid of sender */ u32 sessionid; struct audit_sig_info *sig_data; char *ctx = NULL; @@ -675,8 +664,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return err; } - pid = NETLINK_CREDS(skb)->pid; - uid = NETLINK_CREDS(skb)->uid; loginuid = audit_get_loginuid(current); sessionid = audit_get_sessionid(current); security_task_getsecid(current, &sid); @@ -692,7 +679,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) status_set.backlog_limit = audit_backlog_limit; status_set.lost = atomic_read(&audit_lost); status_set.backlog = skb_queue_len(&audit_skb_queue); - audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0, + audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, 
&status_set, sizeof(status_set)); break; case AUDIT_SET: @@ -720,7 +707,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) sessionid, sid, 1); audit_pid = new_pid; - audit_nlk_pid = NETLINK_CB(skb).pid; + audit_nlk_portid = NETLINK_CB(skb).portid; } if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) { err = audit_set_rate_limit(status_get->rate_limit, @@ -738,16 +725,16 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (!audit_enabled && msg_type != AUDIT_USER_AVC) return 0; - err = audit_filter_user(&NETLINK_CB(skb)); + err = audit_filter_user(); if (err == 1) { err = 0; if (msg_type == AUDIT_USER_TTY) { - err = audit_prepare_user_tty(pid, loginuid, + err = tty_audit_push_task(current, loginuid, sessionid); if (err) break; } - audit_log_common_recv_msg(&ab, msg_type, pid, uid, + audit_log_common_recv_msg(&ab, msg_type, loginuid, sessionid, sid); if (msg_type != AUDIT_USER_TTY) @@ -763,7 +750,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) size--; audit_log_n_untrustedstring(ab, data, size); } - audit_set_pid(ab, pid); + audit_set_pid(ab, NETLINK_CB(skb).portid); audit_log_end(ab); } break; @@ -772,8 +759,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (nlmsg_len(nlh) < sizeof(struct audit_rule)) return -EINVAL; if (audit_enabled == AUDIT_LOCKED) { - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, - uid, loginuid, sessionid, sid); + audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, + loginuid, sessionid, sid); audit_log_format(ab, " audit_enabled=%d res=0", audit_enabled); @@ -782,8 +769,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } /* fallthrough */ case AUDIT_LIST: - err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid, - uid, seq, data, nlmsg_len(nlh), + err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid, + seq, data, nlmsg_len(nlh), loginuid, sessionid, sid); break; case AUDIT_ADD_RULE: @@ 
-791,8 +778,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) return -EINVAL; if (audit_enabled == AUDIT_LOCKED) { - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, - uid, loginuid, sessionid, sid); + audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, + loginuid, sessionid, sid); audit_log_format(ab, " audit_enabled=%d res=0", audit_enabled); @@ -801,15 +788,15 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } /* fallthrough */ case AUDIT_LIST_RULES: - err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid, - uid, seq, data, nlmsg_len(nlh), + err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid, + seq, data, nlmsg_len(nlh), loginuid, sessionid, sid); break; case AUDIT_TRIM: audit_trim_trees(); - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, - uid, loginuid, sessionid, sid); + audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, + loginuid, sessionid, sid); audit_log_format(ab, " op=trim res=1"); audit_log_end(ab); @@ -840,8 +827,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) /* OK, here comes... 
*/ err = audit_tag_tree(old, new); - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, - uid, loginuid, sessionid, sid); + audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, + loginuid, sessionid, sid); audit_log_format(ab, " op=make_equiv old="); audit_log_untrustedstring(ab, old); @@ -866,53 +853,41 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) security_release_secctx(ctx, len); return -ENOMEM; } - sig_data->uid = audit_sig_uid; + sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid); sig_data->pid = audit_sig_pid; if (audit_sig_sid) { memcpy(sig_data->ctx, ctx, len); security_release_secctx(ctx, len); } - audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, + audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_SIGNAL_INFO, 0, 0, sig_data, sizeof(*sig_data) + len); kfree(sig_data); break; case AUDIT_TTY_GET: { struct audit_tty_status s; - struct task_struct *tsk; - unsigned long flags; - - rcu_read_lock(); - tsk = find_task_by_vpid(pid); - if (tsk && lock_task_sighand(tsk, &flags)) { - s.enabled = tsk->signal->audit_tty != 0; - unlock_task_sighand(tsk, &flags); - } else - err = -ESRCH; - rcu_read_unlock(); - - if (!err) - audit_send_reply(NETLINK_CB(skb).pid, seq, - AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); + struct task_struct *tsk = current; + + spin_lock_irq(&tsk->sighand->siglock); + s.enabled = tsk->signal->audit_tty != 0; + spin_unlock_irq(&tsk->sighand->siglock); + + audit_send_reply(NETLINK_CB(skb).portid, seq, + AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_TTY_SET: { struct audit_tty_status *s; - struct task_struct *tsk; - unsigned long flags; + struct task_struct *tsk = current; if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) return -EINVAL; s = data; if (s->enabled != 0 && s->enabled != 1) return -EINVAL; - rcu_read_lock(); - tsk = find_task_by_vpid(pid); - if (tsk && lock_task_sighand(tsk, &flags)) { - tsk->signal->audit_tty = s->enabled != 0; - unlock_task_sighand(tsk, &flags); - } else - 
err = -ESRCH; - rcu_read_unlock(); + + spin_lock_irq(&tsk->sighand->siglock); + tsk->signal->audit_tty = s->enabled != 0; + spin_unlock_irq(&tsk->sighand->siglock); break; } default: @@ -971,8 +946,7 @@ static int __init audit_init(void) printk(KERN_INFO "audit: initializing netlink socket (%s)\n", audit_default ? "enabled" : "disabled"); - audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, - THIS_MODULE, &cfg); + audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, &cfg); if (!audit_sock) audit_panic("cannot initialize netlink socket"); else @@ -1466,6 +1440,8 @@ void audit_log_link_denied(const char *operation, struct path *link) ab = audit_log_start(current->audit_context, GFP_KERNEL, AUDIT_ANOM_LINK); + if (!ab) + return; audit_log_format(ab, "op=%s action=denied", operation); audit_log_format(ab, " pid=%d comm=", current->pid); audit_log_untrustedstring(ab, current->comm); diff --git a/kernel/audit.h b/kernel/audit.h index 816766803371..9eb3d79482b6 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -76,6 +76,8 @@ static inline int audit_hash_ino(u32 ino) extern int audit_match_class(int class, unsigned syscall); extern int audit_comparator(const u32 left, const u32 op, const u32 right); +extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right); +extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right); extern int audit_compare_dname_path(const char *dname, const char *path, int *dirlen); extern struct sk_buff * audit_make_reply(int pid, int seq, int type, @@ -144,7 +146,7 @@ extern void audit_kill_trees(struct list_head *); extern char *audit_unpack_string(void **, size_t *, size_t); extern pid_t audit_sig_pid; -extern uid_t audit_sig_uid; +extern kuid_t audit_sig_uid; extern u32 audit_sig_sid; #ifdef CONFIG_AUDITSYSCALL diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 3823281401b5..1c22ec3d87bc 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -241,7 +241,7 @@ static void 
audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc struct audit_buffer *ab; ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); audit_log_format(ab, "auid=%u ses=%u op=", - audit_get_loginuid(current), + from_kuid(&init_user_ns, audit_get_loginuid(current)), audit_get_sessionid(current)); audit_log_string(ab, op); audit_log_format(ab, " path="); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index a6c3f1abd206..c4bcdbaf4d4d 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -342,6 +342,8 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); f->val = rule->values[i]; + f->uid = INVALID_UID; + f->gid = INVALID_GID; err = -EINVAL; if (f->op == Audit_bad) @@ -350,16 +352,32 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) switch(f->type) { default: goto exit_free; - case AUDIT_PID: case AUDIT_UID: case AUDIT_EUID: case AUDIT_SUID: case AUDIT_FSUID: + case AUDIT_LOGINUID: + /* bit ops not implemented for uid comparisons */ + if (f->op == Audit_bitmask || f->op == Audit_bittest) + goto exit_free; + + f->uid = make_kuid(current_user_ns(), f->val); + if (!uid_valid(f->uid)) + goto exit_free; + break; case AUDIT_GID: case AUDIT_EGID: case AUDIT_SGID: case AUDIT_FSGID: - case AUDIT_LOGINUID: + /* bit ops not implemented for gid comparisons */ + if (f->op == Audit_bitmask || f->op == Audit_bittest) + goto exit_free; + + f->gid = make_kgid(current_user_ns(), f->val); + if (!gid_valid(f->gid)) + goto exit_free; + break; + case AUDIT_PID: case AUDIT_PERS: case AUDIT_MSGTYPE: case AUDIT_PPID: @@ -437,19 +455,39 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->type = data->fields[i]; f->val = data->values[i]; + f->uid = INVALID_UID; + f->gid = INVALID_GID; f->lsm_str = NULL; f->lsm_rule = NULL; switch(f->type) { - case AUDIT_PID: case AUDIT_UID: case AUDIT_EUID: case AUDIT_SUID: case 
AUDIT_FSUID: + case AUDIT_LOGINUID: + case AUDIT_OBJ_UID: + /* bit ops not implemented for uid comparisons */ + if (f->op == Audit_bitmask || f->op == Audit_bittest) + goto exit_free; + + f->uid = make_kuid(current_user_ns(), f->val); + if (!uid_valid(f->uid)) + goto exit_free; + break; case AUDIT_GID: case AUDIT_EGID: case AUDIT_SGID: case AUDIT_FSGID: - case AUDIT_LOGINUID: + case AUDIT_OBJ_GID: + /* bit ops not implemented for gid comparisons */ + if (f->op == Audit_bitmask || f->op == Audit_bittest) + goto exit_free; + + f->gid = make_kgid(current_user_ns(), f->val); + if (!gid_valid(f->gid)) + goto exit_free; + break; + case AUDIT_PID: case AUDIT_PERS: case AUDIT_MSGTYPE: case AUDIT_PPID: @@ -461,8 +499,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_ARG1: case AUDIT_ARG2: case AUDIT_ARG3: - case AUDIT_OBJ_UID: - case AUDIT_OBJ_GID: break; case AUDIT_ARCH: entry->rule.arch_f = f; @@ -707,6 +743,23 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) if (strcmp(a->filterkey, b->filterkey)) return 1; break; + case AUDIT_UID: + case AUDIT_EUID: + case AUDIT_SUID: + case AUDIT_FSUID: + case AUDIT_LOGINUID: + case AUDIT_OBJ_UID: + if (!uid_eq(a->fields[i].uid, b->fields[i].uid)) + return 1; + break; + case AUDIT_GID: + case AUDIT_EGID: + case AUDIT_SGID: + case AUDIT_FSGID: + case AUDIT_OBJ_GID: + if (!gid_eq(a->fields[i].gid, b->fields[i].gid)) + return 1; + break; default: if (a->fields[i].val != b->fields[i].val) return 1; @@ -1056,7 +1109,7 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) } /* Log rule additions and removals */ -static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, +static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid, char *action, struct audit_krule *rule, int res) { @@ -1068,7 +1121,8 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, ab = audit_log_start(NULL, GFP_KERNEL, 
AUDIT_CONFIG_CHANGE); if (!ab) return; - audit_log_format(ab, "auid=%u ses=%u", loginuid, sessionid); + audit_log_format(ab, "auid=%u ses=%u", + from_kuid(&init_user_ns, loginuid), sessionid); if (sid) { char *ctx = NULL; u32 len; @@ -1098,8 +1152,8 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, * @sessionid: sessionid for netlink audit message * @sid: SE Linux Security ID of sender */ -int audit_receive_filter(int type, int pid, int uid, int seq, void *data, - size_t datasz, uid_t loginuid, u32 sessionid, u32 sid) +int audit_receive_filter(int type, int pid, int seq, void *data, + size_t datasz, kuid_t loginuid, u32 sessionid, u32 sid) { struct task_struct *tsk; struct audit_netlink_list *dest; @@ -1198,6 +1252,52 @@ int audit_comparator(u32 left, u32 op, u32 right) } } +int audit_uid_comparator(kuid_t left, u32 op, kuid_t right) +{ + switch (op) { + case Audit_equal: + return uid_eq(left, right); + case Audit_not_equal: + return !uid_eq(left, right); + case Audit_lt: + return uid_lt(left, right); + case Audit_le: + return uid_lte(left, right); + case Audit_gt: + return uid_gt(left, right); + case Audit_ge: + return uid_gte(left, right); + case Audit_bitmask: + case Audit_bittest: + default: + BUG(); + return 0; + } +} + +int audit_gid_comparator(kgid_t left, u32 op, kgid_t right) +{ + switch (op) { + case Audit_equal: + return gid_eq(left, right); + case Audit_not_equal: + return !gid_eq(left, right); + case Audit_lt: + return gid_lt(left, right); + case Audit_le: + return gid_lte(left, right); + case Audit_gt: + return gid_gt(left, right); + case Audit_ge: + return gid_gte(left, right); + case Audit_bitmask: + case Audit_bittest: + default: + BUG(); + return 0; + } +} + /* Compare given dentry name with last component in given path, * return of 0 indicates a match. 
*/ int audit_compare_dname_path(const char *dname, const char *path, @@ -1236,8 +1336,7 @@ int audit_compare_dname_path(const char *dname, const char *path, return strncmp(p, dname, dlen); } -static int audit_filter_user_rules(struct netlink_skb_parms *cb, - struct audit_krule *rule, +static int audit_filter_user_rules(struct audit_krule *rule, enum audit_state *state) { int i; @@ -1249,17 +1348,17 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, switch (f->type) { case AUDIT_PID: - result = audit_comparator(cb->creds.pid, f->op, f->val); + result = audit_comparator(task_pid_vnr(current), f->op, f->val); break; case AUDIT_UID: - result = audit_comparator(cb->creds.uid, f->op, f->val); + result = audit_uid_comparator(current_uid(), f->op, f->uid); break; case AUDIT_GID: - result = audit_comparator(cb->creds.gid, f->op, f->val); + result = audit_gid_comparator(current_gid(), f->op, f->gid); break; case AUDIT_LOGINUID: - result = audit_comparator(audit_get_loginuid(current), - f->op, f->val); + result = audit_uid_comparator(audit_get_loginuid(current), + f->op, f->uid); break; case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: @@ -1287,7 +1386,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, return 1; } -int audit_filter_user(struct netlink_skb_parms *cb) +int audit_filter_user(void) { enum audit_state state = AUDIT_DISABLED; struct audit_entry *e; @@ -1295,7 +1394,7 @@ int audit_filter_user(struct netlink_skb_parms *cb) rcu_read_lock(); list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { - if (audit_filter_user_rules(cb, &e->rule, &state)) { + if (audit_filter_user_rules(&e->rule, &state)) { if (state == AUDIT_DISABLED) ret = 0; break; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4b96415527b8..f4a7756f999c 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -113,8 +113,8 @@ struct audit_names { unsigned long ino; dev_t dev; umode_t mode; - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; dev_t 
rdev; u32 osid; struct audit_cap_data fcap; @@ -149,8 +149,8 @@ struct audit_aux_data_execve { struct audit_aux_data_pids { struct audit_aux_data d; pid_t target_pid[AUDIT_AUX_PIDS]; - uid_t target_auid[AUDIT_AUX_PIDS]; - uid_t target_uid[AUDIT_AUX_PIDS]; + kuid_t target_auid[AUDIT_AUX_PIDS]; + kuid_t target_uid[AUDIT_AUX_PIDS]; unsigned int target_sessionid[AUDIT_AUX_PIDS]; u32 target_sid[AUDIT_AUX_PIDS]; char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN]; @@ -208,14 +208,14 @@ struct audit_context { size_t sockaddr_len; /* Save things to print about task_struct */ pid_t pid, ppid; - uid_t uid, euid, suid, fsuid; - gid_t gid, egid, sgid, fsgid; + kuid_t uid, euid, suid, fsuid; + kgid_t gid, egid, sgid, fsgid; unsigned long personality; int arch; pid_t target_pid; - uid_t target_auid; - uid_t target_uid; + kuid_t target_auid; + kuid_t target_uid; unsigned int target_sessionid; u32 target_sid; char target_comm[TASK_COMM_LEN]; @@ -231,8 +231,8 @@ struct audit_context { long args[6]; } socketcall; struct { - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; umode_t mode; u32 osid; int has_perm; @@ -464,37 +464,47 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree) return 0; } -static int audit_compare_id(uid_t uid1, - struct audit_names *name, - unsigned long name_offset, - struct audit_field *f, - struct audit_context *ctx) +static int audit_compare_uid(kuid_t uid, + struct audit_names *name, + struct audit_field *f, + struct audit_context *ctx) { struct audit_names *n; - unsigned long addr; - uid_t uid2; int rc; - - BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t)); - + if (name) { - addr = (unsigned long)name; - addr += name_offset; - - uid2 = *(uid_t *)addr; - rc = audit_comparator(uid1, f->op, uid2); + rc = audit_uid_comparator(uid, f->op, name->uid); if (rc) return rc; } - + if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { - addr = (unsigned long)n; - addr += name_offset; - - uid2 = *(uid_t *)addr; + rc = 
audit_uid_comparator(uid, f->op, n->uid); + if (rc) + return rc; + } + } + return 0; +} - rc = audit_comparator(uid1, f->op, uid2); +static int audit_compare_gid(kgid_t gid, + struct audit_names *name, + struct audit_field *f, + struct audit_context *ctx) +{ + struct audit_names *n; + int rc; + + if (name) { + rc = audit_gid_comparator(gid, f->op, name->gid); + if (rc) + return rc; + } + + if (ctx) { + list_for_each_entry(n, &ctx->names_list, list) { + rc = audit_gid_comparator(gid, f->op, n->gid); if (rc) return rc; } @@ -511,80 +521,62 @@ static int audit_field_compare(struct task_struct *tsk, switch (f->val) { /* process to file object comparisons */ case AUDIT_COMPARE_UID_TO_OBJ_UID: - return audit_compare_id(cred->uid, - name, offsetof(struct audit_names, uid), - f, ctx); + return audit_compare_uid(cred->uid, name, f, ctx); case AUDIT_COMPARE_GID_TO_OBJ_GID: - return audit_compare_id(cred->gid, - name, offsetof(struct audit_names, gid), - f, ctx); + return audit_compare_gid(cred->gid, name, f, ctx); case AUDIT_COMPARE_EUID_TO_OBJ_UID: - return audit_compare_id(cred->euid, - name, offsetof(struct audit_names, uid), - f, ctx); + return audit_compare_uid(cred->euid, name, f, ctx); case AUDIT_COMPARE_EGID_TO_OBJ_GID: - return audit_compare_id(cred->egid, - name, offsetof(struct audit_names, gid), - f, ctx); + return audit_compare_gid(cred->egid, name, f, ctx); case AUDIT_COMPARE_AUID_TO_OBJ_UID: - return audit_compare_id(tsk->loginuid, - name, offsetof(struct audit_names, uid), - f, ctx); + return audit_compare_uid(tsk->loginuid, name, f, ctx); case AUDIT_COMPARE_SUID_TO_OBJ_UID: - return audit_compare_id(cred->suid, - name, offsetof(struct audit_names, uid), - f, ctx); + return audit_compare_uid(cred->suid, name, f, ctx); case AUDIT_COMPARE_SGID_TO_OBJ_GID: - return audit_compare_id(cred->sgid, - name, offsetof(struct audit_names, gid), - f, ctx); + return audit_compare_gid(cred->sgid, name, f, ctx); case AUDIT_COMPARE_FSUID_TO_OBJ_UID: - return 
audit_compare_id(cred->fsuid, - name, offsetof(struct audit_names, uid), - f, ctx); + return audit_compare_uid(cred->fsuid, name, f, ctx); case AUDIT_COMPARE_FSGID_TO_OBJ_GID: - return audit_compare_id(cred->fsgid, - name, offsetof(struct audit_names, gid), - f, ctx); + return audit_compare_gid(cred->fsgid, name, f, ctx); /* uid comparisons */ case AUDIT_COMPARE_UID_TO_AUID: - return audit_comparator(cred->uid, f->op, tsk->loginuid); + return audit_uid_comparator(cred->uid, f->op, tsk->loginuid); case AUDIT_COMPARE_UID_TO_EUID: - return audit_comparator(cred->uid, f->op, cred->euid); + return audit_uid_comparator(cred->uid, f->op, cred->euid); case AUDIT_COMPARE_UID_TO_SUID: - return audit_comparator(cred->uid, f->op, cred->suid); + return audit_uid_comparator(cred->uid, f->op, cred->suid); case AUDIT_COMPARE_UID_TO_FSUID: - return audit_comparator(cred->uid, f->op, cred->fsuid); + return audit_uid_comparator(cred->uid, f->op, cred->fsuid); /* auid comparisons */ case AUDIT_COMPARE_AUID_TO_EUID: - return audit_comparator(tsk->loginuid, f->op, cred->euid); + return audit_uid_comparator(tsk->loginuid, f->op, cred->euid); case AUDIT_COMPARE_AUID_TO_SUID: - return audit_comparator(tsk->loginuid, f->op, cred->suid); + return audit_uid_comparator(tsk->loginuid, f->op, cred->suid); case AUDIT_COMPARE_AUID_TO_FSUID: - return audit_comparator(tsk->loginuid, f->op, cred->fsuid); + return audit_uid_comparator(tsk->loginuid, f->op, cred->fsuid); /* euid comparisons */ case AUDIT_COMPARE_EUID_TO_SUID: - return audit_comparator(cred->euid, f->op, cred->suid); + return audit_uid_comparator(cred->euid, f->op, cred->suid); case AUDIT_COMPARE_EUID_TO_FSUID: - return audit_comparator(cred->euid, f->op, cred->fsuid); + return audit_uid_comparator(cred->euid, f->op, cred->fsuid); /* suid comparisons */ case AUDIT_COMPARE_SUID_TO_FSUID: - return audit_comparator(cred->suid, f->op, cred->fsuid); + return audit_uid_comparator(cred->suid, f->op, cred->fsuid); /* gid comparisons */ case 
AUDIT_COMPARE_GID_TO_EGID: - return audit_comparator(cred->gid, f->op, cred->egid); + return audit_gid_comparator(cred->gid, f->op, cred->egid); case AUDIT_COMPARE_GID_TO_SGID: - return audit_comparator(cred->gid, f->op, cred->sgid); + return audit_gid_comparator(cred->gid, f->op, cred->sgid); case AUDIT_COMPARE_GID_TO_FSGID: - return audit_comparator(cred->gid, f->op, cred->fsgid); + return audit_gid_comparator(cred->gid, f->op, cred->fsgid); /* egid comparisons */ case AUDIT_COMPARE_EGID_TO_SGID: - return audit_comparator(cred->egid, f->op, cred->sgid); + return audit_gid_comparator(cred->egid, f->op, cred->sgid); case AUDIT_COMPARE_EGID_TO_FSGID: - return audit_comparator(cred->egid, f->op, cred->fsgid); + return audit_gid_comparator(cred->egid, f->op, cred->fsgid); /* sgid comparison */ case AUDIT_COMPARE_SGID_TO_FSGID: - return audit_comparator(cred->sgid, f->op, cred->fsgid); + return audit_gid_comparator(cred->sgid, f->op, cred->fsgid); default: WARN(1, "Missing AUDIT_COMPARE define. 
Report as a bug\n"); return 0; @@ -630,28 +622,28 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_UID: - result = audit_comparator(cred->uid, f->op, f->val); + result = audit_uid_comparator(cred->uid, f->op, f->uid); break; case AUDIT_EUID: - result = audit_comparator(cred->euid, f->op, f->val); + result = audit_uid_comparator(cred->euid, f->op, f->uid); break; case AUDIT_SUID: - result = audit_comparator(cred->suid, f->op, f->val); + result = audit_uid_comparator(cred->suid, f->op, f->uid); break; case AUDIT_FSUID: - result = audit_comparator(cred->fsuid, f->op, f->val); + result = audit_uid_comparator(cred->fsuid, f->op, f->uid); break; case AUDIT_GID: - result = audit_comparator(cred->gid, f->op, f->val); + result = audit_gid_comparator(cred->gid, f->op, f->gid); break; case AUDIT_EGID: - result = audit_comparator(cred->egid, f->op, f->val); + result = audit_gid_comparator(cred->egid, f->op, f->gid); break; case AUDIT_SGID: - result = audit_comparator(cred->sgid, f->op, f->val); + result = audit_gid_comparator(cred->sgid, f->op, f->gid); break; case AUDIT_FSGID: - result = audit_comparator(cred->fsgid, f->op, f->val); + result = audit_gid_comparator(cred->fsgid, f->op, f->gid); break; case AUDIT_PERS: result = audit_comparator(tsk->personality, f->op, f->val); @@ -717,10 +709,10 @@ static int audit_filter_rules(struct task_struct *tsk, break; case AUDIT_OBJ_UID: if (name) { - result = audit_comparator(name->uid, f->op, f->val); + result = audit_uid_comparator(name->uid, f->op, f->uid); } else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { - if (audit_comparator(n->uid, f->op, f->val)) { + if (audit_uid_comparator(n->uid, f->op, f->uid)) { ++result; break; } @@ -729,10 +721,10 @@ static int audit_filter_rules(struct task_struct *tsk, break; case AUDIT_OBJ_GID: if (name) { - result = audit_comparator(name->gid, f->op, f->val); + result = audit_gid_comparator(name->gid, f->op, f->gid); } else if (ctx) { 
list_for_each_entry(n, &ctx->names_list, list) { - if (audit_comparator(n->gid, f->op, f->val)) { + if (audit_gid_comparator(n->gid, f->op, f->gid)) { ++result; break; } @@ -750,7 +742,7 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_LOGINUID: result = 0; if (ctx) - result = audit_comparator(tsk->loginuid, f->op, f->val); + result = audit_uid_comparator(tsk->loginuid, f->op, f->uid); break; case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: @@ -1154,13 +1146,43 @@ error_path: EXPORT_SYMBOL(audit_log_task_context); -static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) +void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) { + const struct cred *cred; char name[sizeof(tsk->comm)]; struct mm_struct *mm = tsk->mm; - struct vm_area_struct *vma; + char *tty; + + if (!ab) + return; /* tsk == current */ + cred = current_cred(); + + spin_lock_irq(&tsk->sighand->siglock); + if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) + tty = tsk->signal->tty->name; + else + tty = "(none)"; + spin_unlock_irq(&tsk->sighand->siglock); + + + audit_log_format(ab, + " ppid=%ld pid=%d auid=%u uid=%u gid=%u" + " euid=%u suid=%u fsuid=%u" + " egid=%u sgid=%u fsgid=%u ses=%u tty=%s", + sys_getppid(), + tsk->pid, + from_kuid(&init_user_ns, tsk->loginuid), + from_kuid(&init_user_ns, cred->uid), + from_kgid(&init_user_ns, cred->gid), + from_kuid(&init_user_ns, cred->euid), + from_kuid(&init_user_ns, cred->suid), + from_kuid(&init_user_ns, cred->fsuid), + from_kgid(&init_user_ns, cred->egid), + from_kgid(&init_user_ns, cred->sgid), + from_kgid(&init_user_ns, cred->fsgid), + tsk->sessionid, tty); get_task_comm(name, tsk); audit_log_format(ab, " comm="); @@ -1168,23 +1190,17 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk if (mm) { down_read(&mm->mmap_sem); - vma = mm->mmap; - while (vma) { - if ((vma->vm_flags & VM_EXECUTABLE) && - vma->vm_file) { - audit_log_d_path(ab, " exe=", - 
&vma->vm_file->f_path); - break; - } - vma = vma->vm_next; - } + if (mm->exe_file) + audit_log_d_path(ab, " exe=", &mm->exe_file->f_path); up_read(&mm->mmap_sem); } audit_log_task_context(ab); } +EXPORT_SYMBOL(audit_log_task_info); + static int audit_log_pid_context(struct audit_context *context, pid_t pid, - uid_t auid, uid_t uid, unsigned int sessionid, + kuid_t auid, kuid_t uid, unsigned int sessionid, u32 sid, char *comm) { struct audit_buffer *ab; @@ -1196,8 +1212,9 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, if (!ab) return rc; - audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid, - uid, sessionid); + audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, + from_kuid(&init_user_ns, auid), + from_kuid(&init_user_ns, uid), sessionid); if (security_secid_to_secctx(sid, &ctx, &len)) { audit_log_format(ab, " obj=(none)"); rc = 1; @@ -1447,7 +1464,9 @@ static void show_special(struct audit_context *context, int *call_panic) u32 osid = context->ipc.osid; audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho", - context->ipc.uid, context->ipc.gid, context->ipc.mode); + from_kuid(&init_user_ns, context->ipc.uid), + from_kgid(&init_user_ns, context->ipc.gid), + context->ipc.mode); if (osid) { char *ctx = NULL; u32 len; @@ -1560,8 +1579,8 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, MAJOR(n->dev), MINOR(n->dev), n->mode, - n->uid, - n->gid, + from_kuid(&init_user_ns, n->uid), + from_kgid(&init_user_ns, n->gid), MAJOR(n->rdev), MINOR(n->rdev)); } @@ -1585,26 +1604,12 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { - const struct cred *cred; int i, call_panic = 0; struct audit_buffer *ab; struct audit_aux_data *aux; - const char *tty; struct audit_names *n; /* tsk == current */ - context->pid = tsk->pid; - if (!context->ppid) - context->ppid = sys_getppid(); - cred 
= current_cred(); - context->uid = cred->uid; - context->gid = cred->gid; - context->euid = cred->euid; - context->suid = cred->suid; - context->fsuid = cred->fsuid; - context->egid = cred->egid; - context->sgid = cred->sgid; - context->fsgid = cred->fsgid; context->personality = tsk->personality; ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); @@ -1619,32 +1624,13 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", context->return_code); - spin_lock_irq(&tsk->sighand->siglock); - if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) - tty = tsk->signal->tty->name; - else - tty = "(none)"; - spin_unlock_irq(&tsk->sighand->siglock); - audit_log_format(ab, - " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" - " ppid=%d pid=%d auid=%u uid=%u gid=%u" - " euid=%u suid=%u fsuid=%u" - " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", - context->argv[0], - context->argv[1], - context->argv[2], - context->argv[3], - context->name_count, - context->ppid, - context->pid, - tsk->loginuid, - context->uid, - context->gid, - context->euid, context->suid, context->fsuid, - context->egid, context->sgid, context->fsgid, tty, - tsk->sessionid); - + " a0=%lx a1=%lx a2=%lx a3=%lx items=%d", + context->argv[0], + context->argv[1], + context->argv[2], + context->argv[3], + context->name_count); audit_log_task_info(ab, tsk); audit_log_key(ab, context->filterkey); @@ -2299,14 +2285,14 @@ static atomic_t session_id = ATOMIC_INIT(0); * * Called (set) from fs/proc/base.c::proc_loginuid_write(). 
*/ -int audit_set_loginuid(uid_t loginuid) +int audit_set_loginuid(kuid_t loginuid) { struct task_struct *task = current; struct audit_context *context = task->audit_context; unsigned int sessionid; #ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE - if (task->loginuid != -1) + if (uid_valid(task->loginuid)) return -EPERM; #else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ if (!capable(CAP_AUDIT_CONTROL)) @@ -2322,8 +2308,10 @@ int audit_set_loginuid(uid_t loginuid) audit_log_format(ab, "login pid=%d uid=%u " "old auid=%u new auid=%u" " old ses=%u new ses=%u", - task->pid, task_uid(task), - task->loginuid, loginuid, + task->pid, + from_kuid(&init_user_ns, task_uid(task)), + from_kuid(&init_user_ns, task->loginuid), + from_kuid(&init_user_ns, loginuid), task->sessionid, sessionid); audit_log_end(ab); } @@ -2546,12 +2534,12 @@ int __audit_signal_info(int sig, struct task_struct *t) struct audit_aux_data_pids *axp; struct task_struct *tsk = current; struct audit_context *ctx = tsk->audit_context; - uid_t uid = current_uid(), t_uid = task_uid(t); + kuid_t uid = current_uid(), t_uid = task_uid(t); if (audit_pid && t->tgid == audit_pid) { if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { audit_sig_pid = tsk->pid; - if (tsk->loginuid != -1) + if (uid_valid(tsk->loginuid)) audit_sig_uid = tsk->loginuid; else audit_sig_uid = uid; @@ -2672,8 +2660,8 @@ void __audit_mmap_fd(int fd, int flags) static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) { - uid_t auid, uid; - gid_t gid; + kuid_t auid, uid; + kgid_t gid; unsigned int sessionid; auid = audit_get_loginuid(current); @@ -2681,7 +2669,10 @@ static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) current_uid_gid(&uid, &gid); audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", - auid, uid, gid, sessionid); + from_kuid(&init_user_ns, auid), + from_kuid(&init_user_ns, uid), + from_kgid(&init_user_ns, gid), + sessionid); audit_log_task_context(ab); audit_log_format(ab, " 
pid=%d comm=", current->pid); audit_log_untrustedstring(ab, current->comm); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 79818507e444..13774b3b39aa 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -88,11 +88,12 @@ static DEFINE_MUTEX(cgroup_root_mutex); /* * Generate an array of cgroup subsystem pointers. At boot time, this is - * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are + * populated with the built in subsystems, and modular subsystems are * registered after that. The mutable section of this array is protected by * cgroup_mutex. */ -#define SUBSYS(_x) &_x ## _subsys, +#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys, +#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { #include <linux/cgroup_subsys.h> }; @@ -111,13 +112,13 @@ struct cgroupfs_root { * The bitmask of subsystems intended to be attached to this * hierarchy */ - unsigned long subsys_bits; + unsigned long subsys_mask; /* Unique id for this hierarchy. */ int hierarchy_id; /* The bitmask of subsystems currently attached to this hierarchy */ - unsigned long actual_subsys_bits; + unsigned long actual_subsys_mask; /* A list running through the attached subsystems */ struct list_head subsys_list; @@ -276,7 +277,8 @@ inline int cgroup_is_removed(const struct cgroup *cgrp) /* bits in struct cgroupfs_root flags field */ enum { - ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ + ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ + ROOT_XATTR, /* supports extended attributes */ }; static int cgroup_is_releasable(const struct cgroup *cgrp) @@ -556,7 +558,7 @@ static struct css_set *find_existing_css_set( * won't change, so no need for locking. */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - if (root->subsys_bits & (1UL << i)) { + if (root->subsys_mask & (1UL << i)) { /* Subsystem is in this hierarchy. 
So we want * the subsystem state from the new * cgroup */ @@ -824,7 +826,8 @@ EXPORT_SYMBOL_GPL(cgroup_unlock); static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); -static int cgroup_populate_dir(struct cgroup *cgrp); +static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, + unsigned long subsys_mask); static const struct inode_operations cgroup_dir_inode_operations; static const struct file_operations proc_cgroupstats_operations; @@ -912,15 +915,19 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) */ BUG_ON(!list_empty(&cgrp->pidlists)); + simple_xattrs_free(&cgrp->xattrs); + kfree_rcu(cgrp, rcu_head); } else { struct cfent *cfe = __d_cfe(dentry); struct cgroup *cgrp = dentry->d_parent->d_fsdata; + struct cftype *cft = cfe->type; WARN_ONCE(!list_empty(&cfe->node) && cgrp != &cgrp->root->top_cgroup, "cfe still linked for %s\n", cfe->type->name); kfree(cfe); + simple_xattrs_free(&cft->xattrs); } iput(inode); } @@ -963,12 +970,29 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) return -ENOENT; } -static void cgroup_clear_directory(struct dentry *dir) +/** + * cgroup_clear_directory - selective removal of base and subsystem files + * @dir: directory containing the files + * @base_files: true if the base files should be removed + * @subsys_mask: mask of the subsystem ids whose files should be removed + */ +static void cgroup_clear_directory(struct dentry *dir, bool base_files, + unsigned long subsys_mask) { struct cgroup *cgrp = __d_cgrp(dir); + struct cgroup_subsys *ss; - while (!list_empty(&cgrp->files)) - cgroup_rm_file(cgrp, NULL); + for_each_subsys(cgrp->root, ss) { + struct cftype_set *set; + if (!test_bit(ss->subsys_id, &subsys_mask)) + continue; + list_for_each_entry(set, &ss->cftsets, node) + cgroup_rm_file(cgrp, 
set->cfts); + } + if (base_files) { + while (!list_empty(&cgrp->files)) + cgroup_rm_file(cgrp, NULL); + } } /* @@ -977,8 +1001,9 @@ static void cgroup_clear_directory(struct dentry *dir) static void cgroup_d_remove_dir(struct dentry *dentry) { struct dentry *parent; + struct cgroupfs_root *root = dentry->d_sb->s_fs_info; - cgroup_clear_directory(dentry); + cgroup_clear_directory(dentry, true, root->subsys_mask); parent = dentry->d_parent; spin_lock(&parent->d_lock); @@ -1022,22 +1047,22 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) * returns an error, no reference counts are touched. */ static int rebind_subsystems(struct cgroupfs_root *root, - unsigned long final_bits) + unsigned long final_subsys_mask) { - unsigned long added_bits, removed_bits; + unsigned long added_mask, removed_mask; struct cgroup *cgrp = &root->top_cgroup; int i; BUG_ON(!mutex_is_locked(&cgroup_mutex)); BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); - removed_bits = root->actual_subsys_bits & ~final_bits; - added_bits = final_bits & ~root->actual_subsys_bits; + removed_mask = root->actual_subsys_mask & ~final_subsys_mask; + added_mask = final_subsys_mask & ~root->actual_subsys_mask; /* Check that any added subsystems are currently free */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { unsigned long bit = 1UL << i; struct cgroup_subsys *ss = subsys[i]; - if (!(bit & added_bits)) + if (!(bit & added_mask)) continue; /* * Nobody should tell us to do a subsys that doesn't exist: @@ -1062,7 +1087,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; unsigned long bit = 1UL << i; - if (bit & added_bits) { + if (bit & added_mask) { /* We're binding this subsystem to this hierarchy */ BUG_ON(ss == NULL); BUG_ON(cgrp->subsys[i]); @@ -1075,7 +1100,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, if (ss->bind) ss->bind(cgrp); /* refcount was already taken, and we're keeping it */ 
- } else if (bit & removed_bits) { + } else if (bit & removed_mask) { /* We're removing this subsystem */ BUG_ON(ss == NULL); BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); @@ -1088,7 +1113,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, list_move(&ss->sibling, &rootnode.subsys_list); /* subsystem is now free - drop reference on module */ module_put(ss->module); - } else if (bit & final_bits) { + } else if (bit & final_subsys_mask) { /* Subsystem state should already exist */ BUG_ON(ss == NULL); BUG_ON(!cgrp->subsys[i]); @@ -1105,7 +1130,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(cgrp->subsys[i]); } } - root->subsys_bits = root->actual_subsys_bits = final_bits; + root->subsys_mask = root->actual_subsys_mask = final_subsys_mask; synchronize_rcu(); return 0; @@ -1121,6 +1146,8 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) seq_printf(seq, ",%s", ss->name); if (test_bit(ROOT_NOPREFIX, &root->flags)) seq_puts(seq, ",noprefix"); + if (test_bit(ROOT_XATTR, &root->flags)) + seq_puts(seq, ",xattr"); if (strlen(root->release_agent_path)) seq_printf(seq, ",release_agent=%s", root->release_agent_path); if (clone_children(&root->top_cgroup)) @@ -1132,7 +1159,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) } struct cgroup_sb_opts { - unsigned long subsys_bits; + unsigned long subsys_mask; unsigned long flags; char *release_agent; bool clone_children; @@ -1189,6 +1216,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) opts->clone_children = true; continue; } + if (!strcmp(token, "xattr")) { + set_bit(ROOT_XATTR, &opts->flags); + continue; + } if (!strncmp(token, "release_agent=", 14)) { /* Specifying two release agents is forbidden */ if (opts->release_agent) @@ -1237,7 +1268,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) /* Mutually exclusive option 'all' + subsystem name */ if (all_ss) return -EINVAL; - 
set_bit(i, &opts->subsys_bits); + set_bit(i, &opts->subsys_mask); one_ss = true; break; @@ -1258,7 +1289,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) continue; if (ss->disabled) continue; - set_bit(i, &opts->subsys_bits); + set_bit(i, &opts->subsys_mask); } } @@ -1270,19 +1301,19 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) * the cpuset subsystem. */ if (test_bit(ROOT_NOPREFIX, &opts->flags) && - (opts->subsys_bits & mask)) + (opts->subsys_mask & mask)) return -EINVAL; /* Can't specify "none" and some subsystems */ - if (opts->subsys_bits && opts->none) + if (opts->subsys_mask && opts->none) return -EINVAL; /* * We either have to specify by name or by subsystems. (So all * empty hierarchies must have a name). */ - if (!opts->subsys_bits && !opts->name) + if (!opts->subsys_mask && !opts->name) return -EINVAL; /* @@ -1291,10 +1322,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) * take duplicate reference counts on a subsystem that's already used, * but rebind_subsystems handles this case. */ - for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { unsigned long bit = 1UL << i; - if (!(bit & opts->subsys_bits)) + if (!(bit & opts->subsys_mask)) continue; if (!try_module_get(subsys[i]->module)) { module_pin_failed = true; @@ -1307,11 +1338,11 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) * raced with a module_delete call, and to the user this is * essentially a "subsystem doesn't exist" case. 
*/ - for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) { + for (i--; i >= 0; i--) { /* drop refcounts only on the ones we took */ unsigned long bit = 1UL << i; - if (!(bit & opts->subsys_bits)) + if (!(bit & opts->subsys_mask)) continue; module_put(subsys[i]->module); } @@ -1321,13 +1352,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) return 0; } -static void drop_parsed_module_refcounts(unsigned long subsys_bits) +static void drop_parsed_module_refcounts(unsigned long subsys_mask) { int i; - for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { unsigned long bit = 1UL << i; - if (!(bit & subsys_bits)) + if (!(bit & subsys_mask)) continue; module_put(subsys[i]->module); } @@ -1339,6 +1370,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) struct cgroupfs_root *root = sb->s_fs_info; struct cgroup *cgrp = &root->top_cgroup; struct cgroup_sb_opts opts; + unsigned long added_mask, removed_mask; mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgroup_mutex); @@ -1350,27 +1382,31 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) goto out_unlock; /* See feature-removal-schedule.txt */ - if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent) + if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent) pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); + added_mask = opts.subsys_mask & ~root->subsys_mask; + removed_mask = root->subsys_mask & ~opts.subsys_mask; + /* Don't allow flags or name to change at remount */ if (opts.flags != root->flags || (opts.name && strcmp(opts.name, root->name))) { ret = -EINVAL; - drop_parsed_module_refcounts(opts.subsys_bits); + drop_parsed_module_refcounts(opts.subsys_mask); goto out_unlock; } - ret = rebind_subsystems(root, opts.subsys_bits); + ret = rebind_subsystems(root, 
opts.subsys_mask); if (ret) { - drop_parsed_module_refcounts(opts.subsys_bits); + drop_parsed_module_refcounts(opts.subsys_mask); goto out_unlock; } /* clear out any existing files and repopulate subsystem files */ - cgroup_clear_directory(cgrp->dentry); - cgroup_populate_dir(cgrp); + cgroup_clear_directory(cgrp->dentry, false, removed_mask); + /* re-populate subsystem files */ + cgroup_populate_dir(cgrp, false, added_mask); if (opts.release_agent) strcpy(root->release_agent_path, opts.release_agent); @@ -1401,6 +1437,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) mutex_init(&cgrp->pidlist_mutex); INIT_LIST_HEAD(&cgrp->event_list); spin_lock_init(&cgrp->event_list_lock); + simple_xattrs_init(&cgrp->xattrs); } static void init_cgroup_root(struct cgroupfs_root *root) @@ -1455,8 +1492,8 @@ static int cgroup_test_super(struct super_block *sb, void *data) * If we asked for subsystems (or explicitly for no * subsystems) then they must match */ - if ((opts->subsys_bits || opts->none) - && (opts->subsys_bits != root->subsys_bits)) + if ((opts->subsys_mask || opts->none) + && (opts->subsys_mask != root->subsys_mask)) return 0; return 1; @@ -1466,7 +1503,7 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) { struct cgroupfs_root *root; - if (!opts->subsys_bits && !opts->none) + if (!opts->subsys_mask && !opts->none) return NULL; root = kzalloc(sizeof(*root), GFP_KERNEL); @@ -1479,7 +1516,7 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) } init_cgroup_root(root); - root->subsys_bits = opts->subsys_bits; + root->subsys_mask = opts->subsys_mask; root->flags = opts->flags; if (opts->release_agent) strcpy(root->release_agent_path, opts->release_agent); @@ -1511,7 +1548,7 @@ static int cgroup_set_super(struct super_block *sb, void *data) if (!opts->new_root) return -EINVAL; - BUG_ON(!opts->subsys_bits && !opts->none); + BUG_ON(!opts->subsys_mask && !opts->none); ret = set_anon_super(sb, NULL); if 
(ret) @@ -1629,7 +1666,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (ret) goto unlock_drop; - ret = rebind_subsystems(root, root->subsys_bits); + ret = rebind_subsystems(root, root->subsys_mask); if (ret == -EBUSY) { free_cg_links(&tmp_cg_links); goto unlock_drop; @@ -1669,7 +1706,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, BUG_ON(root->number_of_cgroups != 1); cred = override_creds(&init_cred); - cgroup_populate_dir(root_cgrp); + cgroup_populate_dir(root_cgrp, true, root->subsys_mask); revert_creds(cred); mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); @@ -1681,7 +1718,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, */ cgroup_drop_root(opts.new_root); /* no subsys rebinding, so refcounts don't change */ - drop_parsed_module_refcounts(opts.subsys_bits); + drop_parsed_module_refcounts(opts.subsys_mask); } kfree(opts.release_agent); @@ -1695,7 +1732,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, drop_new_super: deactivate_locked_super(sb); drop_modules: - drop_parsed_module_refcounts(opts.subsys_bits); + drop_parsed_module_refcounts(opts.subsys_mask); out_err: kfree(opts.release_agent); kfree(opts.name); @@ -1745,6 +1782,8 @@ static void cgroup_kill_sb(struct super_block *sb) { mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); + simple_xattrs_free(&cgrp->xattrs); + kill_litter_super(sb); cgroup_drop_root(root); } @@ -2551,6 +2590,64 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, return simple_rename(old_dir, old_dentry, new_dir, new_dentry); } +static struct simple_xattrs *__d_xattrs(struct dentry *dentry) +{ + if (S_ISDIR(dentry->d_inode->i_mode)) + return &__d_cgrp(dentry)->xattrs; + else + return &__d_cft(dentry)->xattrs; +} + +static inline int xattr_enabled(struct dentry *dentry) +{ + struct cgroupfs_root *root = dentry->d_sb->s_fs_info; + return test_bit(ROOT_XATTR, &root->flags); +} + 
+static bool is_valid_xattr(const char *name) +{ + if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || + !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) + return true; + return false; +} + +static int cgroup_setxattr(struct dentry *dentry, const char *name, + const void *val, size_t size, int flags) +{ + if (!xattr_enabled(dentry)) + return -EOPNOTSUPP; + if (!is_valid_xattr(name)) + return -EINVAL; + return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags); +} + +static int cgroup_removexattr(struct dentry *dentry, const char *name) +{ + if (!xattr_enabled(dentry)) + return -EOPNOTSUPP; + if (!is_valid_xattr(name)) + return -EINVAL; + return simple_xattr_remove(__d_xattrs(dentry), name); +} + +static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name, + void *buf, size_t size) +{ + if (!xattr_enabled(dentry)) + return -EOPNOTSUPP; + if (!is_valid_xattr(name)) + return -EINVAL; + return simple_xattr_get(__d_xattrs(dentry), name, buf, size); +} + +static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size) +{ + if (!xattr_enabled(dentry)) + return -EOPNOTSUPP; + return simple_xattr_list(__d_xattrs(dentry), buf, size); +} + static const struct file_operations cgroup_file_operations = { .read = cgroup_file_read, .write = cgroup_file_write, @@ -2559,11 +2656,22 @@ static const struct file_operations cgroup_file_operations = { .release = cgroup_file_release, }; +static const struct inode_operations cgroup_file_inode_operations = { + .setxattr = cgroup_setxattr, + .getxattr = cgroup_getxattr, + .listxattr = cgroup_listxattr, + .removexattr = cgroup_removexattr, +}; + static const struct inode_operations cgroup_dir_inode_operations = { .lookup = cgroup_lookup, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .rename = cgroup_rename, + .setxattr = cgroup_setxattr, + .getxattr = cgroup_getxattr, + .listxattr = cgroup_listxattr, + .removexattr = cgroup_removexattr, }; static struct dentry 
*cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) @@ -2611,6 +2719,7 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode, } else if (S_ISREG(mode)) { inode->i_size = 0; inode->i_fop = &cgroup_file_operations; + inode->i_op = &cgroup_file_inode_operations; } d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ @@ -2671,7 +2780,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft) } static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, - const struct cftype *cft) + struct cftype *cft) { struct dentry *dir = cgrp->dentry; struct cgroup *parent = __d_cgrp(dir); @@ -2681,6 +2790,8 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, umode_t mode; char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; + simple_xattrs_init(&cft->xattrs); + /* does @cft->flags tell us to skip creation on @cgrp? */ if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) return 0; @@ -2721,9 +2832,9 @@ out: } static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, - const struct cftype cfts[], bool is_add) + struct cftype cfts[], bool is_add) { - const struct cftype *cft; + struct cftype *cft; int err, ret = 0; for (cft = cfts; cft->name[0] != '\0'; cft++) { @@ -2757,7 +2868,7 @@ static void cgroup_cfts_prepare(void) } static void cgroup_cfts_commit(struct cgroup_subsys *ss, - const struct cftype *cfts, bool is_add) + struct cftype *cfts, bool is_add) __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex) { LIST_HEAD(pending); @@ -2808,7 +2919,7 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, * function currently returns 0 as long as @cfts registration is successful * even if some file creation attempts on existing cgroups fail. 
*/ -int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) +int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype_set *set; @@ -2838,7 +2949,7 @@ EXPORT_SYMBOL_GPL(cgroup_add_cftypes); * Returns 0 on successful unregistration, -ENOENT if @cfts is not * registered with @ss. */ -int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) +int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype_set *set; @@ -3843,18 +3954,29 @@ static struct cftype files[] = { { } /* terminate */ }; -static int cgroup_populate_dir(struct cgroup *cgrp) +/** + * cgroup_populate_dir - selectively creation of files in a directory + * @cgrp: target cgroup + * @base_files: true if the base files should be added + * @subsys_mask: mask of the subsystem ids whose files should be added + */ +static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, + unsigned long subsys_mask) { int err; struct cgroup_subsys *ss; - err = cgroup_addrm_files(cgrp, NULL, files, true); - if (err < 0) - return err; + if (base_files) { + err = cgroup_addrm_files(cgrp, NULL, files, true); + if (err < 0) + return err; + } /* process cftsets of each subsystem */ for_each_subsys(cgrp->root, ss) { struct cftype_set *set; + if (!test_bit(ss->subsys_id, &subsys_mask)) + continue; list_for_each_entry(set, &ss->cftsets, node) cgroup_addrm_files(cgrp, ss, set->cfts, true); @@ -3954,8 +4076,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); for_each_subsys(root, ss) { - struct cgroup_subsys_state *css = ss->create(cgrp); + struct cgroup_subsys_state *css; + css = ss->create(cgrp); if (IS_ERR(css)) { err = PTR_ERR(css); goto err_destroy; @@ -3969,6 +4092,15 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, /* At error, ->destroy() callback has to free assigned ID. 
*/ if (clone_children(parent) && ss->post_clone) ss->post_clone(cgrp); + + if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && + parent->parent) { + pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", + current->comm, current->pid, ss->name); + if (!strcmp(ss->name, "memory")) + pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); + ss->warned_broken_hierarchy = true; + } } list_add(&cgrp->sibling, &cgrp->parent->children); @@ -3988,7 +4120,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, list_add_tail(&cgrp->allcg_node, &root->allcg_list); - err = cgroup_populate_dir(cgrp); + err = cgroup_populate_dir(cgrp, true, root->subsys_mask); /* If err < 0, we have a half-filled directory - oh well ;) */ mutex_unlock(&cgroup_mutex); @@ -4321,8 +4453,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) * since cgroup_init_subsys will have already taken care of it. */ if (ss->module == NULL) { - /* a few sanity checks */ - BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT); + /* a sanity check */ BUG_ON(subsys[ss->subsys_id] != ss); return 0; } @@ -4330,24 +4461,8 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) /* init base cftset */ cgroup_init_cftsets(ss); - /* - * need to register a subsys id before anything else - for example, - * init_cgroup_css needs it. - */ mutex_lock(&cgroup_mutex); - /* find the first empty slot in the array */ - for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { - if (subsys[i] == NULL) - break; - } - if (i == CGROUP_SUBSYS_COUNT) { - /* maximum number of subsystems already registered! 
*/ - mutex_unlock(&cgroup_mutex); - return -EBUSY; - } - /* assign ourselves the subsys_id */ - ss->subsys_id = i; - subsys[i] = ss; + subsys[ss->subsys_id] = ss; /* * no ss->create seems to need anything important in the ss struct, so @@ -4356,7 +4471,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) css = ss->create(dummytop); if (IS_ERR(css)) { /* failure case - need to deassign the subsys[] slot. */ - subsys[i] = NULL; + subsys[ss->subsys_id] = NULL; mutex_unlock(&cgroup_mutex); return PTR_ERR(css); } @@ -4372,7 +4487,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) if (ret) { dummytop->subsys[ss->subsys_id] = NULL; ss->destroy(dummytop); - subsys[i] = NULL; + subsys[ss->subsys_id] = NULL; mutex_unlock(&cgroup_mutex); return ret; } @@ -4439,7 +4554,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) mutex_lock(&cgroup_mutex); /* deassign the subsys_id */ - BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT); subsys[ss->subsys_id] = NULL; /* remove subsystem from rootnode's list of subsystems */ @@ -4502,10 +4616,13 @@ int __init cgroup_init_early(void) for (i = 0; i < CSS_SET_TABLE_SIZE; i++) INIT_HLIST_HEAD(&css_set_table[i]); - /* at bootup time, we don't worry about modular subsystems */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + /* at bootup time, we don't worry about modular subsystems */ + if (!ss || ss->module) + continue; + BUG_ON(!ss->name); BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); BUG_ON(!ss->create); @@ -4538,9 +4655,12 @@ int __init cgroup_init(void) if (err) return err; - /* at bootup time, we don't worry about modular subsystems */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + + /* at bootup time, we don't worry about modular subsystems */ + if (!ss || ss->module) + continue; if (!ss->early_init) 
cgroup_init_subsys(ss); if (ss->use_id) @@ -4735,13 +4855,16 @@ void cgroup_fork_callbacks(struct task_struct *child) { if (need_forkexit_callback) { int i; - /* - * forkexit callbacks are only supported for builtin - * subsystems, and the builtin section of the subsys array is - * immutable, so we don't need to lock the subsys array here. - */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + + /* + * forkexit callbacks are only supported for + * builtin subsystems. + */ + if (!ss || ss->module) + continue; + if (ss->fork) ss->fork(child); } @@ -4846,12 +4969,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) tsk->cgroups = &init_css_set; if (run_callbacks && need_forkexit_callback) { - /* - * modular subsystems can't use callbacks, so no need to lock - * the subsys array - */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + + /* modular subsystems can't use callbacks */ + if (!ss || ss->module) + continue; + if (ss->exit) { struct cgroup *old_cgrp = rcu_dereference_raw(cg->subsys[i])->cgroup; @@ -5037,13 +5161,17 @@ static int __init cgroup_disable(char *str) while ((token = strsep(&str, ",")) != NULL) { if (!*token) continue; - /* - * cgroup_disable, being at boot time, can't know about module - * subsystems, so we don't worry about them. - */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + /* + * cgroup_disable, being at boot time, can't + * know about module subsystems, so we don't + * worry about them. 
+ */ + if (!ss || ss->module) + continue; + if (!strcmp(token, ss->name)) { ss->disabled = 1; printk(KERN_INFO "Disabling %s control group" diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 3649fc6b3eaa..b1724ce98981 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -373,4 +373,12 @@ struct cgroup_subsys freezer_subsys = { .can_attach = freezer_can_attach, .fork = freezer_fork, .base_cftypes = files, + + /* + * freezer subsys doesn't handle hierarchy at all. Frozen state + * should be inherited through the hierarchy - if a parent is + * frozen, all its children should be frozen. Fix it and remove + * the following. + */ + .broken_hierarchy = true, }; diff --git a/kernel/cpu.c b/kernel/cpu.c index e615dfbcf794..42bd331ee0ab 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -80,6 +80,10 @@ void put_online_cpus(void) if (cpu_hotplug.active_writer == current) return; mutex_lock(&cpu_hotplug.lock); + + if (WARN_ON(!cpu_hotplug.refcount)) + cpu_hotplug.refcount++; /* try to fix things up */ + if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) wake_up_process(cpu_hotplug.active_writer); mutex_unlock(&cpu_hotplug.lock); @@ -447,14 +451,6 @@ EXPORT_SYMBOL_GPL(cpu_up); #ifdef CONFIG_PM_SLEEP_SMP static cpumask_var_t frozen_cpus; -void __weak arch_disable_nonboot_cpus_begin(void) -{ -} - -void __weak arch_disable_nonboot_cpus_end(void) -{ -} - int disable_nonboot_cpus(void) { int cpu, first_cpu, error = 0; @@ -466,7 +462,6 @@ int disable_nonboot_cpus(void) * with the userspace trying to use the CPU hotplug at the same time */ cpumask_clear(frozen_cpus); - arch_disable_nonboot_cpus_begin(); printk("Disabling non-boot CPUs ...\n"); for_each_online_cpu(cpu) { @@ -482,8 +477,6 @@ int disable_nonboot_cpus(void) } } - arch_disable_nonboot_cpus_end(); - if (!error) { BUG_ON(num_online_cpus() > 1); /* Make sure the CPUs won't be enabled by someone else */ diff --git a/kernel/cred.c b/kernel/cred.c index de728ac50d82..48cea3da6d05 
100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -799,9 +799,15 @@ static void dump_invalid_creds(const struct cred *cred, const char *label, atomic_read(&cred->usage), read_cred_subscribers(cred)); printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n", - cred->uid, cred->euid, cred->suid, cred->fsuid); + from_kuid_munged(&init_user_ns, cred->uid), + from_kuid_munged(&init_user_ns, cred->euid), + from_kuid_munged(&init_user_ns, cred->suid), + from_kuid_munged(&init_user_ns, cred->fsuid)); printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n", - cred->gid, cred->egid, cred->sgid, cred->fsgid); + from_kgid_munged(&init_user_ns, cred->gid), + from_kgid_munged(&init_user_ns, cred->egid), + from_kgid_munged(&init_user_ns, cred->sgid), + from_kgid_munged(&init_user_ns, cred->fsgid)); #ifdef CONFIG_SECURITY printk(KERN_ERR "CRED: ->security is %p\n", cred->security); if ((unsigned long) cred->security >= PAGE_SIZE && diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 0557f24c6bca..17e073c309e6 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -672,6 +672,10 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) { struct kgdb_state kgdb_var; struct kgdb_state *ks = &kgdb_var; + int ret = 0; + + if (arch_kgdb_ops.enable_nmi) + arch_kgdb_ops.enable_nmi(0); ks->cpu = raw_smp_processor_id(); ks->ex_vector = evector; @@ -681,11 +685,15 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) ks->linux_regs = regs; if (kgdb_reenter_check(ks)) - return 0; /* Ouch, double exception ! */ + goto out; /* Ouch, double exception ! 
*/ if (kgdb_info[ks->cpu].enter_kgdb != 0) - return 0; + goto out; - return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); + ret = kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); +out: + if (arch_kgdb_ops.enable_nmi) + arch_kgdb_ops.enable_nmi(1); + return ret; } int kgdb_nmicallback(int cpu, void *regs) diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 31df1706b9a9..1261dc7eaeb9 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -21,6 +21,7 @@ #include <linux/smp.h> #include <linux/utsname.h> #include <linux/vmalloc.h> +#include <linux/atomic.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/init.h> @@ -2107,6 +2108,32 @@ static int kdb_dmesg(int argc, const char **argv) return 0; } #endif /* CONFIG_PRINTK */ + +/* Make sure we balance enable/disable calls, must disable first. */ +static atomic_t kdb_nmi_disabled; + +static int kdb_disable_nmi(int argc, const char *argv[]) +{ + if (atomic_read(&kdb_nmi_disabled)) + return 0; + atomic_set(&kdb_nmi_disabled, 1); + arch_kgdb_ops.enable_nmi(0); + return 0; +} + +static int kdb_param_enable_nmi(const char *val, const struct kernel_param *kp) +{ + if (!atomic_add_unless(&kdb_nmi_disabled, -1, 0)) + return -EINVAL; + arch_kgdb_ops.enable_nmi(1); + return 0; +} + +static const struct kernel_param_ops kdb_param_ops_enable_nmi = { + .set = kdb_param_enable_nmi, +}; +module_param_cb(enable_nmi, &kdb_param_ops_enable_nmi, NULL, 0600); + /* * kdb_cpu - This function implements the 'cpu' command. 
* cpu [<cpunum>] @@ -2851,6 +2878,10 @@ static void __init kdb_inittab(void) kdb_register_repeat("dmesg", kdb_dmesg, "[lines]", "Display syslog buffer", 0, KDB_REPEAT_NONE); #endif + if (arch_kgdb_ops.enable_nmi) { + kdb_register_repeat("disable_nmi", kdb_disable_nmi, "", + "Disable NMI entry to KDB", 0, KDB_REPEAT_NONE); + } kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"", "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE); kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>", diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 98d4597f43d6..c77206184b8b 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -159,6 +159,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) int rctx; struct perf_callchain_entry *entry; + int kernel = !event->attr.exclude_callchain_kernel; + int user = !event->attr.exclude_callchain_user; + + if (!kernel && !user) + return NULL; entry = get_callchain_entry(&rctx); if (rctx == -1) @@ -169,24 +174,29 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) entry->nr = 0; - if (!user_mode(regs)) { + if (kernel && !user_mode(regs)) { perf_callchain_store(entry, PERF_CONTEXT_KERNEL); perf_callchain_kernel(entry, regs); - if (current->mm) - regs = task_pt_regs(current); - else - regs = NULL; } - if (regs) { - /* - * Disallow cross-task user callchains. - */ - if (event->ctx->task && event->ctx->task != current) - goto exit_put; - - perf_callchain_store(entry, PERF_CONTEXT_USER); - perf_callchain_user(entry, regs); + if (user) { + if (!user_mode(regs)) { + if (current->mm) + regs = task_pt_regs(current); + else + regs = NULL; + } + + if (regs) { + /* + * Disallow cross-task user callchains. 
+ */ + if (event->ctx->task && event->ctx->task != current) + goto exit_put; + + perf_callchain_store(entry, PERF_CONTEXT_USER); + perf_callchain_user(entry, regs); + } } exit_put: diff --git a/kernel/events/core.c b/kernel/events/core.c index 7fee567153f0..cda3ebd49e86 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -36,6 +36,7 @@ #include <linux/perf_event.h> #include <linux/ftrace_event.h> #include <linux/hw_breakpoint.h> +#include <linux/mm_types.h> #include "internal.h" @@ -467,14 +468,13 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, { struct perf_cgroup *cgrp; struct cgroup_subsys_state *css; - struct file *file; - int ret = 0, fput_needed; + struct fd f = fdget(fd); + int ret = 0; - file = fget_light(fd, &fput_needed); - if (!file) + if (!f.file) return -EBADF; - css = cgroup_css_from_dir(file, perf_subsys_id); + css = cgroup_css_from_dir(f.file, perf_subsys_id); if (IS_ERR(css)) { ret = PTR_ERR(css); goto out; @@ -500,7 +500,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, ret = -EINVAL; } out: - fput_light(file, fput_needed); + fdput(f); return ret; } @@ -3233,21 +3233,18 @@ unlock: static const struct file_operations perf_fops; -static struct file *perf_fget_light(int fd, int *fput_needed) +static inline int perf_fget_light(int fd, struct fd *p) { - struct file *file; - - file = fget_light(fd, fput_needed); - if (!file) - return ERR_PTR(-EBADF); + struct fd f = fdget(fd); + if (!f.file) + return -EBADF; - if (file->f_op != &perf_fops) { - fput_light(file, *fput_needed); - *fput_needed = 0; - return ERR_PTR(-EBADF); + if (f.file->f_op != &perf_fops) { + fdput(f); + return -EBADF; } - - return file; + *p = f; + return 0; } static int perf_event_set_output(struct perf_event *event, @@ -3279,22 +3276,19 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_EVENT_IOC_SET_OUTPUT: { - struct file *output_file = NULL; - struct perf_event *output_event = 
NULL; - int fput_needed = 0; int ret; - if (arg != -1) { - output_file = perf_fget_light(arg, &fput_needed); - if (IS_ERR(output_file)) - return PTR_ERR(output_file); - output_event = output_file->private_data; + struct perf_event *output_event; + struct fd output; + ret = perf_fget_light(arg, &output); + if (ret) + return ret; + output_event = output.file->private_data; + ret = perf_event_set_output(event, output_event); + fdput(output); + } else { + ret = perf_event_set_output(event, NULL); } - - ret = perf_event_set_output(event, output_event); - if (output_event) - fput_light(output_file, fput_needed); - return ret; } @@ -3677,7 +3671,7 @@ unlock: atomic_inc(&event->mmap_count); mutex_unlock(&event->mmap_mutex); - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &perf_mmap_vmops; return ret; @@ -3764,6 +3758,132 @@ int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); +static void +perf_output_sample_regs(struct perf_output_handle *handle, + struct pt_regs *regs, u64 mask) +{ + int bit; + + for_each_set_bit(bit, (const unsigned long *) &mask, + sizeof(mask) * BITS_PER_BYTE) { + u64 val; + + val = perf_reg_value(regs, bit); + perf_output_put(handle, val); + } +} + +static void perf_sample_regs_user(struct perf_regs_user *regs_user, + struct pt_regs *regs) +{ + if (!user_mode(regs)) { + if (current->mm) + regs = task_pt_regs(current); + else + regs = NULL; + } + + if (regs) { + regs_user->regs = regs; + regs_user->abi = perf_reg_abi(current); + } +} + +/* + * Get remaining task size from user stack pointer. + * + * It'd be better to take stack vma map and limit this more + * precisly, but there's no way to get it safely under interrupt, + * so using TASK_SIZE as limit. 
+ */ +static u64 perf_ustack_task_size(struct pt_regs *regs) +{ + unsigned long addr = perf_user_stack_pointer(regs); + + if (!addr || addr >= TASK_SIZE) + return 0; + + return TASK_SIZE - addr; +} + +static u16 +perf_sample_ustack_size(u16 stack_size, u16 header_size, + struct pt_regs *regs) +{ + u64 task_size; + + /* No regs, no stack pointer, no dump. */ + if (!regs) + return 0; + + /* + * Check if we fit in with the requested stack size into the: + * - TASK_SIZE + * If we don't, we limit the size to the TASK_SIZE. + * + * - remaining sample size + * If we don't, we customize the stack size to + * fit in to the remaining sample size. + */ + + task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs)); + stack_size = min(stack_size, (u16) task_size); + + /* Current header size plus static size and dynamic size. */ + header_size += 2 * sizeof(u64); + + /* Do we fit in with the current stack dump size? */ + if ((u16) (header_size + stack_size) < header_size) { + /* + * If we overflow the maximum size for the sample, + * we customize the stack dump size to fit in. + */ + stack_size = USHRT_MAX - header_size - sizeof(u64); + stack_size = round_up(stack_size, sizeof(u64)); + } + + return stack_size; +} + +static void +perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, + struct pt_regs *regs) +{ + /* Case of a kernel thread, nothing to dump */ + if (!regs) { + u64 size = 0; + perf_output_put(handle, size); + } else { + unsigned long sp; + unsigned int rem; + u64 dyn_size; + + /* + * We dump: + * static size + * - the size requested by user or the best one we can fit + * in to the sample max size + * data + * - user stack dump data + * dynamic size + * - the actual dumped size + */ + + /* Static size. */ + perf_output_put(handle, dump_size); + + /* Data. */ + sp = perf_user_stack_pointer(regs); + rem = __output_copy_user(handle, (void *) sp, dump_size); + dyn_size = dump_size - rem; + + perf_output_skip(handle, rem); + + /* Dynamic size. 
*/ + perf_output_put(handle, dyn_size); + } +} + static void __perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event) @@ -4024,6 +4144,28 @@ void perf_output_sample(struct perf_output_handle *handle, perf_output_put(handle, nr); } } + + if (sample_type & PERF_SAMPLE_REGS_USER) { + u64 abi = data->regs_user.abi; + + /* + * If there are no regs to dump, notice it through + * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE). + */ + perf_output_put(handle, abi); + + if (abi) { + u64 mask = event->attr.sample_regs_user; + perf_output_sample_regs(handle, + data->regs_user.regs, + mask); + } + } + + if (sample_type & PERF_SAMPLE_STACK_USER) + perf_output_sample_ustack(handle, + data->stack_user_size, + data->regs_user.regs); } void perf_prepare_sample(struct perf_event_header *header, @@ -4075,6 +4217,49 @@ void perf_prepare_sample(struct perf_event_header *header, } header->size += size; } + + if (sample_type & PERF_SAMPLE_REGS_USER) { + /* regs dump ABI info */ + int size = sizeof(u64); + + perf_sample_regs_user(&data->regs_user, regs); + + if (data->regs_user.regs) { + u64 mask = event->attr.sample_regs_user; + size += hweight64(mask) * sizeof(u64); + } + + header->size += size; + } + + if (sample_type & PERF_SAMPLE_STACK_USER) { + /* + * Either we need PERF_SAMPLE_STACK_USER bit to be allways + * processed as the last one or have additional check added + * in case new sample type is added, because we could eat + * up the rest of the sample size. + */ + struct perf_regs_user *uregs = &data->regs_user; + u16 stack_size = event->attr.sample_stack_user; + u16 size = sizeof(u64); + + if (!uregs->abi) + perf_sample_regs_user(uregs, regs); + + stack_size = perf_sample_ustack_size(stack_size, header->size, + uregs->regs); + + /* + * If there is something to dump, add space for the dump + * itself and for the field that tells the dynamic size, + * which is how many have been actually dumped. 
+ */ + if (stack_size) + size += sizeof(u64) + stack_size; + + data->stack_user_size = stack_size; + header->size += size; + } } static void perf_event_output(struct perf_event *event, @@ -6151,6 +6336,28 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, attr->branch_sample_type = mask; } } + + if (attr->sample_type & PERF_SAMPLE_REGS_USER) { + ret = perf_reg_validate(attr->sample_regs_user); + if (ret) + return ret; + } + + if (attr->sample_type & PERF_SAMPLE_STACK_USER) { + if (!arch_perf_have_user_stack_dump()) + return -ENOSYS; + + /* + * We have __u32 type for the size, but so far + * we can only use __u16 as maximum due to the + * __u16 sample size limit. + */ + if (attr->sample_stack_user >= USHRT_MAX) + ret = -EINVAL; + else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64))) + ret = -EINVAL; + } + out: return ret; @@ -6229,12 +6436,11 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event_attr attr; struct perf_event_context *ctx; struct file *event_file = NULL; - struct file *group_file = NULL; + struct fd group = {NULL, 0}; struct task_struct *task = NULL; struct pmu *pmu; int event_fd; int move_group = 0; - int fput_needed = 0; int err; /* for future expandability... */ @@ -6264,17 +6470,15 @@ SYSCALL_DEFINE5(perf_event_open, if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) return -EINVAL; - event_fd = get_unused_fd_flags(O_RDWR); + event_fd = get_unused_fd(); if (event_fd < 0) return event_fd; if (group_fd != -1) { - group_file = perf_fget_light(group_fd, &fput_needed); - if (IS_ERR(group_file)) { - err = PTR_ERR(group_file); + err = perf_fget_light(group_fd, &group); + if (err) goto err_fd; - } - group_leader = group_file->private_data; + group_leader = group.file->private_data; if (flags & PERF_FLAG_FD_OUTPUT) output_event = group_leader; if (flags & PERF_FLAG_FD_NO_GROUP) @@ -6450,7 +6654,7 @@ SYSCALL_DEFINE5(perf_event_open, * of the group leader will find the pointer to itself in * perf_group_detach(). 
*/ - fput_light(group_file, fput_needed); + fdput(group); fd_install(event_fd, event_file); return event_fd; @@ -6464,7 +6668,7 @@ err_task: if (task) put_task_struct(task); err_group_fd: - fput_light(group_file, fput_needed); + fdput(group); err_fd: put_unused_fd(event_fd); return err; @@ -7289,5 +7493,12 @@ struct cgroup_subsys perf_subsys = { .destroy = perf_cgroup_destroy, .exit = perf_cgroup_exit, .attach = perf_cgroup_attach, + + /* + * perf_event cgroup doesn't handle nesting correctly. + * ctx->nr_cgroups adjustments should be propagated through the + * cgroup hierarchy. Fix it and remove the following. + */ + .broken_hierarchy = true, }; #endif /* CONFIG_CGROUP_PERF */ diff --git a/kernel/events/internal.h b/kernel/events/internal.h index a096c19f2c2a..d56a64c99a8b 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -2,6 +2,7 @@ #define _KERNEL_EVENTS_INTERNAL_H #include <linux/hardirq.h> +#include <linux/uaccess.h> /* Buffer handling */ @@ -76,30 +77,53 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb) return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); } -static inline void -__output_copy(struct perf_output_handle *handle, - const void *buf, unsigned int len) +#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ +static inline unsigned int \ +func_name(struct perf_output_handle *handle, \ + const void *buf, unsigned int len) \ +{ \ + unsigned long size, written; \ + \ + do { \ + size = min_t(unsigned long, handle->size, len); \ + \ + written = memcpy_func(handle->addr, buf, size); \ + \ + len -= written; \ + handle->addr += written; \ + buf += written; \ + handle->size -= written; \ + if (!handle->size) { \ + struct ring_buffer *rb = handle->rb; \ + \ + handle->page++; \ + handle->page &= rb->nr_pages - 1; \ + handle->addr = rb->data_pages[handle->page]; \ + handle->size = PAGE_SIZE << page_order(rb); \ + } \ + } while (len && written == size); \ + \ + return len; \ +} + +static inline int memcpy_common(void 
*dst, const void *src, size_t n) { - do { - unsigned long size = min_t(unsigned long, handle->size, len); - - memcpy(handle->addr, buf, size); - - len -= size; - handle->addr += size; - buf += size; - handle->size -= size; - if (!handle->size) { - struct ring_buffer *rb = handle->rb; - - handle->page++; - handle->page &= rb->nr_pages - 1; - handle->addr = rb->data_pages[handle->page]; - handle->size = PAGE_SIZE << page_order(rb); - } - } while (len); + memcpy(dst, src, n); + return n; } +DEFINE_OUTPUT_COPY(__output_copy, memcpy_common) + +#define MEMCPY_SKIP(dst, src, n) (n) + +DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP) + +#ifndef arch_perf_out_copy_user +#define arch_perf_out_copy_user __copy_from_user_inatomic +#endif + +DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) + /* Callchain handling */ extern struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs); @@ -134,4 +158,20 @@ static inline void put_recursion_context(int *recursion, int rctx) recursion[rctx]--; } +#ifdef CONFIG_HAVE_PERF_USER_STACK_DUMP +static inline bool arch_perf_have_user_stack_dump(void) +{ + return true; +} + +#define perf_user_stack_pointer(regs) user_stack_pointer(regs) +#else +static inline bool arch_perf_have_user_stack_dump(void) +{ + return false; +} + +#define perf_user_stack_pointer(regs) 0 +#endif /* CONFIG_HAVE_PERF_USER_STACK_DUMP */ + #endif /* _KERNEL_EVENTS_INTERNAL_H */ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 6ddaba43fb7a..23cb34ff3973 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -182,10 +182,16 @@ out: return -ENOSPC; } -void perf_output_copy(struct perf_output_handle *handle, +unsigned int perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len) { - __output_copy(handle, buf, len); + return __output_copy(handle, buf, len); +} + +unsigned int perf_output_skip(struct perf_output_handle *handle, + unsigned int len) +{ + 
return __output_skip(handle, NULL, len); } void perf_output_end(struct perf_output_handle *handle) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c08a22d02f72..98256bc71ee1 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -141,10 +141,14 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, spinlock_t *ptl; pte_t *ptep; int err; + /* For mmu_notifiers */ + const unsigned long mmun_start = addr; + const unsigned long mmun_end = addr + PAGE_SIZE; /* For try_to_free_swap() and munlock_vma_page() below */ lock_page(page); + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); err = -EAGAIN; ptep = page_check_address(page, mm, addr, &ptl, 0); if (!ptep) @@ -173,6 +177,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, err = 0; unlock: + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); unlock_page(page); return err; } @@ -280,12 +285,10 @@ static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_ if (ret <= 0) return ret; - lock_page(page); vaddr_new = kmap_atomic(page); vaddr &= ~PAGE_MASK; memcpy(opcode, vaddr_new + vaddr, UPROBE_SWBP_INSN_SIZE); kunmap_atomic(vaddr_new); - unlock_page(page); put_page(page); @@ -334,7 +337,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned */ result = is_swbp_at_addr(mm, vaddr); if (result == 1) - return -EEXIST; + return 0; if (result) return result; @@ -347,24 +350,22 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned * @mm: the probed process address space. * @auprobe: arch specific probepoint information. * @vaddr: the virtual address to insert the opcode. - * @verify: if true, verify existance of breakpoint instruction. * * For mm @mm, restore the original opcode (opcode) at @vaddr. * Return 0 (success) or a negative errno. 
*/ int __weak -set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, bool verify) +set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { - if (verify) { - int result; + int result; + + result = is_swbp_at_addr(mm, vaddr); + if (!result) + return -EINVAL; - result = is_swbp_at_addr(mm, vaddr); - if (!result) - return -EINVAL; + if (result != 1) + return result; - if (result != 1) - return result; - } return write_opcode(auprobe, mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); } @@ -415,11 +416,10 @@ static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset) static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) { struct uprobe *uprobe; - unsigned long flags; - spin_lock_irqsave(&uprobes_treelock, flags); + spin_lock(&uprobes_treelock); uprobe = __find_uprobe(inode, offset); - spin_unlock_irqrestore(&uprobes_treelock, flags); + spin_unlock(&uprobes_treelock); return uprobe; } @@ -466,12 +466,11 @@ static struct uprobe *__insert_uprobe(struct uprobe *uprobe) */ static struct uprobe *insert_uprobe(struct uprobe *uprobe) { - unsigned long flags; struct uprobe *u; - spin_lock_irqsave(&uprobes_treelock, flags); + spin_lock(&uprobes_treelock); u = __insert_uprobe(uprobe); - spin_unlock_irqrestore(&uprobes_treelock, flags); + spin_unlock(&uprobes_treelock); /* For now assume that the instruction need not be single-stepped */ uprobe->flags |= UPROBE_SKIP_SSTEP; @@ -649,6 +648,7 @@ static int install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long vaddr) { + bool first_uprobe; int ret; /* @@ -659,7 +659,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, * Hence behave as if probe already existed. 
*/ if (!uprobe->consumers) - return -EEXIST; + return 0; if (!(uprobe->flags & UPROBE_COPY_INSN)) { ret = copy_insn(uprobe, vma->vm_file); @@ -681,17 +681,18 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, } /* - * Ideally, should be updating the probe count after the breakpoint - * has been successfully inserted. However a thread could hit the - * breakpoint we just inserted even before the probe count is - * incremented. If this is the first breakpoint placed, breakpoint - * notifier might ignore uprobes and pass the trap to the thread. - * Hence increment before and decrement on failure. + * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(), + * the task can hit this breakpoint right after __replace_page(). */ - atomic_inc(&mm->uprobes_state.count); + first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags); + if (first_uprobe) + set_bit(MMF_HAS_UPROBES, &mm->flags); + ret = set_swbp(&uprobe->arch, mm, vaddr); - if (ret) - atomic_dec(&mm->uprobes_state.count); + if (!ret) + clear_bit(MMF_RECALC_UPROBES, &mm->flags); + else if (first_uprobe) + clear_bit(MMF_HAS_UPROBES, &mm->flags); return ret; } @@ -699,8 +700,12 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, static void remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) { - if (!set_orig_insn(&uprobe->arch, mm, vaddr, true)) - atomic_dec(&mm->uprobes_state.count); + /* can happen if uprobe_register() fails */ + if (!test_bit(MMF_HAS_UPROBES, &mm->flags)) + return; + + set_bit(MMF_RECALC_UPROBES, &mm->flags); + set_orig_insn(&uprobe->arch, mm, vaddr); } /* @@ -710,11 +715,9 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad */ static void delete_uprobe(struct uprobe *uprobe) { - unsigned long flags; - - spin_lock_irqsave(&uprobes_treelock, flags); + spin_lock(&uprobes_treelock); rb_erase(&uprobe->rb_node, &uprobes_tree); - spin_unlock_irqrestore(&uprobes_treelock, flags); + 
spin_unlock(&uprobes_treelock); iput(uprobe->inode); put_uprobe(uprobe); atomic_dec(&uprobe_events); @@ -737,7 +740,6 @@ static struct map_info * build_map_info(struct address_space *mapping, loff_t offset, bool is_register) { unsigned long pgoff = offset >> PAGE_SHIFT; - struct prio_tree_iter iter; struct vm_area_struct *vma; struct map_info *curr = NULL; struct map_info *prev = NULL; @@ -746,7 +748,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) again: mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (!valid_vma(vma, is_register)) continue; @@ -831,17 +833,11 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) vaddr_to_offset(vma, info->vaddr) != uprobe->offset) goto unlock; - if (is_register) { + if (is_register) err = install_breakpoint(uprobe, mm, vma, info->vaddr); - /* - * We can race against uprobe_mmap(), see the - * comment near uprobe_hash(). 
- */ - if (err == -EEXIST) - err = 0; - } else { + else remove_breakpoint(uprobe, mm, info->vaddr); - } + unlock: up_write(&mm->mmap_sem); free: @@ -908,7 +904,8 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * } mutex_unlock(uprobes_hash(inode)); - put_uprobe(uprobe); + if (uprobe) + put_uprobe(uprobe); return ret; } @@ -978,7 +975,6 @@ static void build_probe_list(struct inode *inode, struct list_head *head) { loff_t min, max; - unsigned long flags; struct rb_node *n, *t; struct uprobe *u; @@ -986,7 +982,7 @@ static void build_probe_list(struct inode *inode, min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; - spin_lock_irqsave(&uprobes_treelock, flags); + spin_lock(&uprobes_treelock); n = find_node_in_range(inode, min, max); if (n) { for (t = n; t; t = rb_prev(t)) { @@ -1004,27 +1000,20 @@ static void build_probe_list(struct inode *inode, atomic_inc(&u->ref); } } - spin_unlock_irqrestore(&uprobes_treelock, flags); + spin_unlock(&uprobes_treelock); } /* - * Called from mmap_region. - * called with mm->mmap_sem acquired. + * Called from mmap_region/vma_adjust with mm->mmap_sem acquired. * - * Return -ve no if we fail to insert probes and we cannot - * bail-out. - * Return 0 otherwise. i.e: - * - * - successful insertion of probes - * - (or) no possible probes to be inserted. - * - (or) insertion of probes failed but we can bail-out. + * Currently we ignore all errors and always return 0, the callers + * can't handle the failure anyway. 
*/ int uprobe_mmap(struct vm_area_struct *vma) { struct list_head tmp_list; struct uprobe *uprobe, *u; struct inode *inode; - int ret, count; if (!atomic_read(&uprobe_events) || !valid_vma(vma, true)) return 0; @@ -1036,44 +1025,35 @@ int uprobe_mmap(struct vm_area_struct *vma) mutex_lock(uprobes_mmap_hash(inode)); build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); - ret = 0; - count = 0; - list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { - if (!ret) { + if (!fatal_signal_pending(current)) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); - - ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); - /* - * We can race against uprobe_register(), see the - * comment near uprobe_hash(). - */ - if (ret == -EEXIST) { - ret = 0; - - if (!is_swbp_at_addr(vma->vm_mm, vaddr)) - continue; - - /* - * Unable to insert a breakpoint, but - * breakpoint lies underneath. Increment the - * probe count. - */ - atomic_inc(&vma->vm_mm->uprobes_state.count); - } - - if (!ret) - count++; + install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); } put_uprobe(uprobe); } - mutex_unlock(uprobes_mmap_hash(inode)); - if (ret) - atomic_sub(count, &vma->vm_mm->uprobes_state.count); + return 0; +} - return ret; +static bool +vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end) +{ + loff_t min, max; + struct inode *inode; + struct rb_node *n; + + inode = vma->vm_file->f_mapping->host; + + min = vaddr_to_offset(vma, start); + max = min + (end - start) - 1; + + spin_lock(&uprobes_treelock); + n = find_node_in_range(inode, min, max); + spin_unlock(&uprobes_treelock); + + return !!n; } /* @@ -1081,37 +1061,18 @@ int uprobe_mmap(struct vm_area_struct *vma) */ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - struct list_head tmp_list; - struct uprobe *uprobe, *u; - struct inode *inode; - if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) return; if 
(!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ return; - if (!atomic_read(&vma->vm_mm->uprobes_state.count)) - return; - - inode = vma->vm_file->f_mapping->host; - if (!inode) + if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) || + test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags)) return; - mutex_lock(uprobes_mmap_hash(inode)); - build_probe_list(inode, vma, start, end, &tmp_list); - - list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { - unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); - /* - * An unregister could have removed the probe before - * unmap. So check before we decrement the count. - */ - if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1) - atomic_dec(&vma->vm_mm->uprobes_state.count); - put_uprobe(uprobe); - } - mutex_unlock(uprobes_mmap_hash(inode)); + if (vma_has_uprobes(vma, start, end)) + set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags); } /* Slot allocation for XOL */ @@ -1213,13 +1174,15 @@ void uprobe_clear_state(struct mm_struct *mm) kfree(area); } -/* - * uprobe_reset_state - Free the area allocated for slots. - */ -void uprobe_reset_state(struct mm_struct *mm) +void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) { - mm->uprobes_state.xol_area = NULL; - atomic_set(&mm->uprobes_state.count, 0); + newmm->uprobes_state.xol_area = NULL; + + if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) { + set_bit(MMF_HAS_UPROBES, &newmm->flags); + /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */ + set_bit(MMF_RECALC_UPROBES, &newmm->flags); + } } /* @@ -1437,6 +1400,25 @@ static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) return false; } +static void mmf_recalc_uprobes(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (!valid_vma(vma, false)) + continue; + /* + * This is not strictly accurate, we can race with + * uprobe_unregister() and see the already removed + * uprobe if delete_uprobe() was not yet called. 
+ */ + if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end)) + return; + } + + clear_bit(MMF_HAS_UPROBES, &mm->flags); +} + static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) { struct mm_struct *mm = current->mm; @@ -1458,11 +1440,24 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) } else { *is_swbp = -EFAULT; } + + if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags)) + mmf_recalc_uprobes(mm); up_read(&mm->mmap_sem); return uprobe; } +void __weak arch_uprobe_enable_step(struct arch_uprobe *arch) +{ + user_enable_single_step(current); +} + +void __weak arch_uprobe_disable_step(struct arch_uprobe *arch) +{ + user_disable_single_step(current); +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. @@ -1509,7 +1504,7 @@ static void handle_swbp(struct pt_regs *regs) utask->state = UTASK_SSTEP; if (!pre_ssout(uprobe, regs, bp_vaddr)) { - user_enable_single_step(current); + arch_uprobe_enable_step(&uprobe->arch); return; } @@ -1518,17 +1513,15 @@ cleanup_ret: utask->active_uprobe = NULL; utask->state = UTASK_RUNNING; } - if (uprobe) { - if (!(uprobe->flags & UPROBE_SKIP_SSTEP)) + if (!(uprobe->flags & UPROBE_SKIP_SSTEP)) - /* - * cannot singlestep; cannot skip instruction; - * re-execute the instruction. - */ - instruction_pointer_set(regs, bp_vaddr); + /* + * cannot singlestep; cannot skip instruction; + * re-execute the instruction. 
+ */ + instruction_pointer_set(regs, bp_vaddr); - put_uprobe(uprobe); - } + put_uprobe(uprobe); } /* @@ -1547,10 +1540,10 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) else WARN_ON_ONCE(1); + arch_uprobe_disable_step(&uprobe->arch); put_uprobe(uprobe); utask->active_uprobe = NULL; utask->state = UTASK_RUNNING; - user_disable_single_step(current); xol_free_insn_slot(current); spin_lock_irq(&current->sighand->siglock); @@ -1589,8 +1582,7 @@ int uprobe_pre_sstep_notifier(struct pt_regs *regs) { struct uprobe_task *utask; - if (!current->mm || !atomic_read(&current->mm->uprobes_state.count)) - /* task is currently not uprobed */ + if (!current->mm || !test_bit(MMF_HAS_UPROBES, &current->mm->flags)) return 0; utask = current->utask; diff --git a/kernel/exit.c b/kernel/exit.c index f65345f9e5bb..346616c0092c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -457,108 +457,13 @@ void daemonize(const char *name, ...) /* Become as one with the init task */ daemonize_fs_struct(); - exit_files(current); - current->files = init_task.files; - atomic_inc(&current->files->count); + daemonize_descriptors(); reparent_to_kthreadd(); } EXPORT_SYMBOL(daemonize); -static void close_files(struct files_struct * files) -{ - int i, j; - struct fdtable *fdt; - - j = 0; - - /* - * It is safe to dereference the fd table without RCU or - * ->file_lock because this is the last reference to the - * files structure. But use RCU to shut RCU-lockdep up. 
- */ - rcu_read_lock(); - fdt = files_fdtable(files); - rcu_read_unlock(); - for (;;) { - unsigned long set; - i = j * BITS_PER_LONG; - if (i >= fdt->max_fds) - break; - set = fdt->open_fds[j++]; - while (set) { - if (set & 1) { - struct file * file = xchg(&fdt->fd[i], NULL); - if (file) { - filp_close(file, files); - cond_resched(); - } - } - i++; - set >>= 1; - } - } -} - -struct files_struct *get_files_struct(struct task_struct *task) -{ - struct files_struct *files; - - task_lock(task); - files = task->files; - if (files) - atomic_inc(&files->count); - task_unlock(task); - - return files; -} - -void put_files_struct(struct files_struct *files) -{ - struct fdtable *fdt; - - if (atomic_dec_and_test(&files->count)) { - close_files(files); - /* - * Free the fd and fdset arrays if we expanded them. - * If the fdtable was embedded, pass files for freeing - * at the end of the RCU grace period. Otherwise, - * you can free files immediately. - */ - rcu_read_lock(); - fdt = files_fdtable(files); - if (fdt != &files->fdtab) - kmem_cache_free(files_cachep, files); - free_fdtable(fdt); - rcu_read_unlock(); - } -} - -void reset_files_struct(struct files_struct *files) -{ - struct task_struct *tsk = current; - struct files_struct *old; - - old = tsk->files; - task_lock(tsk); - tsk->files = files; - task_unlock(tsk); - put_files_struct(old); -} - -void exit_files(struct task_struct *tsk) -{ - struct files_struct * files = tsk->files; - - if (files) { - task_lock(tsk); - tsk->files = NULL; - task_unlock(tsk); - put_files_struct(files); - } -} - #ifdef CONFIG_MM_OWNER /* * A task is exiting. If it owned this mm, find a new owner for the mm. 
@@ -1046,6 +951,9 @@ void do_exit(long code) if (tsk->splice_pipe) __free_pipe_info(tsk->splice_pipe); + if (tsk->task_frag.page) + put_page(tsk->task_frag.page); + validate_creds_for_do_exit(tsk); preempt_disable(); diff --git a/kernel/fork.c b/kernel/fork.c index 2c8857e12855..8b20ab7d3aa2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -330,6 +330,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) tsk->btrace_seq = 0; #endif tsk->splice_pipe = NULL; + tsk->task_frag.page = NULL; account_kernel_stack(ti, 1); @@ -353,6 +354,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) down_write(&oldmm->mmap_sem); flush_cache_dup_mm(oldmm); + uprobe_dup_mmap(oldmm, mm); /* * Not linked in yet - no deadlock potential: */ @@ -421,7 +423,12 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) mapping->i_mmap_writable++; flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ - vma_prio_tree_add(tmp, mpnt); + if (unlikely(tmp->vm_flags & VM_NONLINEAR)) + vma_nonlinear_insert(tmp, + &mapping->i_mmap_nonlinear); + else + vma_interval_tree_insert_after(tmp, mpnt, + &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); } @@ -454,9 +461,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) if (retval) goto out; - - if (file) - uprobe_mmap(tmp); } /* a new mm has just been created */ arch_dup_mmap(oldmm, mm); @@ -623,26 +627,6 @@ void mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); -/* - * We added or removed a vma mapping the executable. The vmas are only mapped - * during exec and are not mapped with the mmap system call. 
- * Callers must hold down_write() on the mm's mmap_sem for these - */ -void added_exe_file_vma(struct mm_struct *mm) -{ - mm->num_exe_file_vmas++; -} - -void removed_exe_file_vma(struct mm_struct *mm) -{ - mm->num_exe_file_vmas--; - if ((mm->num_exe_file_vmas == 0) && mm->exe_file) { - fput(mm->exe_file); - mm->exe_file = NULL; - } - -} - void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) { if (new_exe_file) @@ -650,15 +634,13 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) if (mm->exe_file) fput(mm->exe_file); mm->exe_file = new_exe_file; - mm->num_exe_file_vmas = 0; } struct file *get_mm_exe_file(struct mm_struct *mm) { struct file *exe_file; - /* We need mmap_sem to protect against races with removal of - * VM_EXECUTABLE vmas */ + /* We need mmap_sem to protect against races with removal of exe_file */ down_read(&mm->mmap_sem); exe_file = mm->exe_file; if (exe_file) @@ -839,8 +821,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk) #ifdef CONFIG_TRANSPARENT_HUGEPAGE mm->pmd_huge_pte = NULL; #endif - uprobe_reset_state(mm); - if (!mm_init(mm, tsk)) goto fail_nomem; @@ -1081,7 +1061,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) init_rwsem(&sig->group_rwsem); #endif - sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; @@ -1280,11 +1259,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #endif #ifdef CONFIG_TRACE_IRQFLAGS p->irq_events = 0; -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - p->hardirqs_enabled = 1; -#else p->hardirqs_enabled = 0; -#endif p->hardirq_enable_ip = 0; p->hardirq_enable_event = 0; p->hardirq_disable_ip = _THIS_IP_; @@ -1609,7 +1584,7 @@ long do_fork(unsigned long clone_flags, * requested, no event is reported; otherwise, report if the event * for the type of forking is enabled. 
*/ - if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) { + if (!(clone_flags & CLONE_UNTRACED) && likely(user_mode(regs))) { if (clone_flags & CLONE_VFORK) trace = PTRACE_EVENT_VFORK; else if ((clone_flags & CSIGNAL) != SIGCHLD) @@ -1659,6 +1634,17 @@ long do_fork(unsigned long clone_flags, return nr; } +#ifdef CONFIG_GENERIC_KERNEL_THREAD +/* + * Create a kernel thread. + */ +pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) +{ + return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, NULL, + (unsigned long)arg, NULL, NULL); +} +#endif + #ifndef ARCH_MIN_MMSTRUCT_ALIGN #define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index eebd6d5cfb44..57d86d07221e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -671,6 +671,7 @@ irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, irq_set_chip(irq, chip); __irq_set_handler(irq, handle, 0, name); } +EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name); void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) { diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c index b5fcd96c7102..988dc58e8847 100644 --- a/kernel/irq/dummychip.c +++ b/kernel/irq/dummychip.c @@ -6,6 +6,7 @@ */ #include <linux/interrupt.h> #include <linux/irq.h> +#include <linux/export.h> #include "internals.h" @@ -57,3 +58,4 @@ struct irq_chip dummy_irq_chip = { .irq_mask = noop, .irq_unmask = noop, }; +EXPORT_SYMBOL_GPL(dummy_irq_chip); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 49a77727db42..4e69e24d3d7d 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -148,7 +148,8 @@ static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, * @host_data: Controller private data pointer * * Allocates a legacy irq_domain if irq_base is positive or a linear - * domain otherwise. + * domain otherwise. For the legacy domain, IRQ descriptors will also + * be allocated. 
* * This is intended to implement the expected behaviour for most * interrupt controllers which is that a linear mapping should @@ -162,11 +163,33 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, const struct irq_domain_ops *ops, void *host_data) { - if (first_irq > 0) - return irq_domain_add_legacy(of_node, size, first_irq, 0, + if (first_irq > 0) { + int irq_base; + + if (IS_ENABLED(CONFIG_SPARSE_IRQ)) { + /* + * Set the descriptor allocator to search for a + * 1-to-1 mapping, such as irq_alloc_desc_at(). + * Use of_node_to_nid() which is defined to + * numa_node_id() on platforms that have no custom + * implementation. + */ + irq_base = irq_alloc_descs(first_irq, first_irq, size, + of_node_to_nid(of_node)); + if (irq_base < 0) { + WARN(1, "Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", + first_irq); + irq_base = first_irq; + } + } else + irq_base = first_irq; + + return irq_domain_add_legacy(of_node, size, irq_base, 0, ops, host_data); - else - return irq_domain_add_linear(of_node, size, ops, host_data); + } + + /* A linear domain is the default */ + return irq_domain_add_linear(of_node, size, ops, host_data); } /** diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 43049192b5ec..60f48fa0fd0d 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -118,6 +118,7 @@ void jump_label_rate_limit(struct static_key_deferred *key, key->timeout = rl; INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); } +EXPORT_SYMBOL_GPL(jump_label_rate_limit); static int addr_conflict(struct jump_entry *entry, void *start, void *end) { diff --git a/kernel/kexec.c b/kernel/kexec.c index 0668d58d6413..5e4bd7864c5d 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -21,7 +21,6 @@ #include <linux/hardirq.h> #include <linux/elf.h> #include <linux/elfcore.h> -#include <generated/utsrelease.h> #include <linux/utsname.h> #include <linux/numa.h> #include <linux/suspend.h> diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 
c62b8546cc90..098f396aa409 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -561,9 +561,9 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) { LIST_HEAD(free_list); + mutex_lock(&kprobe_mutex); /* Lock modules while optimizing kprobes */ mutex_lock(&module_mutex); - mutex_lock(&kprobe_mutex); /* * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) @@ -586,8 +586,8 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) /* Step 4: Free cleaned kprobes after quiesence period */ do_free_cleaned_kprobes(&free_list); - mutex_unlock(&kprobe_mutex); mutex_unlock(&module_mutex); + mutex_unlock(&kprobe_mutex); /* Step 5: Kick optimizer again if needed */ if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) @@ -759,20 +759,32 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p) struct kprobe *ap; struct optimized_kprobe *op; + /* Impossible to optimize ftrace-based kprobe */ + if (kprobe_ftrace(p)) + return; + + /* For preparing optimization, jump_label_text_reserved() is called */ + jump_label_lock(); + mutex_lock(&text_mutex); + ap = alloc_aggr_kprobe(p); if (!ap) - return; + goto out; op = container_of(ap, struct optimized_kprobe, kp); if (!arch_prepared_optinsn(&op->optinsn)) { /* If failed to setup optimizing, fallback to kprobe */ arch_remove_optimized_kprobe(op); kfree(op); - return; + goto out; } init_aggr_kprobe(ap, p); - optimize_kprobe(ap); + optimize_kprobe(ap); /* This just kicks optimizer thread */ + +out: + mutex_unlock(&text_mutex); + jump_label_unlock(); } #ifdef CONFIG_SYSCTL @@ -907,9 +919,64 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) } #endif /* CONFIG_OPTPROBES */ +#ifdef KPROBES_CAN_USE_FTRACE +static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { + .func = kprobe_ftrace_handler, + .flags = FTRACE_OPS_FL_SAVE_REGS, +}; +static int kprobe_ftrace_enabled; + +/* Must ensure p->addr is really on ftrace */ +static int __kprobes 
prepare_kprobe(struct kprobe *p) +{ + if (!kprobe_ftrace(p)) + return arch_prepare_kprobe(p); + + return arch_prepare_kprobe_ftrace(p); +} + +/* Caller must lock kprobe_mutex */ +static void __kprobes arm_kprobe_ftrace(struct kprobe *p) +{ + int ret; + + ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, + (unsigned long)p->addr, 0, 0); + WARN(ret < 0, "Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret); + kprobe_ftrace_enabled++; + if (kprobe_ftrace_enabled == 1) { + ret = register_ftrace_function(&kprobe_ftrace_ops); + WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret); + } +} + +/* Caller must lock kprobe_mutex */ +static void __kprobes disarm_kprobe_ftrace(struct kprobe *p) +{ + int ret; + + kprobe_ftrace_enabled--; + if (kprobe_ftrace_enabled == 0) { + ret = unregister_ftrace_function(&kprobe_ftrace_ops); + WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret); + } + ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, + (unsigned long)p->addr, 1, 0); + WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret); +} +#else /* !KPROBES_CAN_USE_FTRACE */ +#define prepare_kprobe(p) arch_prepare_kprobe(p) +#define arm_kprobe_ftrace(p) do {} while (0) +#define disarm_kprobe_ftrace(p) do {} while (0) +#endif + /* Arm a kprobe with text_mutex */ static void __kprobes arm_kprobe(struct kprobe *kp) { + if (unlikely(kprobe_ftrace(kp))) { + arm_kprobe_ftrace(kp); + return; + } /* * Here, since __arm_kprobe() doesn't use stop_machine(), * this doesn't cause deadlock on text_mutex. 
So, we don't @@ -921,11 +988,15 @@ static void __kprobes arm_kprobe(struct kprobe *kp) } /* Disarm a kprobe with text_mutex */ -static void __kprobes disarm_kprobe(struct kprobe *kp) +static void __kprobes disarm_kprobe(struct kprobe *kp, bool reopt) { + if (unlikely(kprobe_ftrace(kp))) { + disarm_kprobe_ftrace(kp); + return; + } /* Ditto */ mutex_lock(&text_mutex); - __disarm_kprobe(kp, true); + __disarm_kprobe(kp, reopt); mutex_unlock(&text_mutex); } @@ -1144,12 +1215,6 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) if (p->post_handler && !ap->post_handler) ap->post_handler = aggr_post_handler; - if (kprobe_disabled(ap) && !kprobe_disabled(p)) { - ap->flags &= ~KPROBE_FLAG_DISABLED; - if (!kprobes_all_disarmed) - /* Arm the breakpoint again. */ - __arm_kprobe(ap); - } return 0; } @@ -1189,11 +1254,22 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p, int ret = 0; struct kprobe *ap = orig_p; + /* For preparing optimization, jump_label_text_reserved() is called */ + jump_label_lock(); + /* + * Get online CPUs to avoid text_mutex deadlock.with stop machine, + * which is invoked by unoptimize_kprobe() in add_new_kprobe() + */ + get_online_cpus(); + mutex_lock(&text_mutex); + if (!kprobe_aggrprobe(orig_p)) { /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */ ap = alloc_aggr_kprobe(orig_p); - if (!ap) - return -ENOMEM; + if (!ap) { + ret = -ENOMEM; + goto out; + } init_aggr_kprobe(ap, orig_p); } else if (kprobe_unused(ap)) /* This probe is going to die. Rescue it */ @@ -1213,7 +1289,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p, * free aggr_probe. It will be used next time, or * freed by unregister_kprobe. */ - return ret; + goto out; /* Prepare optimized instructions if possible. 
*/ prepare_optimized_kprobe(ap); @@ -1228,7 +1304,20 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p, /* Copy ap's insn slot to p */ copy_kprobe(ap, p); - return add_new_kprobe(ap, p); + ret = add_new_kprobe(ap, p); + +out: + mutex_unlock(&text_mutex); + put_online_cpus(); + jump_label_unlock(); + + if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) { + ap->flags &= ~KPROBE_FLAG_DISABLED; + if (!kprobes_all_disarmed) + /* Arm the breakpoint again. */ + arm_kprobe(ap); + } + return ret; } static int __kprobes in_kprobes_functions(unsigned long addr) @@ -1313,71 +1402,96 @@ static inline int check_kprobe_rereg(struct kprobe *p) return ret; } -int __kprobes register_kprobe(struct kprobe *p) +static __kprobes int check_kprobe_address_safe(struct kprobe *p, + struct module **probed_mod) { int ret = 0; - struct kprobe *old_p; - struct module *probed_mod; - kprobe_opcode_t *addr; - - addr = kprobe_addr(p); - if (IS_ERR(addr)) - return PTR_ERR(addr); - p->addr = addr; + unsigned long ftrace_addr; - ret = check_kprobe_rereg(p); - if (ret) - return ret; + /* + * If the address is located on a ftrace nop, set the + * breakpoint to the following instruction. 
+ */ + ftrace_addr = ftrace_location((unsigned long)p->addr); + if (ftrace_addr) { +#ifdef KPROBES_CAN_USE_FTRACE + /* Given address is not on the instruction boundary */ + if ((unsigned long)p->addr != ftrace_addr) + return -EILSEQ; + p->flags |= KPROBE_FLAG_FTRACE; +#else /* !KPROBES_CAN_USE_FTRACE */ + return -EINVAL; +#endif + } jump_label_lock(); preempt_disable(); + + /* Ensure it is not in reserved area nor out of text */ if (!kernel_text_address((unsigned long) p->addr) || in_kprobes_functions((unsigned long) p->addr) || - ftrace_text_reserved(p->addr, p->addr) || jump_label_text_reserved(p->addr, p->addr)) { ret = -EINVAL; - goto cannot_probe; + goto out; } - /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ - p->flags &= KPROBE_FLAG_DISABLED; - - /* - * Check if are we probing a module. - */ - probed_mod = __module_text_address((unsigned long) p->addr); - if (probed_mod) { - /* Return -ENOENT if fail. */ - ret = -ENOENT; + /* Check if are we probing a module */ + *probed_mod = __module_text_address((unsigned long) p->addr); + if (*probed_mod) { /* * We must hold a refcount of the probed module while updating * its code to prohibit unexpected unloading. */ - if (unlikely(!try_module_get(probed_mod))) - goto cannot_probe; + if (unlikely(!try_module_get(*probed_mod))) { + ret = -ENOENT; + goto out; + } /* * If the module freed .init.text, we couldn't insert * kprobes in there. 
*/ - if (within_module_init((unsigned long)p->addr, probed_mod) && - probed_mod->state != MODULE_STATE_COMING) { - module_put(probed_mod); - goto cannot_probe; + if (within_module_init((unsigned long)p->addr, *probed_mod) && + (*probed_mod)->state != MODULE_STATE_COMING) { + module_put(*probed_mod); + *probed_mod = NULL; + ret = -ENOENT; } - /* ret will be updated by following code */ } +out: preempt_enable(); jump_label_unlock(); + return ret; +} + +int __kprobes register_kprobe(struct kprobe *p) +{ + int ret; + struct kprobe *old_p; + struct module *probed_mod; + kprobe_opcode_t *addr; + + /* Adjust probe address from symbol */ + addr = kprobe_addr(p); + if (IS_ERR(addr)) + return PTR_ERR(addr); + p->addr = addr; + + ret = check_kprobe_rereg(p); + if (ret) + return ret; + + /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ + p->flags &= KPROBE_FLAG_DISABLED; p->nmissed = 0; INIT_LIST_HEAD(&p->list); - mutex_lock(&kprobe_mutex); - jump_label_lock(); /* needed to call jump_label_text_reserved() */ + ret = check_kprobe_address_safe(p, &probed_mod); + if (ret) + return ret; - get_online_cpus(); /* For avoiding text_mutex deadlock. 
*/ - mutex_lock(&text_mutex); + mutex_lock(&kprobe_mutex); old_p = get_kprobe(p->addr); if (old_p) { @@ -1386,7 +1500,9 @@ int __kprobes register_kprobe(struct kprobe *p) goto out; } - ret = arch_prepare_kprobe(p); + mutex_lock(&text_mutex); /* Avoiding text modification */ + ret = prepare_kprobe(p); + mutex_unlock(&text_mutex); if (ret) goto out; @@ -1395,26 +1511,18 @@ int __kprobes register_kprobe(struct kprobe *p) &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); if (!kprobes_all_disarmed && !kprobe_disabled(p)) - __arm_kprobe(p); + arm_kprobe(p); /* Try to optimize kprobe */ try_to_optimize_kprobe(p); out: - mutex_unlock(&text_mutex); - put_online_cpus(); - jump_label_unlock(); mutex_unlock(&kprobe_mutex); if (probed_mod) module_put(probed_mod); return ret; - -cannot_probe: - preempt_enable(); - jump_label_unlock(); - return ret; } EXPORT_SYMBOL_GPL(register_kprobe); @@ -1451,7 +1559,7 @@ static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p) /* Try to disarm and disable this/parent probe */ if (p == orig_p || aggr_kprobe_disabled(orig_p)) { - disarm_kprobe(orig_p); + disarm_kprobe(orig_p, true); orig_p->flags |= KPROBE_FLAG_DISABLED; } } @@ -2049,10 +2157,11 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, if (!pp) pp = p; - seq_printf(pi, "%s%s%s\n", + seq_printf(pi, "%s%s%s%s\n", (kprobe_gone(p) ? "[GONE]" : ""), ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""), - (kprobe_optimized(pp) ? "[OPTIMIZED]" : "")); + (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""), + (kprobe_ftrace(pp) ? 
"[FTRACE]" : "")); } static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) @@ -2131,14 +2240,12 @@ static void __kprobes arm_all_kprobes(void) goto already_enabled; /* Arming kprobes doesn't optimize kprobe itself */ - mutex_lock(&text_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) if (!kprobe_disabled(p)) - __arm_kprobe(p); + arm_kprobe(p); } - mutex_unlock(&text_mutex); kprobes_all_disarmed = false; printk(KERN_INFO "Kprobes globally enabled\n"); @@ -2166,15 +2273,13 @@ static void __kprobes disarm_all_kprobes(void) kprobes_all_disarmed = true; printk(KERN_INFO "Kprobes globally disabled\n"); - mutex_lock(&text_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) { if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) - __disarm_kprobe(p, false); + disarm_kprobe(p, false); } } - mutex_unlock(&text_mutex); mutex_unlock(&kprobe_mutex); /* Wait for disarming all kprobes by optimizer */ diff --git a/kernel/lockdep.c b/kernel/lockdep.c index ea9ee4518c35..7981e5b2350d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2998,6 +2998,42 @@ EXPORT_SYMBOL_GPL(lockdep_init_map); struct lock_class_key __lockdep_no_validate__; +static int +print_lock_nested_lock_not_held(struct task_struct *curr, + struct held_lock *hlock, + unsigned long ip) +{ + if (!debug_locks_off()) + return 0; + if (debug_locks_silent) + return 0; + + printk("\n"); + printk("==================================\n"); + printk("[ BUG: Nested lock was not taken ]\n"); + print_kernel_ident(); + printk("----------------------------------\n"); + + printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); + print_lock(hlock); + + printk("\nbut this task is not holding:\n"); + printk("%s\n", hlock->nest_lock->name); + + printk("\nstack backtrace:\n"); + dump_stack(); + + printk("\nother info that might help us debug this:\n"); + 
lockdep_print_held_locks(curr); + + printk("\nstack backtrace:\n"); + dump_stack(); + + return 0; +} + +static int __lock_is_held(struct lockdep_map *lock); + /* * This gets called for every mutex_lock*()/spin_lock*() operation. * We maintain the dependency maps and validate the locking attempt: @@ -3139,6 +3175,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, } chain_key = iterate_chain_key(chain_key, id); + if (nest_lock && !__lock_is_held(nest_lock)) + return print_lock_nested_lock_not_held(curr, hlock, ip); + if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) return 0; diff --git a/kernel/pid.c b/kernel/pid.c index e86b291ad834..aebd4f5aaf41 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -479,6 +479,7 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) } return nr; } +EXPORT_SYMBOL_GPL(pid_nr_ns); pid_t pid_vnr(struct pid *pid) { diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 6144bab8fd8e..478bad2745e3 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -16,6 +16,7 @@ #include <linux/slab.h> #include <linux/proc_fs.h> #include <linux/reboot.h> +#include <linux/export.h> #define BITS_PER_PAGE (PAGE_SIZE*8) @@ -144,6 +145,7 @@ void free_pid_ns(struct kref *kref) if (parent != NULL) put_pid_ns(parent); } +EXPORT_SYMBOL_GPL(free_pid_ns); void zap_pid_ns_processes(struct pid_namespace *pid_ns) { diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index a70518c9d82f..5dfdc9ea180b 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -263,6 +263,10 @@ config PM_GENERIC_DOMAINS bool depends on PM +config PM_GENERIC_DOMAINS_SLEEP + def_bool y + depends on PM_SLEEP && PM_GENERIC_DOMAINS + config PM_GENERIC_DOMAINS_RUNTIME def_bool y depends on PM_RUNTIME && PM_GENERIC_DOMAINS diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index d52359374e85..68197a4e8fc9 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -37,7 +37,7 @@ static struct 
sysrq_key_op sysrq_poweroff_op = { .enable_mask = SYSRQ_ENABLE_BOOT, }; -static int pm_sysrq_init(void) +static int __init pm_sysrq_init(void) { register_sysrq_key('o', &sysrq_poweroff_op); return 0; diff --git a/kernel/power/process.c b/kernel/power/process.c index 19db29f67558..87da817f9e13 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -79,7 +79,7 @@ static int try_to_freeze_tasks(bool user_only) /* * We need to retry, but first give the freezing tasks some - * time to enter the regrigerator. + * time to enter the refrigerator. */ msleep(10); } diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 6a031e684026..846bd42c7ed1 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -139,6 +139,7 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c) default: /* runtime check for not using enum */ BUG(); + return PM_QOS_DEFAULT_VALUE; } } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index a232bb59d93f..1f5e55dda955 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -180,7 +180,8 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) return has_ns_capability(current, ns, CAP_SYS_PTRACE); } -int __ptrace_may_access(struct task_struct *task, unsigned int mode) +/* Returns 0 on success, -errno on denial. 
*/ +static int __ptrace_may_access(struct task_struct *task, unsigned int mode) { const struct cred *cred = current_cred(), *tcred; diff --git a/kernel/resource.c b/kernel/resource.c index 34d45886ee84..73f35d4b30b9 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -763,6 +763,7 @@ static void __init __reserve_region_with_split(struct resource *root, struct resource *parent = root; struct resource *conflict; struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); + struct resource *next_res = NULL; if (!res) return; @@ -772,21 +773,46 @@ static void __init __reserve_region_with_split(struct resource *root, res->end = end; res->flags = IORESOURCE_BUSY; - conflict = __request_resource(parent, res); - if (!conflict) - return; + while (1) { - /* failed, split and try again */ - kfree(res); + conflict = __request_resource(parent, res); + if (!conflict) { + if (!next_res) + break; + res = next_res; + next_res = NULL; + continue; + } - /* conflict covered whole area */ - if (conflict->start <= start && conflict->end >= end) - return; + /* conflict covered whole area */ + if (conflict->start <= res->start && + conflict->end >= res->end) { + kfree(res); + WARN_ON(next_res); + break; + } + + /* failed, split and try again */ + if (conflict->start > res->start) { + end = res->end; + res->end = conflict->start - 1; + if (conflict->end < end) { + next_res = kzalloc(sizeof(*next_res), + GFP_ATOMIC); + if (!next_res) { + kfree(res); + break; + } + next_res->name = name; + next_res->start = conflict->end + 1; + next_res->end = end; + next_res->flags = IORESOURCE_BUSY; + } + } else { + res->start = conflict->end + 1; + } + } - if (conflict->start > start) - __reserve_region_with_split(root, start, conflict->start-1, name); - if (conflict->end < end) - __reserve_region_with_split(root, conflict->end+1, end, name); } void __init reserve_region_with_split(struct resource *root, diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 173ea52f3af0..f06d249e103b 100644 
--- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif -obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o +obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o obj-$(CONFIG_SMP) += cpupri.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3c4dec0594d6..bd7c39450b1b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -505,7 +505,7 @@ static inline void init_hrtick(void) #ifdef CONFIG_SMP #ifndef tsk_is_polling -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) +#define tsk_is_polling(t) 0 #endif void resched_task(struct task_struct *p) @@ -740,126 +740,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) dequeue_task(rq, p, flags); } -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - -/* - * There are no locks covering percpu hardirq/softirq time. - * They are only modified in account_system_vtime, on corresponding CPU - * with interrupts disabled. So, writes are safe. - * They are read and saved off onto struct rq in update_rq_clock(). - * This may result in other CPU reading this CPU's irq time and can - * race with irq/account_system_vtime on this CPU. We would either get old - * or new value with a side effect of accounting a slice of irq time to wrong - * task when irq is in progress while we read rq->clock. That is a worthy - * compromise in place of having locks on each irq in account_system_time. 
- */ -static DEFINE_PER_CPU(u64, cpu_hardirq_time); -static DEFINE_PER_CPU(u64, cpu_softirq_time); - -static DEFINE_PER_CPU(u64, irq_start_time); -static int sched_clock_irqtime; - -void enable_sched_clock_irqtime(void) -{ - sched_clock_irqtime = 1; -} - -void disable_sched_clock_irqtime(void) -{ - sched_clock_irqtime = 0; -} - -#ifndef CONFIG_64BIT -static DEFINE_PER_CPU(seqcount_t, irq_time_seq); - -static inline void irq_time_write_begin(void) -{ - __this_cpu_inc(irq_time_seq.sequence); - smp_wmb(); -} - -static inline void irq_time_write_end(void) -{ - smp_wmb(); - __this_cpu_inc(irq_time_seq.sequence); -} - -static inline u64 irq_time_read(int cpu) -{ - u64 irq_time; - unsigned seq; - - do { - seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); - irq_time = per_cpu(cpu_softirq_time, cpu) + - per_cpu(cpu_hardirq_time, cpu); - } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); - - return irq_time; -} -#else /* CONFIG_64BIT */ -static inline void irq_time_write_begin(void) -{ -} - -static inline void irq_time_write_end(void) -{ -} - -static inline u64 irq_time_read(int cpu) -{ - return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); -} -#endif /* CONFIG_64BIT */ - -/* - * Called before incrementing preempt_count on {soft,}irq_enter - * and before decrementing preempt_count on {soft,}irq_exit. - */ -void account_system_vtime(struct task_struct *curr) -{ - unsigned long flags; - s64 delta; - int cpu; - - if (!sched_clock_irqtime) - return; - - local_irq_save(flags); - - cpu = smp_processor_id(); - delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); - __this_cpu_add(irq_start_time, delta); - - irq_time_write_begin(); - /* - * We do not account for softirq time from ksoftirqd here. - * We want to continue accounting softirq time to ksoftirqd thread - * in that case, so as not to confuse scheduler with a special task - * that do not consume any time, but still wants to run. 
- */ - if (hardirq_count()) - __this_cpu_add(cpu_hardirq_time, delta); - else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) - __this_cpu_add(cpu_softirq_time, delta); - - irq_time_write_end(); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(account_system_vtime); - -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ - -#ifdef CONFIG_PARAVIRT -static inline u64 steal_ticks(u64 steal) -{ - if (unlikely(steal > NSEC_PER_SEC)) - return div_u64(steal, TICK_NSEC); - - return __iter_div_u64_rem(steal, TICK_NSEC, &steal); -} -#endif - static void update_rq_clock_task(struct rq *rq, s64 delta) { /* @@ -920,43 +800,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) #endif } -#ifdef CONFIG_IRQ_TIME_ACCOUNTING -static int irqtime_account_hi_update(void) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - unsigned long flags; - u64 latest_ns; - int ret = 0; - - local_irq_save(flags); - latest_ns = this_cpu_read(cpu_hardirq_time); - if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) - ret = 1; - local_irq_restore(flags); - return ret; -} - -static int irqtime_account_si_update(void) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - unsigned long flags; - u64 latest_ns; - int ret = 0; - - local_irq_save(flags); - latest_ns = this_cpu_read(cpu_softirq_time); - if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) - ret = 1; - local_irq_restore(flags); - return ret; -} - -#else /* CONFIG_IRQ_TIME_ACCOUNTING */ - -#define sched_clock_irqtime (0) - -#endif - void sched_set_stop_task(int cpu, struct task_struct *stop) { struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; @@ -1518,25 +1361,6 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) smp_send_reschedule(cpu); } -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW -static int ttwu_activate_remote(struct task_struct *p, int wake_flags) -{ - struct rq *rq; - int ret = 0; - - rq = __task_rq_lock(p); - if (p->on_cpu) { - ttwu_activate(rq, p, ENQUEUE_WAKEUP); - ttwu_do_wakeup(rq, p, 
wake_flags); - ret = 1; - } - __task_rq_unlock(rq); - - return ret; - -} -#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ - bool cpus_share_cache(int this_cpu, int that_cpu) { return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); @@ -1597,21 +1421,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * If the owning (remote) cpu is still in the middle of schedule() with * this task as prev, wait until its done referencing the task. */ - while (p->on_cpu) { -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - /* - * In case the architecture enables interrupts in - * context_switch(), we cannot busy wait, since that - * would lead to deadlocks when an interrupt hits and - * tries to wake up @prev. So bail and do a complete - * remote wakeup. - */ - if (ttwu_activate_remote(p, wake_flags)) - goto stat; -#else + while (p->on_cpu) cpu_relax(); -#endif - } /* * Pairs with the smp_wmb() in finish_lock_switch(). */ @@ -1953,14 +1764,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) * Manfred Spraul <manfred@colorfullife.com> */ prev_state = prev->state; + vtime_task_switch(prev); finish_arch_switch(prev); -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - local_irq_disable(); -#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ perf_event_task_sched_in(prev, current); -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - local_irq_enable(); -#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ finish_lock_switch(rq, prev); finish_arch_post_lock_switch(); @@ -2810,404 +2616,6 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } -#ifdef CONFIG_CGROUP_CPUACCT -struct cgroup_subsys cpuacct_subsys; -struct cpuacct root_cpuacct; -#endif - -static inline void task_group_account_field(struct task_struct *p, int index, - u64 tmp) -{ -#ifdef CONFIG_CGROUP_CPUACCT - struct kernel_cpustat *kcpustat; - struct cpuacct *ca; -#endif - /* - * Since all updates are sure to touch the root cgroup, we - * get ourselves ahead and touch it first. 
If the root cgroup - * is the only cgroup, then nothing else should be necessary. - * - */ - __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; - -#ifdef CONFIG_CGROUP_CPUACCT - if (unlikely(!cpuacct_subsys.active)) - return; - - rcu_read_lock(); - ca = task_ca(p); - while (ca && (ca != &root_cpuacct)) { - kcpustat = this_cpu_ptr(ca->cpustat); - kcpustat->cpustat[index] += tmp; - ca = parent_ca(ca); - } - rcu_read_unlock(); -#endif -} - - -/* - * Account user cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in user space since the last update - * @cputime_scaled: cputime scaled by cpu frequency - */ -void account_user_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled) -{ - int index; - - /* Add user time to process. */ - p->utime += cputime; - p->utimescaled += cputime_scaled; - account_group_user_time(p, cputime); - - index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; - - /* Add user time to cpustat. */ - task_group_account_field(p, index, (__force u64) cputime); - - /* Account for user time used */ - acct_update_integrals(p); -} - -/* - * Account guest cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in virtual machine since the last update - * @cputime_scaled: cputime scaled by cpu frequency - */ -static void account_guest_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - - /* Add guest time to process. */ - p->utime += cputime; - p->utimescaled += cputime_scaled; - account_group_user_time(p, cputime); - p->gtime += cputime; - - /* Add guest time to cpustat. 
*/ - if (TASK_NICE(p) > 0) { - cpustat[CPUTIME_NICE] += (__force u64) cputime; - cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; - } else { - cpustat[CPUTIME_USER] += (__force u64) cputime; - cpustat[CPUTIME_GUEST] += (__force u64) cputime; - } -} - -/* - * Account system cpu time to a process and desired cpustat field - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in kernel space since the last update - * @cputime_scaled: cputime scaled by cpu frequency - * @target_cputime64: pointer to cpustat field that has to be updated - */ -static inline -void __account_system_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled, int index) -{ - /* Add system time to process. */ - p->stime += cputime; - p->stimescaled += cputime_scaled; - account_group_system_time(p, cputime); - - /* Add system time to cpustat. */ - task_group_account_field(p, index, (__force u64) cputime); - - /* Account for system time used */ - acct_update_integrals(p); -} - -/* - * Account system cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @hardirq_offset: the offset to subtract from hardirq_count() - * @cputime: the cpu time spent in kernel space since the last update - * @cputime_scaled: cputime scaled by cpu frequency - */ -void account_system_time(struct task_struct *p, int hardirq_offset, - cputime_t cputime, cputime_t cputime_scaled) -{ - int index; - - if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { - account_guest_time(p, cputime, cputime_scaled); - return; - } - - if (hardirq_count() - hardirq_offset) - index = CPUTIME_IRQ; - else if (in_serving_softirq()) - index = CPUTIME_SOFTIRQ; - else - index = CPUTIME_SYSTEM; - - __account_system_time(p, cputime, cputime_scaled, index); -} - -/* - * Account for involuntary wait time. 
- * @cputime: the cpu time spent in involuntary wait - */ -void account_steal_time(cputime_t cputime) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - - cpustat[CPUTIME_STEAL] += (__force u64) cputime; -} - -/* - * Account for idle time. - * @cputime: the cpu time spent in idle wait - */ -void account_idle_time(cputime_t cputime) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - struct rq *rq = this_rq(); - - if (atomic_read(&rq->nr_iowait) > 0) - cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; - else - cpustat[CPUTIME_IDLE] += (__force u64) cputime; -} - -static __always_inline bool steal_account_process_tick(void) -{ -#ifdef CONFIG_PARAVIRT - if (static_key_false(&paravirt_steal_enabled)) { - u64 steal, st = 0; - - steal = paravirt_steal_clock(smp_processor_id()); - steal -= this_rq()->prev_steal_time; - - st = steal_ticks(steal); - this_rq()->prev_steal_time += st * TICK_NSEC; - - account_steal_time(st); - return st; - } -#endif - return false; -} - -#ifndef CONFIG_VIRT_CPU_ACCOUNTING - -#ifdef CONFIG_IRQ_TIME_ACCOUNTING -/* - * Account a tick to a process and cpustat - * @p: the process that the cpu time gets accounted to - * @user_tick: is the tick from userspace - * @rq: the pointer to rq - * - * Tick demultiplexing follows the order - * - pending hardirq update - * - pending softirq update - * - user_time - * - idle_time - * - system time - * - check for guest_time - * - else account as system_time - * - * Check for hardirq is done both for system and user time as there is - * no timer going off while we are on hardirq and hence we may never get an - * opportunity to update it solely in system time. - * p->stime and friends are only updated on system time and not on irq - * softirq as those do not count in task exec_runtime any more. 
- */ -static void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq) -{ - cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); - u64 *cpustat = kcpustat_this_cpu->cpustat; - - if (steal_account_process_tick()) - return; - - if (irqtime_account_hi_update()) { - cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; - } else if (irqtime_account_si_update()) { - cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; - } else if (this_cpu_ksoftirqd() == p) { - /* - * ksoftirqd time do not get accounted in cpu_softirq_time. - * So, we have to handle it separately here. - * Also, p->stime needs to be updated for ksoftirqd. - */ - __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, - CPUTIME_SOFTIRQ); - } else if (user_tick) { - account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); - } else if (p == rq->idle) { - account_idle_time(cputime_one_jiffy); - } else if (p->flags & PF_VCPU) { /* System time or guest time */ - account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); - } else { - __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, - CPUTIME_SYSTEM); - } -} - -static void irqtime_account_idle_ticks(int ticks) -{ - int i; - struct rq *rq = this_rq(); - - for (i = 0; i < ticks; i++) - irqtime_account_process_tick(current, 0, rq); -} -#else /* CONFIG_IRQ_TIME_ACCOUNTING */ -static void irqtime_account_idle_ticks(int ticks) {} -static void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq) {} -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ - -/* - * Account a single tick of cpu time. 
- * @p: the process that the cpu time gets accounted to - * @user_tick: indicates if the tick is a user or a system tick - */ -void account_process_tick(struct task_struct *p, int user_tick) -{ - cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); - struct rq *rq = this_rq(); - - if (sched_clock_irqtime) { - irqtime_account_process_tick(p, user_tick, rq); - return; - } - - if (steal_account_process_tick()) - return; - - if (user_tick) - account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); - else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) - account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, - one_jiffy_scaled); - else - account_idle_time(cputime_one_jiffy); -} - -/* - * Account multiple ticks of steal time. - * @p: the process from which the cpu time has been stolen - * @ticks: number of stolen ticks - */ -void account_steal_ticks(unsigned long ticks) -{ - account_steal_time(jiffies_to_cputime(ticks)); -} - -/* - * Account multiple ticks of idle time. 
- * @ticks: number of stolen ticks - */ -void account_idle_ticks(unsigned long ticks) -{ - - if (sched_clock_irqtime) { - irqtime_account_idle_ticks(ticks); - return; - } - - account_idle_time(jiffies_to_cputime(ticks)); -} - -#endif - -/* - * Use precise platform statistics if available: - */ -#ifdef CONFIG_VIRT_CPU_ACCOUNTING -void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - *ut = p->utime; - *st = p->stime; -} - -void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - struct task_cputime cputime; - - thread_group_cputime(p, &cputime); - - *ut = cputime.utime; - *st = cputime.stime; -} -#else - -#ifndef nsecs_to_cputime -# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) -#endif - -static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) -{ - u64 temp = (__force u64) rtime; - - temp *= (__force u64) utime; - - if (sizeof(cputime_t) == 4) - temp = div_u64(temp, (__force u32) total); - else - temp = div64_u64(temp, (__force u64) total); - - return (__force cputime_t) temp; -} - -void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - cputime_t rtime, utime = p->utime, total = utime + p->stime; - - /* - * Use CFS's precise accounting: - */ - rtime = nsecs_to_cputime(p->se.sum_exec_runtime); - - if (total) - utime = scale_utime(utime, rtime, total); - else - utime = rtime; - - /* - * Compare with previous values, to keep monotonicity: - */ - p->prev_utime = max(p->prev_utime, utime); - p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); - - *ut = p->prev_utime; - *st = p->prev_stime; -} - -/* - * Must be called with siglock held. 
- */ -void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - struct signal_struct *sig = p->signal; - struct task_cputime cputime; - cputime_t rtime, utime, total; - - thread_group_cputime(p, &cputime); - - total = cputime.utime + cputime.stime; - rtime = nsecs_to_cputime(cputime.sum_exec_runtime); - - if (total) - utime = scale_utime(cputime.utime, rtime, total); - else - utime = rtime; - - sig->prev_utime = max(sig->prev_utime, utime); - sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); - - *ut = sig->prev_utime; - *st = sig->prev_stime; -} -#endif - /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. @@ -3368,6 +2776,40 @@ pick_next_task(struct rq *rq) /* * __schedule() is the main scheduler function. + * + * The main means of driving the scheduler and thus entering this function are: + * + * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. + * + * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return + * paths. For example, see arch/x86/entry_64.S. + * + * To drive preemption between tasks, the scheduler sets the flag in timer + * interrupt handler scheduler_tick(). + * + * 3. Wakeups don't really cause entry into schedule(). They add a + * task to the run-queue and that's it. + * + * Now, if the new task added to the run-queue preempts the current + * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets + * called on the nearest possible occasion: + * + * - If the kernel is preemptible (CONFIG_PREEMPT=y): + * + * - in syscall or exception context, at the next outmost + * preempt_enable(). (this might be as soon as the wake_up()'s + * spin_unlock()!) 
+ * + * - in IRQ context, return from interrupt-handler to + * preemptible context + * + * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) + * then at the next: + * + * - cond_resched() call + * - explicit schedule() call + * - return from syscall or exception to user-space + * - return from interrupt-handler to user-space */ static void __sched __schedule(void) { @@ -4885,13 +4327,6 @@ again: */ if (preempt && rq != p_rq) resched_task(p_rq->curr); - } else { - /* - * We might have set it in task_yield_fair(), but are - * not going to schedule(), so don't want to skip - * the next update. - */ - rq->skip_clock_update = 0; } out: @@ -5433,16 +4868,25 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) *tablep = NULL; } +static int min_load_idx = 0; +static int max_load_idx = CPU_LOAD_IDX_MAX; + static void set_table_entry(struct ctl_table *entry, const char *procname, void *data, int maxlen, - umode_t mode, proc_handler *proc_handler) + umode_t mode, proc_handler *proc_handler, + bool load_idx) { entry->procname = procname; entry->data = data; entry->maxlen = maxlen; entry->mode = mode; entry->proc_handler = proc_handler; + + if (load_idx) { + entry->extra1 = &min_load_idx; + entry->extra2 = &max_load_idx; + } } static struct ctl_table * @@ -5454,30 +4898,30 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) return NULL; set_table_entry(&table[0], "min_interval", &sd->min_interval, - sizeof(long), 0644, proc_doulongvec_minmax); + sizeof(long), 0644, proc_doulongvec_minmax, false); set_table_entry(&table[1], "max_interval", &sd->max_interval, - sizeof(long), 0644, proc_doulongvec_minmax); + sizeof(long), 0644, proc_doulongvec_minmax, false); set_table_entry(&table[2], "busy_idx", &sd->busy_idx, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, true); set_table_entry(&table[3], "idle_idx", &sd->idle_idx, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, true); 
set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, true); set_table_entry(&table[5], "wake_idx", &sd->wake_idx, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, true); set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, true); set_table_entry(&table[7], "busy_factor", &sd->busy_factor, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, false); set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, false); set_table_entry(&table[9], "cache_nice_tries", &sd->cache_nice_tries, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, false); set_table_entry(&table[10], "flags", &sd->flags, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, false); set_table_entry(&table[11], "name", sd->name, - CORENAME_MAX_SIZE, 0444, proc_dostring); + CORENAME_MAX_SIZE, 0444, proc_dostring, false); /* &table[12] is terminator */ return table; @@ -6556,7 +6000,6 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu) | 0*SD_BALANCE_FORK | 0*SD_BALANCE_WAKE | 0*SD_WAKE_AFFINE - | 0*SD_PREFER_LOCAL | 0*SD_SHARE_CPUPOWER | 0*SD_SHARE_PKG_RESOURCES | 1*SD_SERIALIZE @@ -8354,6 +7797,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { * (balbir@in.ibm.com). 
*/ +struct cpuacct root_cpuacct; + /* create a new cpu accounting group */ static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) { diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c new file mode 100644 index 000000000000..81b763ba58a6 --- /dev/null +++ b/kernel/sched/cputime.c @@ -0,0 +1,530 @@ +#include <linux/export.h> +#include <linux/sched.h> +#include <linux/tsacct_kern.h> +#include <linux/kernel_stat.h> +#include <linux/static_key.h> +#include "sched.h" + + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + +/* + * There are no locks covering percpu hardirq/softirq time. + * They are only modified in vtime_account, on corresponding CPU + * with interrupts disabled. So, writes are safe. + * They are read and saved off onto struct rq in update_rq_clock(). + * This may result in other CPU reading this CPU's irq time and can + * race with irq/vtime_account on this CPU. We would either get old + * or new value with a side effect of accounting a slice of irq time to wrong + * task when irq is in progress while we read rq->clock. That is a worthy + * compromise in place of having locks on each irq in account_system_time. + */ +DEFINE_PER_CPU(u64, cpu_hardirq_time); +DEFINE_PER_CPU(u64, cpu_softirq_time); + +static DEFINE_PER_CPU(u64, irq_start_time); +static int sched_clock_irqtime; + +void enable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 1; +} + +void disable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 0; +} + +#ifndef CONFIG_64BIT +DEFINE_PER_CPU(seqcount_t, irq_time_seq); +#endif /* CONFIG_64BIT */ + +/* + * Called before incrementing preempt_count on {soft,}irq_enter + * and before decrementing preempt_count on {soft,}irq_exit. 
+ */ +void vtime_account(struct task_struct *curr) +{ + unsigned long flags; + s64 delta; + int cpu; + + if (!sched_clock_irqtime) + return; + + local_irq_save(flags); + + cpu = smp_processor_id(); + delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); + __this_cpu_add(irq_start_time, delta); + + irq_time_write_begin(); + /* + * We do not account for softirq time from ksoftirqd here. + * We want to continue accounting softirq time to ksoftirqd thread + * in that case, so as not to confuse scheduler with a special task + * that do not consume any time, but still wants to run. + */ + if (hardirq_count()) + __this_cpu_add(cpu_hardirq_time, delta); + else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) + __this_cpu_add(cpu_softirq_time, delta); + + irq_time_write_end(); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(vtime_account); + +static int irqtime_account_hi_update(void) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + unsigned long flags; + u64 latest_ns; + int ret = 0; + + local_irq_save(flags); + latest_ns = this_cpu_read(cpu_hardirq_time); + if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) + ret = 1; + local_irq_restore(flags); + return ret; +} + +static int irqtime_account_si_update(void) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + unsigned long flags; + u64 latest_ns; + int ret = 0; + + local_irq_save(flags); + latest_ns = this_cpu_read(cpu_softirq_time); + if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) + ret = 1; + local_irq_restore(flags); + return ret; +} + +#else /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#define sched_clock_irqtime (0) + +#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ + +static inline void task_group_account_field(struct task_struct *p, int index, + u64 tmp) +{ +#ifdef CONFIG_CGROUP_CPUACCT + struct kernel_cpustat *kcpustat; + struct cpuacct *ca; +#endif + /* + * Since all updates are sure to touch the root cgroup, we + * get ourselves ahead and touch it first. 
If the root cgroup + * is the only cgroup, then nothing else should be necessary. + * + */ + __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; + +#ifdef CONFIG_CGROUP_CPUACCT + if (unlikely(!cpuacct_subsys.active)) + return; + + rcu_read_lock(); + ca = task_ca(p); + while (ca && (ca != &root_cpuacct)) { + kcpustat = this_cpu_ptr(ca->cpustat); + kcpustat->cpustat[index] += tmp; + ca = parent_ca(ca); + } + rcu_read_unlock(); +#endif +} + +/* + * Account user cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in user space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +void account_user_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) +{ + int index; + + /* Add user time to process. */ + p->utime += cputime; + p->utimescaled += cputime_scaled; + account_group_user_time(p, cputime); + + index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; + + /* Add user time to cpustat. */ + task_group_account_field(p, index, (__force u64) cputime); + + /* Account for user time used */ + acct_update_integrals(p); +} + +/* + * Account guest cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in virtual machine since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +static void account_guest_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + + /* Add guest time to process. */ + p->utime += cputime; + p->utimescaled += cputime_scaled; + account_group_user_time(p, cputime); + p->gtime += cputime; + + /* Add guest time to cpustat. 
*/ + if (TASK_NICE(p) > 0) { + cpustat[CPUTIME_NICE] += (__force u64) cputime; + cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; + } else { + cpustat[CPUTIME_USER] += (__force u64) cputime; + cpustat[CPUTIME_GUEST] += (__force u64) cputime; + } +} + +/* + * Account system cpu time to a process and desired cpustat field + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + * @target_cputime64: pointer to cpustat field that has to be updated + */ +static inline +void __account_system_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled, int index) +{ + /* Add system time to process. */ + p->stime += cputime; + p->stimescaled += cputime_scaled; + account_group_system_time(p, cputime); + + /* Add system time to cpustat. */ + task_group_account_field(p, index, (__force u64) cputime); + + /* Account for system time used */ + acct_update_integrals(p); +} + +/* + * Account system cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @hardirq_offset: the offset to subtract from hardirq_count() + * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +void account_system_time(struct task_struct *p, int hardirq_offset, + cputime_t cputime, cputime_t cputime_scaled) +{ + int index; + + if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { + account_guest_time(p, cputime, cputime_scaled); + return; + } + + if (hardirq_count() - hardirq_offset) + index = CPUTIME_IRQ; + else if (in_serving_softirq()) + index = CPUTIME_SOFTIRQ; + else + index = CPUTIME_SYSTEM; + + __account_system_time(p, cputime, cputime_scaled, index); +} + +/* + * Account for involuntary wait time. 
+ * @cputime: the cpu time spent in involuntary wait + */ +void account_steal_time(cputime_t cputime) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + + cpustat[CPUTIME_STEAL] += (__force u64) cputime; +} + +/* + * Account for idle time. + * @cputime: the cpu time spent in idle wait + */ +void account_idle_time(cputime_t cputime) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + struct rq *rq = this_rq(); + + if (atomic_read(&rq->nr_iowait) > 0) + cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; + else + cpustat[CPUTIME_IDLE] += (__force u64) cputime; +} + +static __always_inline bool steal_account_process_tick(void) +{ +#ifdef CONFIG_PARAVIRT + if (static_key_false(&paravirt_steal_enabled)) { + u64 steal, st = 0; + + steal = paravirt_steal_clock(smp_processor_id()); + steal -= this_rq()->prev_steal_time; + + st = steal_ticks(steal); + this_rq()->prev_steal_time += st * TICK_NSEC; + + account_steal_time(st); + return st; + } +#endif + return false; +} + +#ifndef CONFIG_VIRT_CPU_ACCOUNTING + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +/* + * Account a tick to a process and cpustat + * @p: the process that the cpu time gets accounted to + * @user_tick: is the tick from userspace + * @rq: the pointer to rq + * + * Tick demultiplexing follows the order + * - pending hardirq update + * - pending softirq update + * - user_time + * - idle_time + * - system time + * - check for guest_time + * - else account as system_time + * + * Check for hardirq is done both for system and user time as there is + * no timer going off while we are on hardirq and hence we may never get an + * opportunity to update it solely in system time. + * p->stime and friends are only updated on system time and not on irq + * softirq as those do not count in task exec_runtime any more. 
+ */ +static void irqtime_account_process_tick(struct task_struct *p, int user_tick, + struct rq *rq) +{ + cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + u64 *cpustat = kcpustat_this_cpu->cpustat; + + if (steal_account_process_tick()) + return; + + if (irqtime_account_hi_update()) { + cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; + } else if (irqtime_account_si_update()) { + cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; + } else if (this_cpu_ksoftirqd() == p) { + /* + * ksoftirqd time do not get accounted in cpu_softirq_time. + * So, we have to handle it separately here. + * Also, p->stime needs to be updated for ksoftirqd. + */ + __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, + CPUTIME_SOFTIRQ); + } else if (user_tick) { + account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); + } else if (p == rq->idle) { + account_idle_time(cputime_one_jiffy); + } else if (p->flags & PF_VCPU) { /* System time or guest time */ + account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); + } else { + __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, + CPUTIME_SYSTEM); + } +} + +static void irqtime_account_idle_ticks(int ticks) +{ + int i; + struct rq *rq = this_rq(); + + for (i = 0; i < ticks; i++) + irqtime_account_process_tick(current, 0, rq); +} +#else /* CONFIG_IRQ_TIME_ACCOUNTING */ +static void irqtime_account_idle_ticks(int ticks) {} +static void irqtime_account_process_tick(struct task_struct *p, int user_tick, + struct rq *rq) {} +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +/* + * Account a single tick of cpu time. 
+ * @p: the process that the cpu time gets accounted to + * @user_tick: indicates if the tick is a user or a system tick + */ +void account_process_tick(struct task_struct *p, int user_tick) +{ + cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + struct rq *rq = this_rq(); + + if (sched_clock_irqtime) { + irqtime_account_process_tick(p, user_tick, rq); + return; + } + + if (steal_account_process_tick()) + return; + + if (user_tick) + account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); + else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) + account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, + one_jiffy_scaled); + else + account_idle_time(cputime_one_jiffy); +} + +/* + * Account multiple ticks of steal time. + * @p: the process from which the cpu time has been stolen + * @ticks: number of stolen ticks + */ +void account_steal_ticks(unsigned long ticks) +{ + account_steal_time(jiffies_to_cputime(ticks)); +} + +/* + * Account multiple ticks of idle time. + * @ticks: number of idle ticks + */ +void account_idle_ticks(unsigned long ticks) +{ + + if (sched_clock_irqtime) { + irqtime_account_idle_ticks(ticks); + return; + } + + account_idle_time(jiffies_to_cputime(ticks)); +} + +#endif + +/* + * Use precise platform statistics if available: + */ +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + *ut = p->utime; + *st = p->stime; +} + +void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime; + + thread_group_cputime(p, &cputime); + + *ut = cputime.utime; + *st = cputime.stime; +} + +/* + * Archs that account the whole time spent in the idle task + * (outside irq) as idle time can rely on this and just implement + * vtime_account_system() and vtime_account_idle(). 
Archs that + * have other meaning of the idle time (s390 only includes the + * time spent by the CPU when it's in low power mode) must override + * vtime_account(). + */ +#ifndef __ARCH_HAS_VTIME_ACCOUNT +void vtime_account(struct task_struct *tsk) +{ + unsigned long flags; + + local_irq_save(flags); + + if (in_interrupt() || !is_idle_task(tsk)) + vtime_account_system(tsk); + else + vtime_account_idle(tsk); + + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(vtime_account); +#endif /* __ARCH_HAS_VTIME_ACCOUNT */ + +#else + +#ifndef nsecs_to_cputime +# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) +#endif + +static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) +{ + u64 temp = (__force u64) rtime; + + temp *= (__force u64) utime; + + if (sizeof(cputime_t) == 4) + temp = div_u64(temp, (__force u32) total); + else + temp = div64_u64(temp, (__force u64) total); + + return (__force cputime_t) temp; +} + +void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + cputime_t rtime, utime = p->utime, total = utime + p->stime; + + /* + * Use CFS's precise accounting: + */ + rtime = nsecs_to_cputime(p->se.sum_exec_runtime); + + if (total) + utime = scale_utime(utime, rtime, total); + else + utime = rtime; + + /* + * Compare with previous values, to keep monotonicity: + */ + p->prev_utime = max(p->prev_utime, utime); + p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); + + *ut = p->prev_utime; + *st = p->prev_stime; +} + +/* + * Must be called with siglock held. 
+ */ +void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct signal_struct *sig = p->signal; + struct task_cputime cputime; + cputime_t rtime, utime, total; + + thread_group_cputime(p, &cputime); + + total = cputime.utime + cputime.stime; + rtime = nsecs_to_cputime(cputime.sum_exec_runtime); + + if (total) + utime = scale_utime(cputime.utime, rtime, total); + else + utime = rtime; + + sig->prev_utime = max(sig->prev_utime, utime); + sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); + + *ut = sig->prev_utime; + *st = sig->prev_stime; +} +#endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 96e2b18b6283..6b800a14b990 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -597,7 +597,7 @@ calc_delta_fair(unsigned long delta, struct sched_entity *se) /* * The idea is to set a period in which each task runs once. * - * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch + * When there are too many tasks (sched_nr_latency) we have to stretch * this period because otherwise the slices get too small. * * p = (nr <= nl) ? l : l*nr/nl @@ -2700,7 +2700,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) int prev_cpu = task_cpu(p); int new_cpu = cpu; int want_affine = 0; - int want_sd = 1; int sync = wake_flags & WF_SYNC; if (p->nr_cpus_allowed == 1) @@ -2718,48 +2717,21 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) continue; /* - * If power savings logic is enabled for a domain, see if we - * are not overloaded, if so, don't balance wider. 
- */ - if (tmp->flags & (SD_PREFER_LOCAL)) { - unsigned long power = 0; - unsigned long nr_running = 0; - unsigned long capacity; - int i; - - for_each_cpu(i, sched_domain_span(tmp)) { - power += power_of(i); - nr_running += cpu_rq(i)->cfs.nr_running; - } - - capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); - - if (nr_running < capacity) - want_sd = 0; - } - - /* * If both cpu and prev_cpu are part of this domain, * cpu is a valid SD_WAKE_AFFINE target. */ if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { affine_sd = tmp; - want_affine = 0; - } - - if (!want_sd && !want_affine) break; + } - if (!(tmp->flags & sd_flag)) - continue; - - if (want_sd) + if (tmp->flags & sd_flag) sd = tmp; } if (affine_sd) { - if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) + if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) prev_cpu = cpu; new_cpu = select_idle_sibling(p, prev_cpu); @@ -4295,7 +4267,7 @@ redo: goto out_balanced; } - BUG_ON(busiest == this_rq); + BUG_ON(busiest == env.dst_rq); schedstat_add(sd, lb_imbalance[idle], env.imbalance); @@ -4316,7 +4288,7 @@ redo: update_h_load(env.src_cpu); more_balance: local_irq_save(flags); - double_rq_lock(this_rq, busiest); + double_rq_lock(env.dst_rq, busiest); /* * cur_ld_moved - load moved in current iteration @@ -4324,7 +4296,7 @@ more_balance: */ cur_ld_moved = move_tasks(&env); ld_moved += cur_ld_moved; - double_rq_unlock(this_rq, busiest); + double_rq_unlock(env.dst_rq, busiest); local_irq_restore(flags); if (env.flags & LBF_NEED_BREAK) { @@ -4360,8 +4332,7 @@ more_balance: if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && lb_iterations++ < max_lb_iterations) { - this_rq = cpu_rq(env.new_dst_cpu); - env.dst_rq = this_rq; + env.dst_rq = cpu_rq(env.new_dst_cpu); env.dst_cpu = env.new_dst_cpu; env.flags &= ~LBF_SOME_PINNED; env.loop = 0; @@ -4646,7 +4617,7 @@ static void nohz_balancer_kick(int cpu) return; } -static inline void 
clear_nohz_tick_stopped(int cpu) +static inline void nohz_balance_exit_idle(int cpu) { if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); @@ -4686,28 +4657,23 @@ void set_cpu_sd_state_idle(void) } /* - * This routine will record that this cpu is going idle with tick stopped. + * This routine will record that the cpu is going idle with tick stopped. * This info will be used in performing idle load balancing in the future. */ -void select_nohz_load_balancer(int stop_tick) +void nohz_balance_enter_idle(int cpu) { - int cpu = smp_processor_id(); - /* * If this cpu is going down, then nothing needs to be done. */ if (!cpu_active(cpu)) return; - if (stop_tick) { - if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) - return; + if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) + return; - cpumask_set_cpu(cpu, nohz.idle_cpus_mask); - atomic_inc(&nohz.nr_cpus); - set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); - } - return; + cpumask_set_cpu(cpu, nohz.idle_cpus_mask); + atomic_inc(&nohz.nr_cpus); + set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); } static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, @@ -4715,7 +4681,7 @@ static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, { switch (action & ~CPU_TASKS_FROZEN) { case CPU_DYING: - clear_nohz_tick_stopped(smp_processor_id()); + nohz_balance_exit_idle(smp_processor_id()); return NOTIFY_OK; default: return NOTIFY_DONE; @@ -4837,14 +4803,15 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) if (need_resched()) break; - raw_spin_lock_irq(&this_rq->lock); - update_rq_clock(this_rq); - update_idle_cpu_load(this_rq); - raw_spin_unlock_irq(&this_rq->lock); + rq = cpu_rq(balance_cpu); + + raw_spin_lock_irq(&rq->lock); + update_rq_clock(rq); + update_idle_cpu_load(rq); + raw_spin_unlock_irq(&rq->lock); rebalance_domains(balance_cpu, CPU_IDLE); - rq = cpu_rq(balance_cpu); if (time_after(this_rq->next_balance, rq->next_balance)) 
this_rq->next_balance = rq->next_balance; } @@ -4875,7 +4842,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) * busy tick after returning from idle, we will update the busy stats. */ set_cpu_sd_state_busy(); - clear_nohz_tick_stopped(cpu); + nohz_balance_exit_idle(cpu); /* * None are in tickless mode and hence no need for NOHZ idle load diff --git a/kernel/sched/features.h b/kernel/sched/features.h index de00a486c5c6..eebefcad7027 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -12,14 +12,6 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) SCHED_FEAT(START_DEBIT, true) /* - * Based on load and program behaviour, see if it makes sense to place - * a newly woken task on the same cpu as the task that woke it -- - * improve cache locality. Typically used with SYNC wakeups as - * generated by pipes and the like, see also SYNC_WAKEUPS. - */ -SCHED_FEAT(AFFINE_WAKEUPS, true) - -/* * Prefer to schedule the task we woke last (assuming it failed * wakeup-preemption), since its likely going to consume data we * touched, increases cache locality. 
@@ -42,7 +34,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true) /* * Use arch dependent cpu power functions */ -SCHED_FEAT(ARCH_POWER, false) +SCHED_FEAT(ARCH_POWER, true) SCHED_FEAT(HRTICK, false) SCHED_FEAT(DOUBLE_TICK, false) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e0b7ba9c040f..418feb01344e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1632,11 +1632,6 @@ static int push_rt_task(struct rq *rq) if (!next_task) return 0; -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - if (unlikely(task_running(rq, next_task))) - return 0; -#endif - retry: if (unlikely(next_task == rq->curr)) { WARN_ON(1); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0848fa36c383..7a7db09cfabc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -737,11 +737,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) */ next->on_cpu = 1; #endif -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - raw_spin_unlock_irq(&rq->lock); -#else raw_spin_unlock(&rq->lock); -#endif } static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) @@ -755,9 +751,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) smp_wmb(); prev->on_cpu = 0; #endif -#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW local_irq_enable(); -#endif } #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ @@ -891,6 +885,9 @@ struct cpuacct { struct kernel_cpustat __percpu *cpustat; }; +extern struct cgroup_subsys cpuacct_subsys; +extern struct cpuacct root_cpuacct; + /* return cpu accounting group corresponding to this container */ static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) { @@ -917,6 +914,16 @@ extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} #endif +#ifdef CONFIG_PARAVIRT +static inline u64 steal_ticks(u64 steal) +{ + if (unlikely(steal > NSEC_PER_SEC)) + return div_u64(steal, TICK_NSEC); + + return __iter_div_u64_rem(steal, TICK_NSEC, &steal); +} 
+#endif + static inline void inc_nr_running(struct rq *rq) { rq->nr_running++; @@ -1156,3 +1163,53 @@ enum rq_nohz_flag_bits { #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) #endif + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + +DECLARE_PER_CPU(u64, cpu_hardirq_time); +DECLARE_PER_CPU(u64, cpu_softirq_time); + +#ifndef CONFIG_64BIT +DECLARE_PER_CPU(seqcount_t, irq_time_seq); + +static inline void irq_time_write_begin(void) +{ + __this_cpu_inc(irq_time_seq.sequence); + smp_wmb(); +} + +static inline void irq_time_write_end(void) +{ + smp_wmb(); + __this_cpu_inc(irq_time_seq.sequence); +} + +static inline u64 irq_time_read(int cpu) +{ + u64 irq_time; + unsigned seq; + + do { + seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); + irq_time = per_cpu(cpu_softirq_time, cpu) + + per_cpu(cpu_hardirq_time, cpu); + } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); + + return irq_time; +} +#else /* CONFIG_64BIT */ +static inline void irq_time_write_begin(void) +{ +} + +static inline void irq_time_write_end(void) +{ +} + +static inline u64 irq_time_read(int cpu) +{ + return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); +} +#endif /* CONFIG_64BIT */ +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + diff --git a/kernel/signal.c b/kernel/signal.c index be4f856d52f8..0af8868525d6 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -17,6 +17,7 @@ #include <linux/fs.h> #include <linux/tty.h> #include <linux/binfmts.h> +#include <linux/coredump.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/ptrace.h> @@ -1971,13 +1972,8 @@ static void ptrace_do_notify(int signr, int exit_code, int why) void ptrace_notify(int exit_code) { BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); - if (unlikely(current->task_works)) { - if (test_and_clear_ti_thread_flag(current_thread_info(), - TIF_NOTIFY_RESUME)) { - smp_mb__after_clear_bit(); - task_work_run(); - } - } + if (unlikely(current->task_works)) + task_work_run(); 
spin_lock_irq(&current->sighand->siglock); ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED); @@ -2198,13 +2194,8 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct signal_struct *signal = current->signal; int signr; - if (unlikely(current->task_works)) { - if (test_and_clear_ti_thread_flag(current_thread_info(), - TIF_NOTIFY_RESUME)) { - smp_mb__after_clear_bit(); - task_work_run(); - } - } + if (unlikely(current->task_works)) + task_work_run(); if (unlikely(uprobe_deny_signal())) return 0; @@ -2369,7 +2360,7 @@ relock: * first and our do_group_exit call below will use * that value and ignore the one we pass it. */ - do_coredump(info->si_signo, info->si_signo, regs); + do_coredump(info, regs); } /* diff --git a/kernel/softirq.c b/kernel/softirq.c index 5c6a5bd8462f..cc96bdc0c2c9 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -221,7 +221,7 @@ asmlinkage void __do_softirq(void) current->flags &= ~PF_MEMALLOC; pending = local_softirq_pending(); - account_system_vtime(current); + vtime_account(current); __local_bh_disable((unsigned long)__builtin_return_address(0), SOFTIRQ_OFFSET); @@ -272,7 +272,7 @@ restart: lockdep_softirq_exit(); - account_system_vtime(current); + vtime_account(current); __local_bh_enable(SOFTIRQ_OFFSET); tsk_restore_flags(current, old_flags, PF_MEMALLOC); } @@ -341,7 +341,7 @@ static inline void invoke_softirq(void) */ void irq_exit(void) { - account_system_vtime(current); + vtime_account(current); trace_hardirq_exit(); sub_preempt_count(IRQ_EXIT_OFFSET); if (!in_interrupt() && local_softirq_pending()) diff --git a/kernel/srcu.c b/kernel/srcu.c index 2095be3318d5..97c465ebd844 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -379,7 +379,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, rcu_batch_queue(&sp->batch_queue, head); if (!sp->running) { sp->running = true; - queue_delayed_work(system_nrt_wq, &sp->work, 0); + schedule_delayed_work(&sp->work, 0); } 
spin_unlock_irqrestore(&sp->queue_lock, flags); } @@ -631,7 +631,7 @@ static void srcu_reschedule(struct srcu_struct *sp) } if (pending) - queue_delayed_work(system_nrt_wq, &sp->work, SRCU_INTERVAL); + schedule_delayed_work(&sp->work, SRCU_INTERVAL); } /* diff --git a/kernel/sys.c b/kernel/sys.c index 241507f23eca..c5cb5b99cb81 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -368,6 +368,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier); void kernel_restart(char *cmd) { kernel_restart_prepare(cmd); + disable_nonboot_cpus(); if (!cmd) printk(KERN_EMERG "Restarting system.\n"); else @@ -1788,15 +1789,15 @@ SYSCALL_DEFINE1(umask, int, mask) #ifdef CONFIG_CHECKPOINT_RESTORE static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { - struct file *exe_file; + struct fd exe; struct dentry *dentry; int err; - exe_file = fget(fd); - if (!exe_file) + exe = fdget(fd); + if (!exe.file) return -EBADF; - dentry = exe_file->f_path.dentry; + dentry = exe.file->f_path.dentry; /* * Because the original mm->exe_file points to executable file, make @@ -1805,7 +1806,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) */ err = -EACCES; if (!S_ISREG(dentry->d_inode->i_mode) || - exe_file->f_path.mnt->mnt_flags & MNT_NOEXEC) + exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC) goto exit; err = inode_permission(dentry->d_inode, MAY_EXEC); @@ -1839,12 +1840,12 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) goto exit_unlock; err = 0; - set_mm_exe_file(mm, exe_file); + set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */ exit_unlock: up_write(&mm->mmap_sem); exit: - fput(exe_file); + fdput(exe); return err; } @@ -2204,7 +2205,7 @@ static int __orderly_poweroff(void) return -ENOMEM; } - ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT, + ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC, NULL, argv_cleanup, NULL); if (ret == -ENOMEM) argv_free(argv); diff --git a/kernel/sysctl.c 
b/kernel/sysctl.c index 87174ef59161..26f65eaa01f9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -97,10 +97,12 @@ extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern int max_threads; -extern int core_uses_pid; extern int suid_dumpable; +#ifdef CONFIG_COREDUMP +extern int core_uses_pid; extern char core_pattern[]; extern unsigned int core_pipe_limit; +#endif extern int pid_max; extern int min_free_kbytes; extern int pid_max_min, pid_max_max; @@ -177,8 +179,10 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +#ifdef CONFIG_COREDUMP static int proc_dostring_coredump(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +#endif #ifdef CONFIG_MAGIC_SYSRQ /* Note: sysrq code uses it's own private copy */ @@ -307,7 +311,7 @@ static struct ctl_table kern_table[] = { .extra2 = &max_sched_tunable_scaling, }, { - .procname = "sched_migration_cost", + .procname = "sched_migration_cost_ns", .data = &sysctl_sched_migration_cost, .maxlen = sizeof(unsigned int), .mode = 0644, @@ -321,14 +325,14 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, { - .procname = "sched_time_avg", + .procname = "sched_time_avg_ms", .data = &sysctl_sched_time_avg, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec, }, { - .procname = "sched_shares_window", + .procname = "sched_shares_window_ns", .data = &sysctl_sched_shares_window, .maxlen = sizeof(unsigned int), .mode = 0644, @@ -404,6 +408,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_COREDUMP { .procname = "core_uses_pid", .data = &core_uses_pid, @@ -425,6 +430,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#endif #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", @@ 
-1543,8 +1549,7 @@ static struct ctl_table fs_table[] = { }; static struct ctl_table debug_table[] = { -#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ - defined(CONFIG_S390) || defined(CONFIG_TILE) +#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE { .procname = "exception-trace", .data = &show_unhandled_signals, @@ -2036,12 +2041,14 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, static void validate_coredump_safety(void) { +#ifdef CONFIG_COREDUMP if (suid_dumpable == SUID_DUMPABLE_SAFE && core_pattern[0] != '/' && core_pattern[0] != '|') { printk(KERN_WARNING "Unsafe core_pattern used with "\ "suid_dumpable=2. Pipe handler or fully qualified "\ "core dump path required.\n"); } +#endif } static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, @@ -2053,6 +2060,7 @@ static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, return error; } +#ifdef CONFIG_COREDUMP static int proc_dostring_coredump(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -2061,6 +2069,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, validate_coredump_safety(); return error; } +#endif static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, void __user *buffer, diff --git a/kernel/task_work.c b/kernel/task_work.c index d320d44903bd..65bd3c92d6f3 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -2,26 +2,20 @@ #include <linux/task_work.h> #include <linux/tracehook.h> +static struct callback_head work_exited; /* all we need is ->next == NULL */ + int -task_work_add(struct task_struct *task, struct callback_head *twork, bool notify) +task_work_add(struct task_struct *task, struct callback_head *work, bool notify) { - struct callback_head *last, *first; - unsigned long flags; + struct callback_head *head; - /* - * Not inserting the new work if the task has already passed - * exit_task_work() is the responisbility of 
callers. - */ - raw_spin_lock_irqsave(&task->pi_lock, flags); - last = task->task_works; - first = last ? last->next : twork; - twork->next = first; - if (last) - last->next = twork; - task->task_works = twork; - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + do { + head = ACCESS_ONCE(task->task_works); + if (unlikely(head == &work_exited)) + return -ESRCH; + work->next = head; + } while (cmpxchg(&task->task_works, head, work) != head); - /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). */ if (notify) set_notify_resume(task); return 0; @@ -30,52 +24,69 @@ task_work_add(struct task_struct *task, struct callback_head *twork, bool notify struct callback_head * task_work_cancel(struct task_struct *task, task_work_func_t func) { + struct callback_head **pprev = &task->task_works; + struct callback_head *work = NULL; unsigned long flags; - struct callback_head *last, *res = NULL; - + /* + * If cmpxchg() fails we continue without updating pprev. + * Either we raced with task_work_add() which added the + * new entry before this work, we will find it again. Or + * we raced with task_work_run(), *pprev == NULL/exited. + */ raw_spin_lock_irqsave(&task->pi_lock, flags); - last = task->task_works; - if (last) { - struct callback_head *q = last, *p = q->next; - while (1) { - if (p->func == func) { - q->next = p->next; - if (p == last) - task->task_works = q == p ? 
NULL : q; - res = p; - break; - } - if (p == last) - break; - q = p; - p = q->next; - } + while ((work = ACCESS_ONCE(*pprev))) { + read_barrier_depends(); + if (work->func != func) + pprev = &work->next; + else if (cmpxchg(pprev, work, work->next) == work) + break; } raw_spin_unlock_irqrestore(&task->pi_lock, flags); - return res; + + return work; } void task_work_run(void) { struct task_struct *task = current; - struct callback_head *p, *q; + struct callback_head *work, *head, *next; + + for (;;) { + /* + * work->func() can do task_work_add(), do not set + * work_exited unless the list is empty. + */ + do { + work = ACCESS_ONCE(task->task_works); + head = !work && (task->flags & PF_EXITING) ? + &work_exited : NULL; + } while (cmpxchg(&task->task_works, work, head) != work); - while (1) { - raw_spin_lock_irq(&task->pi_lock); - p = task->task_works; - task->task_works = NULL; - raw_spin_unlock_irq(&task->pi_lock); + if (!work) + break; + /* + * Synchronize with task_work_cancel(). It can't remove + * the first entry == work, cmpxchg(task_works) should + * fail, but it can play with *work and other entries. 
+ */ + raw_spin_unlock_wait(&task->pi_lock); + smp_mb(); - if (unlikely(!p)) - return; + /* Reverse the list to run the works in fifo order */ + head = NULL; + do { + next = work->next; + work->next = head; + head = work; + work = next; + } while (work); - q = p->next; /* head */ - p->next = NULL; /* cut it */ - while (q) { - p = q->next; - q->func(q); - q = p; + work = head; + do { + next = work->next; + work->func(work); + work = next; cond_resched(); - } + } while (work); } } diff --git a/kernel/taskstats.c b/kernel/taskstats.c index d0a32796550f..145bb4d3bd4d 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -27,6 +27,7 @@ #include <linux/cgroup.h> #include <linux/fs.h> #include <linux/file.h> +#include <linux/pid_namespace.h> #include <net/genetlink.h> #include <linux/atomic.h> @@ -174,7 +175,9 @@ static void send_cpu_listeners(struct sk_buff *skb, up_write(&listeners->sem); } -static void fill_stats(struct task_struct *tsk, struct taskstats *stats) +static void fill_stats(struct user_namespace *user_ns, + struct pid_namespace *pid_ns, + struct task_struct *tsk, struct taskstats *stats) { memset(stats, 0, sizeof(*stats)); /* @@ -190,7 +193,7 @@ static void fill_stats(struct task_struct *tsk, struct taskstats *stats) stats->version = TASKSTATS_VERSION; stats->nvcsw = tsk->nvcsw; stats->nivcsw = tsk->nivcsw; - bacct_add_tsk(stats, tsk); + bacct_add_tsk(user_ns, pid_ns, stats, tsk); /* fill in extended acct fields */ xacct_add_tsk(stats, tsk); @@ -207,7 +210,7 @@ static int fill_stats_for_pid(pid_t pid, struct taskstats *stats) rcu_read_unlock(); if (!tsk) return -ESRCH; - fill_stats(tsk, stats); + fill_stats(current_user_ns(), task_active_pid_ns(current), tsk, stats); put_task_struct(tsk); return 0; } @@ -291,6 +294,12 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) if (!cpumask_subset(mask, cpu_possible_mask)) return -EINVAL; + if (current_user_ns() != &init_user_ns) + return -EINVAL; + + if 
(task_active_pid_ns(current) != &init_pid_ns) + return -EINVAL; + if (isadd == REGISTER) { for_each_cpu(cpu, mask) { s = kmalloc_node(sizeof(struct listener), @@ -415,16 +424,15 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) struct nlattr *na; size_t size; u32 fd; - struct file *file; - int fput_needed; + struct fd f; na = info->attrs[CGROUPSTATS_CMD_ATTR_FD]; if (!na) return -EINVAL; fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); - file = fget_light(fd, &fput_needed); - if (!file) + f = fdget(fd); + if (!f.file) return 0; size = nla_total_size(sizeof(struct cgroupstats)); @@ -437,6 +445,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, sizeof(struct cgroupstats)); if (na == NULL) { + nlmsg_free(rep_skb); rc = -EMSGSIZE; goto err; } @@ -444,7 +453,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) stats = nla_data(na); memset(stats, 0, sizeof(*stats)); - rc = cgroupstats_build(stats, file->f_dentry); + rc = cgroupstats_build(stats, f.file->f_dentry); if (rc < 0) { nlmsg_free(rep_skb); goto err; @@ -453,7 +462,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) rc = send_reply(rep_skb, info); err: - fput_light(file, fput_needed); + fdput(f); return rc; } @@ -467,7 +476,7 @@ static int cmd_attr_register_cpumask(struct genl_info *info) rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); if (rc < 0) goto out; - rc = add_del_listener(info->snd_pid, mask, REGISTER); + rc = add_del_listener(info->snd_portid, mask, REGISTER); out: free_cpumask_var(mask); return rc; @@ -483,7 +492,7 @@ static int cmd_attr_deregister_cpumask(struct genl_info *info) rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); if (rc < 0) goto out; - rc = add_del_listener(info->snd_pid, mask, DEREGISTER); + rc = add_del_listener(info->snd_portid, mask, DEREGISTER); out: 
free_cpumask_var(mask); return rc; @@ -631,11 +640,12 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) if (rc < 0) return; - stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid); + stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, + task_pid_nr_ns(tsk, &init_pid_ns)); if (!stats) goto err; - fill_stats(tsk, stats); + fill_stats(&init_user_ns, &init_pid_ns, tsk, stats); /* * Doesn't matter if tsk is the leader or the last group member leaving @@ -643,7 +653,8 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) if (!is_thread_group || !group_dead) goto send; - stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid); + stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, + task_tgid_nr_ns(tsk, &init_pid_ns)); if (!stats) goto err; diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 7e1ce012a851..30b6de0d977c 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -397,6 +397,30 @@ void clockevents_exchange_device(struct clock_event_device *old, local_irq_restore(flags); } +/** + * clockevents_suspend - suspend clock devices + */ +void clockevents_suspend(void) +{ + struct clock_event_device *dev; + + list_for_each_entry_reverse(dev, &clockevent_devices, list) + if (dev->suspend) + dev->suspend(dev); +} + +/** + * clockevents_resume - resume clock devices + */ +void clockevents_resume(void) +{ + struct clock_event_device *dev; + + list_for_each_entry(dev, &clockevent_devices, list) + if (dev->resume) + dev->resume(dev); +} + #ifdef CONFIG_GENERIC_CLOCKEVENTS /** * clockevents_notify - notification about relevant events diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index cf5f6b262673..f423bdd035c2 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -372,7 +372,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, * the scheduler tick in nohz_restart_sched_tick. 
*/ if (!ts->tick_stopped) { - select_nohz_load_balancer(1); + nohz_balance_enter_idle(cpu); calc_load_enter_idle(); ts->last_tick = hrtimer_get_expires(&ts->sched_timer); @@ -570,7 +570,6 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { /* Update jiffies first */ - select_nohz_load_balancer(0); tick_do_update_jiffies64(now); update_cpu_load_nohz(); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d3b91e75cecd..5ce06a3fa91e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -776,6 +776,7 @@ static void timekeeping_resume(void) read_persistent_clock(&ts); + clockevents_resume(); clocksource_resume(); write_seqlock_irqsave(&tk->lock, flags); @@ -835,6 +836,7 @@ static int timekeeping_suspend(void) clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); clocksource_suspend(); + clockevents_suspend(); return 0; } diff --git a/kernel/timer.c b/kernel/timer.c index 8c5e7b908c68..d5de1b2292aa 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -92,24 +92,25 @@ static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; /* Functions below help us manage 'deferrable' flag */ static inline unsigned int tbase_get_deferrable(struct tvec_base *base) { - return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); + return ((unsigned int)(unsigned long)base & TIMER_DEFERRABLE); } -static inline struct tvec_base *tbase_get_base(struct tvec_base *base) +static inline unsigned int tbase_get_irqsafe(struct tvec_base *base) { - return ((struct tvec_base *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); + return ((unsigned int)(unsigned long)base & TIMER_IRQSAFE); } -static inline void timer_set_deferrable(struct timer_list *timer) +static inline struct tvec_base *tbase_get_base(struct tvec_base *base) { - timer->base = TBASE_MAKE_DEFERRED(timer->base); + return ((struct tvec_base *)((unsigned long)base & ~TIMER_FLAG_MASK)); 
} static inline void timer_set_base(struct timer_list *timer, struct tvec_base *new_base) { - timer->base = (struct tvec_base *)((unsigned long)(new_base) | - tbase_get_deferrable(timer->base)); + unsigned long flags = (unsigned long)timer->base & TIMER_FLAG_MASK; + + timer->base = (struct tvec_base *)((unsigned long)(new_base) | flags); } static unsigned long round_jiffies_common(unsigned long j, int cpu, @@ -563,16 +564,14 @@ static inline void debug_timer_assert_init(struct timer_list *timer) debug_object_assert_init(timer, &timer_debug_descr); } -static void __init_timer(struct timer_list *timer, - const char *name, - struct lock_class_key *key); +static void do_init_timer(struct timer_list *timer, unsigned int flags, + const char *name, struct lock_class_key *key); -void init_timer_on_stack_key(struct timer_list *timer, - const char *name, - struct lock_class_key *key) +void init_timer_on_stack_key(struct timer_list *timer, unsigned int flags, + const char *name, struct lock_class_key *key) { debug_object_init_on_stack(timer, &timer_debug_descr); - __init_timer(timer, name, key); + do_init_timer(timer, flags, name, key); } EXPORT_SYMBOL_GPL(init_timer_on_stack_key); @@ -613,12 +612,13 @@ static inline void debug_assert_init(struct timer_list *timer) debug_timer_assert_init(timer); } -static void __init_timer(struct timer_list *timer, - const char *name, - struct lock_class_key *key) +static void do_init_timer(struct timer_list *timer, unsigned int flags, + const char *name, struct lock_class_key *key) { + struct tvec_base *base = __raw_get_cpu_var(tvec_bases); + timer->entry.next = NULL; - timer->base = __raw_get_cpu_var(tvec_bases); + timer->base = (void *)((unsigned long)base | flags); timer->slack = -1; #ifdef CONFIG_TIMER_STATS timer->start_site = NULL; @@ -628,22 +628,10 @@ static void __init_timer(struct timer_list *timer, lockdep_init_map(&timer->lockdep_map, name, key, 0); } -void setup_deferrable_timer_on_stack_key(struct timer_list *timer, - const 
char *name, - struct lock_class_key *key, - void (*function)(unsigned long), - unsigned long data) -{ - timer->function = function; - timer->data = data; - init_timer_on_stack_key(timer, name, key); - timer_set_deferrable(timer); -} -EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key); - /** * init_timer_key - initialize a timer * @timer: the timer to be initialized + * @flags: timer flags * @name: name of the timer * @key: lockdep class key of the fake lock used for tracking timer * sync lock dependencies @@ -651,24 +639,14 @@ EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key); * init_timer_key() must be done to a timer prior calling *any* of the * other timer functions. */ -void init_timer_key(struct timer_list *timer, - const char *name, - struct lock_class_key *key) +void init_timer_key(struct timer_list *timer, unsigned int flags, + const char *name, struct lock_class_key *key) { debug_init(timer); - __init_timer(timer, name, key); + do_init_timer(timer, flags, name, key); } EXPORT_SYMBOL(init_timer_key); -void init_timer_deferrable_key(struct timer_list *timer, - const char *name, - struct lock_class_key *key) -{ - init_timer_key(timer, name, key); - timer_set_deferrable(timer); -} -EXPORT_SYMBOL(init_timer_deferrable_key); - static inline void detach_timer(struct timer_list *timer, bool clear_pending) { struct list_head *entry = &timer->entry; @@ -686,7 +664,7 @@ detach_expired_timer(struct timer_list *timer, struct tvec_base *base) { detach_timer(timer, true); if (!tbase_get_deferrable(timer->base)) - timer->base->active_timers--; + base->active_timers--; } static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, @@ -697,7 +675,7 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, detach_timer(timer, clear_pending); if (!tbase_get_deferrable(timer->base)) { - timer->base->active_timers--; + base->active_timers--; if (timer->expires == base->next_timer) base->next_timer = base->timer_jiffies; } @@ 
-1029,14 +1007,14 @@ EXPORT_SYMBOL(try_to_del_timer_sync); * * Synchronization rules: Callers must prevent restarting of the timer, * otherwise this function is meaningless. It must not be called from - * interrupt contexts. The caller must not hold locks which would prevent - * completion of the timer's handler. The timer's handler must not call - * add_timer_on(). Upon exit the timer is not queued and the handler is - * not running on any CPU. + * interrupt contexts unless the timer is an irqsafe one. The caller must + * not hold locks which would prevent completion of the timer's + * handler. The timer's handler must not call add_timer_on(). Upon exit the + * timer is not queued and the handler is not running on any CPU. * - * Note: You must not hold locks that are held in interrupt context - * while calling this function. Even if the lock has nothing to do - * with the timer in question. Here's why: + * Note: For !irqsafe timers, you must not hold locks that are held in + * interrupt context while calling this function. Even if the lock has + * nothing to do with the timer in question. Here's why: * * CPU0 CPU1 * ---- ---- @@ -1073,7 +1051,7 @@ int del_timer_sync(struct timer_list *timer) * don't use it in hardirq context, because it * could lead to deadlock. 
*/ - WARN_ON(in_irq()); + WARN_ON(in_irq() && !tbase_get_irqsafe(timer->base)); for (;;) { int ret = try_to_del_timer_sync(timer); if (ret >= 0) @@ -1180,19 +1158,27 @@ static inline void __run_timers(struct tvec_base *base) while (!list_empty(head)) { void (*fn)(unsigned long); unsigned long data; + bool irqsafe; timer = list_first_entry(head, struct timer_list,entry); fn = timer->function; data = timer->data; + irqsafe = tbase_get_irqsafe(timer->base); timer_stats_account_timer(timer); base->running_timer = timer; detach_expired_timer(timer, base); - spin_unlock_irq(&base->lock); - call_timer_fn(timer, fn, data); - spin_lock_irq(&base->lock); + if (irqsafe) { + spin_unlock(&base->lock); + call_timer_fn(timer, fn, data); + spin_lock(&base->lock); + } else { + spin_unlock_irq(&base->lock); + call_timer_fn(timer, fn, data); + spin_lock_irq(&base->lock); + } } } base->running_timer = NULL; @@ -1791,9 +1777,13 @@ static struct notifier_block __cpuinitdata timers_nb = { void __init init_timers(void) { - int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); + int err; + + /* ensure there are enough low bits for flags in timer->base pointer */ + BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK); + err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); init_timer_stats(); BUG_ON(err != NOTIFY_OK); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8c4c07071cc5..4cea4f41c1d9 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -49,6 +49,11 @@ config HAVE_SYSCALL_TRACEPOINTS help See Documentation/trace/ftrace-design.txt +config HAVE_FENTRY + bool + help + Arch supports the gcc options -pg with -mfentry + config HAVE_C_RECORDMCOUNT bool help @@ -57,8 +62,12 @@ config HAVE_C_RECORDMCOUNT config TRACER_MAX_TRACE bool +config TRACE_CLOCK + bool + config RING_BUFFER bool + select TRACE_CLOCK config FTRACE_NMI_ENTER bool @@ -109,6 +118,7 @@ 
config TRACING select NOP_TRACER select BINARY_PRINTF select EVENT_TRACING + select TRACE_CLOCK config GENERIC_TRACER bool diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index b831087c8200..d7e2068e4b71 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -5,10 +5,12 @@ ifdef CONFIG_FUNCTION_TRACER ORIG_CFLAGS := $(KBUILD_CFLAGS) KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) +ifdef CONFIG_FTRACE_SELFTEST # selftest needs instrumentation CFLAGS_trace_selftest_dynamic.o = -pg obj-y += trace_selftest_dynamic.o endif +endif # If unlikely tracing is enabled, do not trace these files ifdef CONFIG_TRACING_BRANCHES @@ -17,11 +19,7 @@ endif CFLAGS_trace_events_filter.o := -I$(src) -# -# Make the trace clocks available generally: it's infrastructure -# relied on by ptrace for example: -# -obj-y += trace_clock.o +obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o obj-$(CONFIG_RING_BUFFER) += ring_buffer.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b4f20fba09fc..9dcf15d38380 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -64,12 +64,20 @@ #define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) +static struct ftrace_ops ftrace_list_end __read_mostly = { + .func = ftrace_stub, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, +}; + /* ftrace_enabled is a method to turn ftrace on or off */ int ftrace_enabled __read_mostly; static int last_ftrace_enabled; /* Quick disabling of function tracer. */ -int function_trace_stop; +int function_trace_stop __read_mostly; + +/* Current function tracing op */ +struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; /* List for set_ftrace_pid's pids. 
*/ LIST_HEAD(ftrace_pids); @@ -86,22 +94,43 @@ static int ftrace_disabled __read_mostly; static DEFINE_MUTEX(ftrace_lock); -static struct ftrace_ops ftrace_list_end __read_mostly = { - .func = ftrace_stub, -}; - static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; -static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub; -ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; static struct ftrace_ops control_ops; -static void -ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); +#if ARCH_SUPPORTS_FTRACE_OPS +static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *regs); +#else +/* See comment below, where ftrace_ops_list_func is defined */ +static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); +#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) +#endif + +/** + * ftrace_nr_registered_ops - return number of ops registered + * + * Returns the number of ftrace_ops registered and tracing functions + */ +int ftrace_nr_registered_ops(void) +{ + struct ftrace_ops *ops; + int cnt = 0; + + mutex_lock(&ftrace_lock); + + for (ops = ftrace_ops_list; + ops != &ftrace_list_end; ops = ops->next) + cnt++; + + mutex_unlock(&ftrace_lock); + + return cnt; +} /* * Traverse the ftrace_global_list, invoking all entries. The reason that we @@ -112,29 +141,29 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); * * Silly Alpha and silly pointer-speculation compiler optimizations! 
*/ -static void ftrace_global_list_func(unsigned long ip, - unsigned long parent_ip) +static void +ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *regs) { - struct ftrace_ops *op; - if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) return; trace_recursion_set(TRACE_GLOBAL_BIT); op = rcu_dereference_raw(ftrace_global_list); /*see above*/ while (op != &ftrace_list_end) { - op->func(ip, parent_ip); + op->func(ip, parent_ip, op, regs); op = rcu_dereference_raw(op->next); /*see above*/ }; trace_recursion_clear(TRACE_GLOBAL_BIT); } -static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip) +static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *regs) { if (!test_tsk_trace_trace(current)) return; - ftrace_pid_function(ip, parent_ip); + ftrace_pid_function(ip, parent_ip, op, regs); } static void set_ftrace_pid_function(ftrace_func_t func) @@ -153,25 +182,9 @@ static void set_ftrace_pid_function(ftrace_func_t func) void clear_ftrace_function(void) { ftrace_trace_function = ftrace_stub; - __ftrace_trace_function = ftrace_stub; - __ftrace_trace_function_delay = ftrace_stub; ftrace_pid_function = ftrace_stub; } -#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST -/* - * For those archs that do not test ftrace_trace_stop in their - * mcount call site, we need to do it from C. - */ -static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) -{ - if (function_trace_stop) - return; - - __ftrace_trace_function(ip, parent_ip); -} -#endif - static void control_ops_disable_all(struct ftrace_ops *ops) { int cpu; @@ -230,28 +243,27 @@ static void update_ftrace_function(void) /* * If we are at the end of the list and this ops is - * not dynamic, then have the mcount trampoline call - * the function directly + * recursion safe and not dynamic and the arch supports passing ops, + * then have the mcount trampoline call the function directly. 
*/ if (ftrace_ops_list == &ftrace_list_end || (ftrace_ops_list->next == &ftrace_list_end && - !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC))) + !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) && + (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) && + !FTRACE_FORCE_LIST_FUNC)) { + /* Set the ftrace_ops that the arch callback uses */ + if (ftrace_ops_list == &global_ops) + function_trace_op = ftrace_global_list; + else + function_trace_op = ftrace_ops_list; func = ftrace_ops_list->func; - else + } else { + /* Just use the default ftrace_ops */ + function_trace_op = &ftrace_list_end; func = ftrace_ops_list_func; + } -#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST ftrace_trace_function = func; -#else -#ifdef CONFIG_DYNAMIC_FTRACE - /* do not update till all functions have been modified */ - __ftrace_trace_function_delay = func; -#else - __ftrace_trace_function = func; -#endif - ftrace_trace_function = - (func == ftrace_stub) ? func : ftrace_test_stop_func; -#endif } static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) @@ -325,6 +337,20 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) return -EINVAL; +#ifndef ARCH_SUPPORTS_FTRACE_SAVE_REGS + /* + * If the ftrace_ops specifies SAVE_REGS, then it only can be used + * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set. + * Setting SAVE_REGS_IF_SUPPORTED makes SAVE_REGS irrelevant. 
+ */ + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS && + !(ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED)) + return -EINVAL; + + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED) + ops->flags |= FTRACE_OPS_FL_SAVE_REGS; +#endif + if (!core_kernel_data((unsigned long)ops)) ops->flags |= FTRACE_OPS_FL_DYNAMIC; @@ -773,7 +799,8 @@ ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip) } static void -function_profile_call(unsigned long ip, unsigned long parent_ip) +function_profile_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct pt_regs *regs) { struct ftrace_profile_stat *stat; struct ftrace_profile *rec; @@ -803,7 +830,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip) #ifdef CONFIG_FUNCTION_GRAPH_TRACER static int profile_graph_entry(struct ftrace_graph_ent *trace) { - function_profile_call(trace->func, 0); + function_profile_call(trace->func, 0, NULL, NULL); return 1; } @@ -863,6 +890,7 @@ static void unregister_ftrace_profiler(void) #else static struct ftrace_ops ftrace_profile_ops __read_mostly = { .func = function_profile_call, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static int register_ftrace_profiler(void) @@ -1045,6 +1073,7 @@ static struct ftrace_ops global_ops = { .func = ftrace_stub, .notrace_hash = EMPTY_HASH, .filter_hash = EMPTY_HASH, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static DEFINE_MUTEX(ftrace_regex_lock); @@ -1525,6 +1554,12 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, rec->flags++; if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX)) return; + /* + * If any ops wants regs saved for this function + * then all ops will get saved regs. 
+ */ + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) + rec->flags |= FTRACE_FL_REGS; } else { if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0)) return; @@ -1616,18 +1651,59 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) if (enable && (rec->flags & ~FTRACE_FL_MASK)) flag = FTRACE_FL_ENABLED; + /* + * If enabling and the REGS flag does not match the REGS_EN, then + * do not ignore this record. Set flags to fail the compare against + * ENABLED. + */ + if (flag && + (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN))) + flag |= FTRACE_FL_REGS; + /* If the state of this record hasn't changed, then do nothing */ if ((rec->flags & FTRACE_FL_ENABLED) == flag) return FTRACE_UPDATE_IGNORE; if (flag) { - if (update) + /* Save off if rec is being enabled (for return value) */ + flag ^= rec->flags & FTRACE_FL_ENABLED; + + if (update) { rec->flags |= FTRACE_FL_ENABLED; - return FTRACE_UPDATE_MAKE_CALL; + if (flag & FTRACE_FL_REGS) { + if (rec->flags & FTRACE_FL_REGS) + rec->flags |= FTRACE_FL_REGS_EN; + else + rec->flags &= ~FTRACE_FL_REGS_EN; + } + } + + /* + * If this record is being updated from a nop, then + * return UPDATE_MAKE_CALL. + * Otherwise, if the EN flag is set, then return + * UPDATE_MODIFY_CALL_REGS to tell the caller to convert + * from the non-save regs, to a save regs function. + * Otherwise, + * return UPDATE_MODIFY_CALL to tell the caller to convert + * from the save regs, to a non-save regs function. 
+ */ + if (flag & FTRACE_FL_ENABLED) + return FTRACE_UPDATE_MAKE_CALL; + else if (rec->flags & FTRACE_FL_REGS_EN) + return FTRACE_UPDATE_MODIFY_CALL_REGS; + else + return FTRACE_UPDATE_MODIFY_CALL; } - if (update) - rec->flags &= ~FTRACE_FL_ENABLED; + if (update) { + /* If there's no more users, clear all flags */ + if (!(rec->flags & ~FTRACE_FL_MASK)) + rec->flags = 0; + else + /* Just disable the record (keep REGS state) */ + rec->flags &= ~FTRACE_FL_ENABLED; + } return FTRACE_UPDATE_MAKE_NOP; } @@ -1662,13 +1738,17 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable) static int __ftrace_replace_code(struct dyn_ftrace *rec, int enable) { + unsigned long ftrace_old_addr; unsigned long ftrace_addr; int ret; - ftrace_addr = (unsigned long)FTRACE_ADDR; - ret = ftrace_update_record(rec, enable); + if (rec->flags & FTRACE_FL_REGS) + ftrace_addr = (unsigned long)FTRACE_REGS_ADDR; + else + ftrace_addr = (unsigned long)FTRACE_ADDR; + switch (ret) { case FTRACE_UPDATE_IGNORE: return 0; @@ -1678,6 +1758,15 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) case FTRACE_UPDATE_MAKE_NOP: return ftrace_make_nop(NULL, rec, ftrace_addr); + + case FTRACE_UPDATE_MODIFY_CALL_REGS: + case FTRACE_UPDATE_MODIFY_CALL: + if (rec->flags & FTRACE_FL_REGS) + ftrace_old_addr = (unsigned long)FTRACE_ADDR; + else + ftrace_old_addr = (unsigned long)FTRACE_REGS_ADDR; + + return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); } return -1; /* unknow ftrace bug */ @@ -1882,16 +1971,6 @@ static void ftrace_run_update_code(int command) */ arch_ftrace_update_code(command); -#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST - /* - * For archs that call ftrace_test_stop_func(), we must - * wait till after we update all the function callers - * before we update the callback. This keeps different - * ops that record different functions from corrupting - * each other. 
- */ - __ftrace_trace_function = __ftrace_trace_function_delay; -#endif function_trace_stop--; ret = ftrace_arch_code_modify_post_process(); @@ -2441,8 +2520,9 @@ static int t_show(struct seq_file *m, void *v) seq_printf(m, "%ps", (void *)rec->ip); if (iter->flags & FTRACE_ITER_ENABLED) - seq_printf(m, " (%ld)", - rec->flags & ~FTRACE_FL_MASK); + seq_printf(m, " (%ld)%s", + rec->flags & ~FTRACE_FL_MASK, + rec->flags & FTRACE_FL_REGS ? " R" : ""); seq_printf(m, "\n"); return 0; @@ -2790,8 +2870,8 @@ static int __init ftrace_mod_cmd_init(void) } device_initcall(ftrace_mod_cmd_init); -static void -function_trace_probe_call(unsigned long ip, unsigned long parent_ip) +static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) { struct ftrace_func_probe *entry; struct hlist_head *hhd; @@ -3162,8 +3242,27 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf, } static int -ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, - int reset, int enable) +ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) +{ + struct ftrace_func_entry *entry; + + if (!ftrace_location(ip)) + return -EINVAL; + + if (remove) { + entry = ftrace_lookup_ip(hash, ip); + if (!entry) + return -ENOENT; + free_hash_entry(hash, entry); + return 0; + } + + return add_hash_entry(hash, ip); +} + +static int +ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, + unsigned long ip, int remove, int reset, int enable) { struct ftrace_hash **orig_hash; struct ftrace_hash *hash; @@ -3192,6 +3291,11 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, ret = -EINVAL; goto out_regex_unlock; } + if (ip) { + ret = ftrace_match_addr(hash, ip, remove); + if (ret < 0) + goto out_regex_unlock; + } mutex_lock(&ftrace_lock); ret = ftrace_hash_move(ops, enable, orig_hash, hash); @@ -3208,6 +3312,37 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int 
len, return ret; } +static int +ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, + int reset, int enable) +{ + return ftrace_set_hash(ops, 0, 0, ip, remove, reset, enable); +} + +/** + * ftrace_set_filter_ip - set a function to filter on in ftrace by address + * @ops - the ops to set the filter with + * @ip - the address to add to or remove from the filter. + * @remove - non zero to remove the ip from the filter + * @reset - non zero to reset all filters before applying this filter. + * + * Filters denote which functions should be enabled when tracing is enabled + * If @ip is NULL, it failes to update filter. + */ +int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, + int remove, int reset) +{ + return ftrace_set_addr(ops, ip, remove, reset, 1); +} +EXPORT_SYMBOL_GPL(ftrace_set_filter_ip); + +static int +ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, + int reset, int enable) +{ + return ftrace_set_hash(ops, buf, len, 0, 0, reset, enable); +} + /** * ftrace_set_filter - set a function to filter on in ftrace * @ops - the ops to set the filter with @@ -3912,6 +4047,7 @@ void __init ftrace_init(void) static struct ftrace_ops global_ops = { .func = ftrace_stub, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static int __init ftrace_nodyn_init(void) @@ -3942,10 +4078,9 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) #endif /* CONFIG_DYNAMIC_FTRACE */ static void -ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip) +ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *regs) { - struct ftrace_ops *op; - if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT))) return; @@ -3959,7 +4094,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip) while (op != &ftrace_list_end) { if (!ftrace_function_local_disabled(op) && ftrace_ops_test(op, ip)) - op->func(ip, parent_ip); + op->func(ip, parent_ip, op, regs); op = 
rcu_dereference_raw(op->next); }; @@ -3969,13 +4104,18 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops control_ops = { .func = ftrace_ops_control_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; -static void -ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) +static inline void +__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ignored, struct pt_regs *regs) { struct ftrace_ops *op; + if (function_trace_stop) + return; + if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) return; @@ -3988,13 +4128,39 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) op = rcu_dereference_raw(ftrace_ops_list); while (op != &ftrace_list_end) { if (ftrace_ops_test(op, ip)) - op->func(ip, parent_ip); + op->func(ip, parent_ip, op, regs); op = rcu_dereference_raw(op->next); }; preempt_enable_notrace(); trace_recursion_clear(TRACE_INTERNAL_BIT); } +/* + * Some archs only support passing ip and parent_ip. Even though + * the list function ignores the op parameter, we do not want any + * C side effects, where a function is called without the caller + * sending a third parameter. + * Archs are to support both the regs and ftrace_ops at the same time. + * If they support ftrace_ops, it is assumed they support regs. + * If call backs want to use regs, they must either check for regs + * being NULL, or ARCH_SUPPORTS_FTRACE_SAVE_REGS. + * Note, ARCH_SUPPORT_SAVE_REGS expects a full regs to be saved. + * An architecture can pass partial regs with ftrace_ops and still + * set the ARCH_SUPPORT_FTARCE_OPS. 
+ */ +#if ARCH_SUPPORTS_FTRACE_OPS +static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *regs) +{ + __ftrace_ops_list_func(ip, parent_ip, NULL, regs); +} +#else +static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) +{ + __ftrace_ops_list_func(ip, parent_ip, NULL, NULL); +} +#endif + static void clear_ftrace_swapper(void) { struct task_struct *p; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 49491fa7daa2..b32ed0e385a5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2816,7 +2816,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable); * to the buffer after this will fail and return NULL. * * This is different than ring_buffer_record_disable() as - * it works like an on/off switch, where as the disable() verison + * it works like an on/off switch, where as the disable() version * must be paired with a enable(). */ void ring_buffer_record_off(struct ring_buffer *buffer) @@ -2839,7 +2839,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_off); * ring_buffer_record_off(). * * This is different than ring_buffer_record_enable() as - * it works like an on/off switch, where as the enable() verison + * it works like an on/off switch, where as the enable() version * must be paired with a disable(). 
*/ void ring_buffer_record_on(struct ring_buffer *buffer) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5c38c81496ce..31e4f55773f1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -328,7 +328,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | - TRACE_ITER_IRQ_INFO; + TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS; static int trace_stop_count; static DEFINE_RAW_SPINLOCK(tracing_start_lock); @@ -426,15 +426,15 @@ __setup("trace_buf_size=", set_buf_size); static int __init set_tracing_thresh(char *str) { - unsigned long threshhold; + unsigned long threshold; int ret; if (!str) return 0; - ret = strict_strtoul(str, 0, &threshhold); + ret = strict_strtoul(str, 0, &threshold); if (ret < 0) return 0; - tracing_thresh = threshhold * 1000; + tracing_thresh = threshold * 1000; return 1; } __setup("tracing_thresh=", set_tracing_thresh); @@ -470,6 +470,7 @@ static const char *trace_options[] = { "overwrite", "disable_on_free", "irq-info", + "markers", NULL }; @@ -2060,7 +2061,8 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) seq_puts(m, "# -----------------\n"); seq_printf(m, "# | task: %.16s-%d " "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", - data->comm, data->pid, data->uid, data->nice, + data->comm, data->pid, + from_kuid_munged(seq_user_ns(m), data->uid), data->nice, data->policy, data->rt_priority); seq_puts(m, "# -----------------\n"); @@ -3886,6 +3888,9 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (tracing_disabled) return -EINVAL; + if (!(trace_flags & TRACE_ITER_MARKERS)) + return -EINVAL; + if (cnt > TRACE_BUF_SIZE) cnt = TRACE_BUF_SIZE; @@ -4195,12 +4200,6 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, buf->private = 0; } -static int 
buffer_pipe_buf_steal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - return 1; -} - static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { @@ -4216,7 +4215,7 @@ static const struct pipe_buf_operations buffer_pipe_buf_ops = { .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = buffer_pipe_buf_release, - .steal = buffer_pipe_buf_steal, + .steal = generic_pipe_buf_steal, .get = buffer_pipe_buf_get, }; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 55e1f7f0db12..c15f528c1af4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -147,7 +147,7 @@ struct trace_array_cpu { unsigned long skipped_entries; cycle_t preempt_timestamp; pid_t pid; - uid_t uid; + kuid_t uid; char comm[TASK_COMM_LEN]; }; @@ -472,11 +472,11 @@ extern void trace_find_cmdline(int pid, char comm[]); #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; +#endif #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func extern int DYN_FTRACE_TEST_NAME(void); #define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 extern int DYN_FTRACE_TEST_NAME2(void); -#endif extern int ring_buffer_expanded; extern bool tracing_selftest_disabled; @@ -680,6 +680,7 @@ enum trace_iterator_flags { TRACE_ITER_OVERWRITE = 0x200000, TRACE_ITER_STOP_ON_FREE = 0x400000, TRACE_ITER_IRQ_INFO = 0x800000, + TRACE_ITER_MARKERS = 0x1000000, }; /* diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 8a6d2ee2086c..84b1e045faba 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -258,7 +258,8 @@ EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); #ifdef CONFIG_FUNCTION_TRACER static void -perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip) +perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct pt_regs *pt_regs) { struct ftrace_entry *entry; struct hlist_head *head; diff --git 
a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 29111da1d100..d608d09d08c0 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1199,6 +1199,31 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, return 0; } +static void event_remove(struct ftrace_event_call *call) +{ + ftrace_event_enable_disable(call, 0); + if (call->event.funcs) + __unregister_ftrace_event(&call->event); + list_del(&call->list); +} + +static int event_init(struct ftrace_event_call *call) +{ + int ret = 0; + + if (WARN_ON(!call->name)) + return -EINVAL; + + if (call->class->raw_init) { + ret = call->class->raw_init(call); + if (ret < 0 && ret != -ENOSYS) + pr_warn("Could not initialize trace events/%s\n", + call->name); + } + + return ret; +} + static int __trace_add_event_call(struct ftrace_event_call *call, struct module *mod, const struct file_operations *id, @@ -1209,19 +1234,9 @@ __trace_add_event_call(struct ftrace_event_call *call, struct module *mod, struct dentry *d_events; int ret; - /* The linker may leave blanks */ - if (!call->name) - return -EINVAL; - - if (call->class->raw_init) { - ret = call->class->raw_init(call); - if (ret < 0) { - if (ret != -ENOSYS) - pr_warning("Could not initialize trace events/%s\n", - call->name); - return ret; - } - } + ret = event_init(call); + if (ret < 0) + return ret; d_events = event_trace_events_dir(); if (!d_events) @@ -1272,13 +1287,10 @@ static void remove_subsystem_dir(const char *name) */ static void __trace_remove_event_call(struct ftrace_event_call *call) { - ftrace_event_enable_disable(call, 0); - if (call->event.funcs) - __unregister_ftrace_event(&call->event); - debugfs_remove_recursive(call->dir); - list_del(&call->list); + event_remove(call); trace_destroy_fields(call); destroy_preds(call); + debugfs_remove_recursive(call->dir); remove_subsystem_dir(call->class->system); } @@ -1450,15 +1462,43 @@ static __init int setup_trace_event(char *str) } __setup("trace_event=", 
setup_trace_event); +static __init int event_trace_enable(void) +{ + struct ftrace_event_call **iter, *call; + char *buf = bootup_event_buf; + char *token; + int ret; + + for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) { + + call = *iter; + ret = event_init(call); + if (!ret) + list_add(&call->list, &ftrace_events); + } + + while (true) { + token = strsep(&buf, ","); + + if (!token) + break; + if (!*token) + continue; + + ret = ftrace_set_clr_event(token, 1); + if (ret) + pr_warn("Failed to enable trace event: %s\n", token); + } + return 0; +} + static __init int event_trace_init(void) { - struct ftrace_event_call **call; + struct ftrace_event_call *call; struct dentry *d_tracer; struct dentry *entry; struct dentry *d_events; int ret; - char *buf = bootup_event_buf; - char *token; d_tracer = tracing_init_dentry(); if (!d_tracer) @@ -1497,24 +1537,19 @@ static __init int event_trace_init(void) if (trace_define_common_fields()) pr_warning("tracing: Failed to allocate common fields"); - for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { - __trace_add_event_call(*call, NULL, &ftrace_event_id_fops, + /* + * Early initialization already enabled ftrace event. + * Now it's only necessary to create the event directory. 
+ */ + list_for_each_entry(call, &ftrace_events, list) { + + ret = event_create_dir(call, d_events, + &ftrace_event_id_fops, &ftrace_enable_fops, &ftrace_event_filter_fops, &ftrace_event_format_fops); - } - - while (true) { - token = strsep(&buf, ","); - - if (!token) - break; - if (!*token) - continue; - - ret = ftrace_set_clr_event(token, 1); - if (ret) - pr_warning("Failed to enable trace event: %s\n", token); + if (ret < 0) + event_remove(call); } ret = register_module_notifier(&trace_module_nb); @@ -1523,6 +1558,7 @@ static __init int event_trace_init(void) return 0; } +core_initcall(event_trace_enable); fs_initcall(event_trace_init); #ifdef CONFIG_FTRACE_STARTUP_TEST @@ -1646,9 +1682,11 @@ static __init void event_trace_self_tests(void) event_test_stuff(); ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0); - if (WARN_ON_ONCE(ret)) + if (WARN_ON_ONCE(ret)) { pr_warning("error disabling system %s\n", system->name); + continue; + } pr_cont("OK\n"); } @@ -1681,7 +1719,8 @@ static __init void event_trace_self_tests(void) static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); static void -function_test_events_call(unsigned long ip, unsigned long parent_ip) +function_test_events_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) { struct ring_buffer_event *event; struct ring_buffer *buffer; @@ -1720,6 +1759,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_ops __initdata = { .func = function_test_events_call, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static __init void event_trace_self_test_with_function(void) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 431dba8b7542..c154797a7ff7 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -2002,7 +2002,7 @@ static int ftrace_function_set_regexp(struct ftrace_ops *ops, int filter, static int __ftrace_function_set_filter(int 
filter, char *buf, int len, struct function_filter_data *data) { - int i, re_cnt, ret; + int i, re_cnt, ret = -EINVAL; int *reset; char **re; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index a426f410c060..507a7a9630bf 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -13,7 +13,6 @@ #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/ftrace.h> -#include <linux/pstore.h> #include <linux/fs.h> #include "trace.h" @@ -49,7 +48,8 @@ static void function_trace_start(struct trace_array *tr) } static void -function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) +function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) { struct trace_array *tr = func_trace; struct trace_array_cpu *data; @@ -75,16 +75,17 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) preempt_enable_notrace(); } -/* Our two options */ +/* Our option */ enum { TRACE_FUNC_OPT_STACK = 0x1, - TRACE_FUNC_OPT_PSTORE = 0x2, }; static struct tracer_flags func_flags; static void -function_trace_call(unsigned long ip, unsigned long parent_ip) +function_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) + { struct trace_array *tr = func_trace; struct trace_array_cpu *data; @@ -106,12 +107,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { - /* - * So far tracing doesn't support multiple buffers, so - * we make an explicit call for now. 
- */ - if (unlikely(func_flags.val & TRACE_FUNC_OPT_PSTORE)) - pstore_ftrace_call(ip, parent_ip); pc = preempt_count(); trace_function(tr, ip, parent_ip, flags, pc); } @@ -121,7 +116,8 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) } static void -function_stack_trace_call(unsigned long ip, unsigned long parent_ip) +function_stack_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) { struct trace_array *tr = func_trace; struct trace_array_cpu *data; @@ -164,22 +160,19 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_ops __read_mostly = { .func = function_trace_call, - .flags = FTRACE_OPS_FL_GLOBAL, + .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, }; static struct ftrace_ops trace_stack_ops __read_mostly = { .func = function_stack_trace_call, - .flags = FTRACE_OPS_FL_GLOBAL, + .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, }; static struct tracer_opt func_opts[] = { #ifdef CONFIG_STACKTRACE { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, #endif -#ifdef CONFIG_PSTORE_FTRACE - { TRACER_OPT(func_pstore, TRACE_FUNC_OPT_PSTORE) }, -#endif { } /* Always set a last empty entry */ }; @@ -232,8 +225,6 @@ static int func_set_flag(u32 old_flags, u32 bit, int set) } break; - case TRACE_FUNC_OPT_PSTORE: - break; default: return -EINVAL; } diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index ce27c8ba8d31..99b4378393d5 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -143,7 +143,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, return; } -#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST +#if defined(CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST) && !defined(CC_USING_FENTRY) /* * The arch may choose to record the frame pointer used * and check it here to make sure that it is what we expect it @@ -154,6 +154,9 @@ 
ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, * * Currently, x86_32 with optimize for size (-Os) makes the latest * gcc do the above. + * + * Note, -mfentry does not use frame pointers, and this test + * is not needed if CC_USING_FENTRY is set. */ if (unlikely(current->ret_stack[index].fp != frame_pointer)) { ftrace_graph_stop(); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 99d20e920368..d98ee8283b29 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -136,7 +136,8 @@ static int func_prolog_dec(struct trace_array *tr, * irqsoff uses its own tracer function to keep the overhead down: */ static void -irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) +irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) { struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; @@ -153,7 +154,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_ops __read_mostly = { .func = irqsoff_tracer_call, - .flags = FTRACE_OPS_FL_GLOBAL, + .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, }; #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index ff791ea48b57..02170c00c413 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -108,7 +108,8 @@ out_enable: * wakeup uses its own tracer function to keep the overhead down: */ static void -wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) +wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) { struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; @@ -129,7 +130,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_ops __read_mostly = { .func = wakeup_tracer_call, - .flags = 
FTRACE_OPS_FL_GLOBAL, + .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, }; #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 288541f977fb..2c00a691a540 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -103,54 +103,67 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret) static int trace_selftest_test_probe1_cnt; static void trace_selftest_test_probe1_func(unsigned long ip, - unsigned long pip) + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) { trace_selftest_test_probe1_cnt++; } static int trace_selftest_test_probe2_cnt; static void trace_selftest_test_probe2_func(unsigned long ip, - unsigned long pip) + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) { trace_selftest_test_probe2_cnt++; } static int trace_selftest_test_probe3_cnt; static void trace_selftest_test_probe3_func(unsigned long ip, - unsigned long pip) + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) { trace_selftest_test_probe3_cnt++; } static int trace_selftest_test_global_cnt; static void trace_selftest_test_global_func(unsigned long ip, - unsigned long pip) + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) { trace_selftest_test_global_cnt++; } static int trace_selftest_test_dyn_cnt; static void trace_selftest_test_dyn_func(unsigned long ip, - unsigned long pip) + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) { trace_selftest_test_dyn_cnt++; } static struct ftrace_ops test_probe1 = { .func = trace_selftest_test_probe1_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static struct ftrace_ops test_probe2 = { .func = trace_selftest_test_probe2_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static struct ftrace_ops test_probe3 = { .func = trace_selftest_test_probe3_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static struct ftrace_ops 
test_global = { - .func = trace_selftest_test_global_func, - .flags = FTRACE_OPS_FL_GLOBAL, + .func = trace_selftest_test_global_func, + .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, }; static void print_counts(void) @@ -393,10 +406,253 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, return ret; } + +static int trace_selftest_recursion_cnt; +static void trace_selftest_test_recursion_func(unsigned long ip, + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) +{ + /* + * This function is registered without the recursion safe flag. + * The ftrace infrastructure should provide the recursion + * protection. If not, this will crash the kernel! + */ + trace_selftest_recursion_cnt++; + DYN_FTRACE_TEST_NAME(); +} + +static void trace_selftest_test_recursion_safe_func(unsigned long ip, + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) +{ + /* + * We said we would provide our own recursion. By calling + * this function again, we should recurse back into this function + * and count again. But this only happens if the arch supports + * all of ftrace features and nothing else is using the function + * tracing utility. + */ + if (trace_selftest_recursion_cnt++) + return; + DYN_FTRACE_TEST_NAME(); +} + +static struct ftrace_ops test_rec_probe = { + .func = trace_selftest_test_recursion_func, +}; + +static struct ftrace_ops test_recsafe_probe = { + .func = trace_selftest_test_recursion_safe_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, +}; + +static int +trace_selftest_function_recursion(void) +{ + int save_ftrace_enabled = ftrace_enabled; + int save_tracer_enabled = tracer_enabled; + char *func_name; + int len; + int ret; + int cnt; + + /* The previous test PASSED */ + pr_cont("PASSED\n"); + pr_info("Testing ftrace recursion: "); + + + /* enable tracing, and record the filter function */ + ftrace_enabled = 1; + tracer_enabled = 1; + + /* Handle PPC64 '.' 
name */ + func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); + len = strlen(func_name); + + ret = ftrace_set_filter(&test_rec_probe, func_name, len, 1); + if (ret) { + pr_cont("*Could not set filter* "); + goto out; + } + + ret = register_ftrace_function(&test_rec_probe); + if (ret) { + pr_cont("*could not register callback* "); + goto out; + } + + DYN_FTRACE_TEST_NAME(); + + unregister_ftrace_function(&test_rec_probe); + + ret = -1; + if (trace_selftest_recursion_cnt != 1) { + pr_cont("*callback not called once (%d)* ", + trace_selftest_recursion_cnt); + goto out; + } + + trace_selftest_recursion_cnt = 1; + + pr_cont("PASSED\n"); + pr_info("Testing ftrace recursion safe: "); + + ret = ftrace_set_filter(&test_recsafe_probe, func_name, len, 1); + if (ret) { + pr_cont("*Could not set filter* "); + goto out; + } + + ret = register_ftrace_function(&test_recsafe_probe); + if (ret) { + pr_cont("*could not register callback* "); + goto out; + } + + DYN_FTRACE_TEST_NAME(); + + unregister_ftrace_function(&test_recsafe_probe); + + /* + * If arch supports all ftrace features, and no other task + * was on the list, we should be fine. 
+ */ + if (!ftrace_nr_registered_ops() && !FTRACE_FORCE_LIST_FUNC) + cnt = 2; /* Should have recursed */ + else + cnt = 1; + + ret = -1; + if (trace_selftest_recursion_cnt != cnt) { + pr_cont("*callback not called expected %d times (%d)* ", + cnt, trace_selftest_recursion_cnt); + goto out; + } + + ret = 0; +out: + ftrace_enabled = save_ftrace_enabled; + tracer_enabled = save_tracer_enabled; + + return ret; +} #else # define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) +# define trace_selftest_function_recursion() ({ 0; }) #endif /* CONFIG_DYNAMIC_FTRACE */ +static enum { + TRACE_SELFTEST_REGS_START, + TRACE_SELFTEST_REGS_FOUND, + TRACE_SELFTEST_REGS_NOT_FOUND, +} trace_selftest_regs_stat; + +static void trace_selftest_test_regs_func(unsigned long ip, + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) +{ + if (pt_regs) + trace_selftest_regs_stat = TRACE_SELFTEST_REGS_FOUND; + else + trace_selftest_regs_stat = TRACE_SELFTEST_REGS_NOT_FOUND; +} + +static struct ftrace_ops test_regs_probe = { + .func = trace_selftest_test_regs_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_SAVE_REGS, +}; + +static int +trace_selftest_function_regs(void) +{ + int save_ftrace_enabled = ftrace_enabled; + int save_tracer_enabled = tracer_enabled; + char *func_name; + int len; + int ret; + int supported = 0; + +#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS + supported = 1; +#endif + + /* The previous test PASSED */ + pr_cont("PASSED\n"); + pr_info("Testing ftrace regs%s: ", + !supported ? "(no arch support)" : ""); + + /* enable tracing, and record the filter function */ + ftrace_enabled = 1; + tracer_enabled = 1; + + /* Handle PPC64 '.' name */ + func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); + len = strlen(func_name); + + ret = ftrace_set_filter(&test_regs_probe, func_name, len, 1); + /* + * If DYNAMIC_FTRACE is not set, then we just trace all functions. + * This test really doesn't care. 
+ */ + if (ret && ret != -ENODEV) { + pr_cont("*Could not set filter* "); + goto out; + } + + ret = register_ftrace_function(&test_regs_probe); + /* + * Now if the arch does not support passing regs, then this should + * have failed. + */ + if (!supported) { + if (!ret) { + pr_cont("*registered save-regs without arch support* "); + goto out; + } + test_regs_probe.flags |= FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED; + ret = register_ftrace_function(&test_regs_probe); + } + if (ret) { + pr_cont("*could not register callback* "); + goto out; + } + + + DYN_FTRACE_TEST_NAME(); + + unregister_ftrace_function(&test_regs_probe); + + ret = -1; + + switch (trace_selftest_regs_stat) { + case TRACE_SELFTEST_REGS_START: + pr_cont("*callback never called* "); + goto out; + + case TRACE_SELFTEST_REGS_FOUND: + if (supported) + break; + pr_cont("*callback received regs without arch support* "); + goto out; + + case TRACE_SELFTEST_REGS_NOT_FOUND: + if (!supported) + break; + pr_cont("*callback received NULL regs* "); + goto out; + } + + ret = 0; +out: + ftrace_enabled = save_ftrace_enabled; + tracer_enabled = save_tracer_enabled; + + return ret; +} + /* * Simple verification test of ftrace function tracer. 
* Enable ftrace, sleep 1/10 second, and then read the trace @@ -442,7 +698,14 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) ret = trace_selftest_startup_dynamic_tracing(trace, tr, DYN_FTRACE_TEST_NAME); + if (ret) + goto out; + ret = trace_selftest_function_recursion(); + if (ret) + goto out; + + ret = trace_selftest_function_regs(); out: ftrace_enabled = save_ftrace_enabled; tracer_enabled = save_tracer_enabled; @@ -778,6 +1041,8 @@ static int trace_wakeup_test_thread(void *data) set_current_state(TASK_INTERRUPTIBLE); schedule(); + complete(x); + /* we are awake, now wait to disappear */ while (!kthread_should_stop()) { /* @@ -821,24 +1086,21 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) /* reset the max latency */ tracing_max_latency = 0; - /* sleep to let the RT thread sleep too */ - msleep(100); + while (p->on_rq) { + /* + * Sleep to make sure the RT thread is asleep too. + * On virtual machines we can't rely on timings, + * but we want to make sure this test still works. + */ + msleep(100); + } - /* - * Yes this is slightly racy. It is possible that for some - * strange reason that the RT thread we created, did not - * call schedule for 100ms after doing the completion, - * and we do a wakeup on a task that already is awake. - * But that is extremely unlikely, and the worst thing that - * happens in such a case, is that we disable tracing. - * Honestly, if this race does happen something is horrible - * wrong with the system. - */ + init_completion(&isrt); wake_up_process(p); - /* give a little time to let the thread wake up */ - msleep(100); + /* Wait for the task to wake up */ + wait_for_completion(&isrt); /* stop the tracing. 
*/ tracing_stop(); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index d4545f49242e..0c1b165778e5 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -111,7 +111,8 @@ static inline void check_stack(void) } static void -stack_trace_call(unsigned long ip, unsigned long parent_ip) +stack_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) { int cpu; @@ -136,6 +137,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_ops __read_mostly = { .func = stack_trace_call, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static ssize_t diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 6b245f64c8dd..2485a7d09b11 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -487,7 +487,7 @@ int __init init_ftrace_syscalls(void) return 0; } -core_initcall(init_ftrace_syscalls); +early_initcall(init_ftrace_syscalls); #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 23b4d784ebdd..625df0b44690 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -26,7 +26,9 @@ /* * fill in basic accounting fields */ -void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) +void bacct_add_tsk(struct user_namespace *user_ns, + struct pid_namespace *pid_ns, + struct taskstats *stats, struct task_struct *tsk) { const struct cred *tcred; struct timespec uptime, ts; @@ -55,13 +57,13 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) stats->ac_flag |= AXSIG; stats->ac_nice = task_nice(tsk); stats->ac_sched = tsk->policy; - stats->ac_pid = tsk->pid; + stats->ac_pid = task_pid_nr_ns(tsk, pid_ns); rcu_read_lock(); tcred = __task_cred(tsk); - stats->ac_uid = tcred->uid; - stats->ac_gid = tcred->gid; + stats->ac_uid = from_kuid_munged(user_ns, tcred->uid); + stats->ac_gid = from_kgid_munged(user_ns, tcred->gid); stats->ac_ppid = pid_alive(tsk) ? 
- rcu_dereference(tsk->real_parent)->tgid : 0; + task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0; rcu_read_unlock(); stats->ac_utime = cputime_to_usecs(tsk->utime); stats->ac_stime = cputime_to_usecs(tsk->stime); diff --git a/kernel/user.c b/kernel/user.c index b815fefbe76f..750acffbe9ec 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -38,6 +38,14 @@ struct user_namespace init_user_ns = { .count = 4294967295U, }, }, + .projid_map = { + .nr_extents = 1, + .extent[0] = { + .first = 0, + .lower_first = 0, + .count = 4294967295U, + }, + }, .kref = { .refcount = ATOMIC_INIT(3), }, diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 86602316422d..456a6b9fba34 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -19,6 +19,7 @@ #include <linux/fs.h> #include <linux/uaccess.h> #include <linux/ctype.h> +#include <linux/projid.h> static struct kmem_cache *user_ns_cachep __read_mostly; @@ -295,6 +296,75 @@ gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid) } EXPORT_SYMBOL(from_kgid_munged); +/** + * make_kprojid - Map a user-namespace projid pair into a kprojid. + * @ns: User namespace that the projid is in + * @projid: Project identifier + * + * Maps a user-namespace projid pair into a kernel internal kprojid, + * and returns that kprojid. + * + * When there is no mapping defined for the user-namespace projid + * pair INVALID_PROJID is returned. Callers are expected to test + * for and handle INVALID_PROJID being returned. INVALID_PROJID + * may be tested for using projid_valid(). + */ +kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid) +{ + /* Map the projid to a global kernel projid */ + return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid)); +} +EXPORT_SYMBOL(make_kprojid); + +/** + * from_kprojid - Create a projid from a kprojid user-namespace pair. + * @targ: The user namespace we want a projid in. + * @kprojid: The kernel internal project identifier to start with. 
+ * + * Map @kprojid into the user-namespace specified by @targ and + * return the resulting projid. + * + * There is always a mapping into the initial user_namespace. + * + * If @kprojid has no mapping in @targ (projid_t)-1 is returned. + */ +projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid) +{ + /* Map the projid from a global kernel projid */ + return map_id_up(&targ->projid_map, __kprojid_val(kprojid)); +} +EXPORT_SYMBOL(from_kprojid); + +/** + * from_kprojid_munged - Create a projid from a kprojid user-namespace pair. + * @targ: The user namespace we want a projid in. + * @kprojid: The kernel internal projid to start with. + * + * Map @kprojid into the user-namespace specified by @targ and + * return the resulting projid. + * + * There is always a mapping into the initial user_namespace. + * + * Unlike from_kprojid from_kprojid_munged never fails and always + * returns a valid projid. This makes from_kprojid_munged + * appropriate for use in syscalls like stat where + * failing the system call and failing to provide a valid projid are + * not options. + * + * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned. 
+ */ +projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid) +{ + projid_t projid; + projid = from_kprojid(targ, kprojid); + + if (projid == (projid_t) -1) + projid = OVERFLOW_PROJID; + return projid; +} +EXPORT_SYMBOL(from_kprojid_munged); + + static int uid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; @@ -337,6 +407,27 @@ static int gid_m_show(struct seq_file *seq, void *v) return 0; } +static int projid_m_show(struct seq_file *seq, void *v) +{ + struct user_namespace *ns = seq->private; + struct uid_gid_extent *extent = v; + struct user_namespace *lower_ns; + projid_t lower; + + lower_ns = seq_user_ns(seq); + if ((lower_ns == ns) && lower_ns->parent) + lower_ns = lower_ns->parent; + + lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first)); + + seq_printf(seq, "%10u %10u %10u\n", + extent->first, + lower, + extent->count); + + return 0; +} + static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) { struct uid_gid_extent *extent = NULL; @@ -362,6 +453,13 @@ static void *gid_m_start(struct seq_file *seq, loff_t *ppos) return m_start(seq, ppos, &ns->gid_map); } +static void *projid_m_start(struct seq_file *seq, loff_t *ppos) +{ + struct user_namespace *ns = seq->private; + + return m_start(seq, ppos, &ns->projid_map); +} + static void *m_next(struct seq_file *seq, void *v, loff_t *pos) { (*pos)++; @@ -387,6 +485,13 @@ struct seq_operations proc_gid_seq_operations = { .show = gid_m_show, }; +struct seq_operations proc_projid_seq_operations = { + .start = projid_m_start, + .stop = m_stop, + .next = m_next, + .show = projid_m_show, +}; + static DEFINE_MUTEX(id_map_mutex); static ssize_t map_write(struct file *file, const char __user *buf, @@ -434,7 +539,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, /* Require the appropriate privilege CAP_SETUID or CAP_SETGID * over the user namespace in order to set the id mapping. 
*/ - if (!ns_capable(ns, cap_setid)) + if (cap_valid(cap_setid) && !ns_capable(ns, cap_setid)) goto out; /* Get a buffer */ @@ -584,9 +689,30 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz &ns->gid_map, &ns->parent->gid_map); } +ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) +{ + struct seq_file *seq = file->private_data; + struct user_namespace *ns = seq->private; + struct user_namespace *seq_ns = seq_user_ns(seq); + + if (!ns->parent) + return -EPERM; + + if ((seq_ns != ns) && (seq_ns != ns->parent)) + return -EPERM; + + /* Anyone can set any valid project id no capability needed */ + return map_write(file, buf, size, ppos, -1, + &ns->projid_map, &ns->parent->projid_map); +} + static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { + /* Allow anyone to set a mapping that doesn't require privilege */ + if (!cap_valid(cap_setid)) + return true; + /* Allow the specified ids if we have the appropriate capability * (CAP_SETUID or CAP_SETGID) over the parent user namespace. */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3c5a79e2134c..d951daa0ca9a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -58,7 +58,7 @@ enum { * be executing on any CPU. The gcwq behaves as an unbound one. * * Note that DISASSOCIATED can be flipped only while holding - * managership of all pools on the gcwq to avoid changing binding + * assoc_mutex of all pools on the gcwq to avoid changing binding * state while create_worker() is in progress. 
*/ GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */ @@ -73,11 +73,10 @@ enum { WORKER_DIE = 1 << 1, /* die die die */ WORKER_IDLE = 1 << 2, /* is idle */ WORKER_PREP = 1 << 3, /* preparing to run works */ - WORKER_REBIND = 1 << 5, /* mom is home, come back */ WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ WORKER_UNBOUND = 1 << 7, /* worker is unbound */ - WORKER_NOT_RUNNING = WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND | + WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND | WORKER_CPU_INTENSIVE, NR_WORKER_POOLS = 2, /* # worker pools per gcwq */ @@ -126,7 +125,6 @@ enum { struct global_cwq; struct worker_pool; -struct idle_rebind; /* * The poor guys doing the actual heavy lifting. All on-duty workers @@ -150,7 +148,6 @@ struct worker { int id; /* I: worker id */ /* for rebinding worker to CPU */ - struct idle_rebind *idle_rebind; /* L: for idle worker */ struct work_struct rebind_work; /* L: for busy worker */ }; @@ -160,13 +157,15 @@ struct worker_pool { struct list_head worklist; /* L: list of pending works */ int nr_workers; /* L: total number of workers */ + + /* nr_idle includes the ones off idle_list for rebinding */ int nr_idle; /* L: currently idle ones */ struct list_head idle_list; /* X: list of idle workers */ struct timer_list idle_timer; /* L: worker idle timeout */ struct timer_list mayday_timer; /* L: SOS timer for workers */ - struct mutex manager_mutex; /* mutex manager should hold */ + struct mutex assoc_mutex; /* protect GCWQ_DISASSOCIATED */ struct ida worker_ida; /* L: for worker IDs */ }; @@ -184,9 +183,8 @@ struct global_cwq { struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; /* L: hash of busy workers */ - struct worker_pool pools[2]; /* normal and highpri pools */ - - wait_queue_head_t rebind_hold; /* rebind hold wait */ + struct worker_pool pools[NR_WORKER_POOLS]; + /* normal and highpri pools */ } ____cacheline_aligned_in_smp; /* @@ -269,17 +267,15 @@ struct workqueue_struct { }; struct workqueue_struct *system_wq 
__read_mostly; -struct workqueue_struct *system_long_wq __read_mostly; -struct workqueue_struct *system_nrt_wq __read_mostly; -struct workqueue_struct *system_unbound_wq __read_mostly; -struct workqueue_struct *system_freezable_wq __read_mostly; -struct workqueue_struct *system_nrt_freezable_wq __read_mostly; EXPORT_SYMBOL_GPL(system_wq); +struct workqueue_struct *system_highpri_wq __read_mostly; +EXPORT_SYMBOL_GPL(system_highpri_wq); +struct workqueue_struct *system_long_wq __read_mostly; EXPORT_SYMBOL_GPL(system_long_wq); -EXPORT_SYMBOL_GPL(system_nrt_wq); +struct workqueue_struct *system_unbound_wq __read_mostly; EXPORT_SYMBOL_GPL(system_unbound_wq); +struct workqueue_struct *system_freezable_wq __read_mostly; EXPORT_SYMBOL_GPL(system_freezable_wq); -EXPORT_SYMBOL_GPL(system_nrt_freezable_wq); #define CREATE_TRACE_POINTS #include <trace/events/workqueue.h> @@ -534,18 +530,24 @@ static int work_next_color(int color) } /* - * A work's data points to the cwq with WORK_STRUCT_CWQ set while the - * work is on queue. Once execution starts, WORK_STRUCT_CWQ is - * cleared and the work data contains the cpu number it was last on. + * While queued, %WORK_STRUCT_CWQ is set and non flag bits of a work's data + * contain the pointer to the queued cwq. Once execution starts, the flag + * is cleared and the high bits contain OFFQ flags and CPU number. * - * set_work_{cwq|cpu}() and clear_work_data() can be used to set the - * cwq, cpu or clear work->data. These functions should only be - * called while the work is owned - ie. while the PENDING bit is set. + * set_work_cwq(), set_work_cpu_and_clear_pending(), mark_work_canceling() + * and clear_work_data() can be used to set the cwq, cpu or clear + * work->data. These functions should only be called while the work is + * owned - ie. while the PENDING bit is set. * - * get_work_[g]cwq() can be used to obtain the gcwq or cwq - * corresponding to a work. 
gcwq is available once the work has been - * queued anywhere after initialization. cwq is available only from - * queueing until execution starts. + * get_work_[g]cwq() can be used to obtain the gcwq or cwq corresponding to + * a work. gcwq is available once the work has been queued anywhere after + * initialization until it is sync canceled. cwq is available only while + * the work item is queued. + * + * %WORK_OFFQ_CANCELING is used to mark a work item which is being + * canceled. While being canceled, a work item may have its PENDING set + * but stay off timer and worklist for arbitrarily long and nobody should + * try to steal the PENDING bit. */ static inline void set_work_data(struct work_struct *work, unsigned long data, unsigned long flags) @@ -562,13 +564,22 @@ static void set_work_cwq(struct work_struct *work, WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); } -static void set_work_cpu(struct work_struct *work, unsigned int cpu) +static void set_work_cpu_and_clear_pending(struct work_struct *work, + unsigned int cpu) { - set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING); + /* + * The following wmb is paired with the implied mb in + * test_and_set_bit(PENDING) and ensures all updates to @work made + * here are visible to and precede any updates by the next PENDING + * owner. 
+ */ + smp_wmb(); + set_work_data(work, (unsigned long)cpu << WORK_OFFQ_CPU_SHIFT, 0); } static void clear_work_data(struct work_struct *work) { + smp_wmb(); /* see set_work_cpu_and_clear_pending() */ set_work_data(work, WORK_STRUCT_NO_CPU, 0); } @@ -591,7 +602,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) return ((struct cpu_workqueue_struct *) (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq; - cpu = data >> WORK_STRUCT_FLAG_BITS; + cpu = data >> WORK_OFFQ_CPU_SHIFT; if (cpu == WORK_CPU_NONE) return NULL; @@ -599,6 +610,22 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) return get_gcwq(cpu); } +static void mark_work_canceling(struct work_struct *work) +{ + struct global_cwq *gcwq = get_work_gcwq(work); + unsigned long cpu = gcwq ? gcwq->cpu : WORK_CPU_NONE; + + set_work_data(work, (cpu << WORK_OFFQ_CPU_SHIFT) | WORK_OFFQ_CANCELING, + WORK_STRUCT_PENDING); +} + +static bool work_is_canceling(struct work_struct *work) +{ + unsigned long data = atomic_long_read(&work->data); + + return !(data & WORK_STRUCT_CWQ) && (data & WORK_OFFQ_CANCELING); +} + /* * Policy functions. These define the policies on how the global worker * pools are managed. Unless noted otherwise, these functions assume that @@ -657,6 +684,13 @@ static bool too_many_workers(struct worker_pool *pool) int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ int nr_busy = pool->nr_workers - nr_idle; + /* + * nr_idle and idle_list may disagree if idle rebinding is in + * progress. Never return %true if idle_list is empty. 
+ */ + if (list_empty(&pool->idle_list)) + return false; + return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; } @@ -903,6 +937,206 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq, } /** + * move_linked_works - move linked works to a list + * @work: start of series of works to be scheduled + * @head: target list to append @work to + * @nextp: out parameter for nested worklist walking + * + * Schedule linked works starting from @work to @head. Work series to + * be scheduled starts at @work and includes any consecutive work with + * WORK_STRUCT_LINKED set in its predecessor. + * + * If @nextp is not NULL, it's updated to point to the next work of + * the last scheduled work. This allows move_linked_works() to be + * nested inside outer list_for_each_entry_safe(). + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void move_linked_works(struct work_struct *work, struct list_head *head, + struct work_struct **nextp) +{ + struct work_struct *n; + + /* + * Linked worklist will always end before the end of the list, + * use NULL for list head. + */ + list_for_each_entry_safe_from(work, n, NULL, entry) { + list_move_tail(&work->entry, head); + if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) + break; + } + + /* + * If we're already inside safe list traversal and have moved + * multiple works to the scheduled queue, the next position + * needs to be updated. 
+ */ + if (nextp) + *nextp = n; +} + +static void cwq_activate_delayed_work(struct work_struct *work) +{ + struct cpu_workqueue_struct *cwq = get_work_cwq(work); + + trace_workqueue_activate_work(work); + move_linked_works(work, &cwq->pool->worklist, NULL); + __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); + cwq->nr_active++; +} + +static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) +{ + struct work_struct *work = list_first_entry(&cwq->delayed_works, + struct work_struct, entry); + + cwq_activate_delayed_work(work); +} + +/** + * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight + * @cwq: cwq of interest + * @color: color of work which left the queue + * + * A work either has completed or is removed from pending queue, + * decrement nr_in_flight of its cwq and handle workqueue flushing. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color) +{ + /* ignore uncolored works */ + if (color == WORK_NO_COLOR) + return; + + cwq->nr_in_flight[color]--; + + cwq->nr_active--; + if (!list_empty(&cwq->delayed_works)) { + /* one down, submit a delayed one */ + if (cwq->nr_active < cwq->max_active) + cwq_activate_first_delayed(cwq); + } + + /* is flush in progress and are we at the flushing tip? */ + if (likely(cwq->flush_color != color)) + return; + + /* are there still in-flight works? */ + if (cwq->nr_in_flight[color]) + return; + + /* this cwq is done, clear flush_color */ + cwq->flush_color = -1; + + /* + * If this was the last cwq, wake up the first flusher. It + * will handle the rest. + */ + if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush)) + complete(&cwq->wq->first_flusher->done); +} + +/** + * try_to_grab_pending - steal work item from worklist and disable irq + * @work: work item to steal + * @is_dwork: @work is a delayed_work + * @flags: place to store irq state + * + * Try to grab PENDING bit of @work. 
This function can handle @work in any + * stable state - idle, on timer or on worklist. Return values are + * + * 1 if @work was pending and we successfully stole PENDING + * 0 if @work was idle and we claimed PENDING + * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry + * -ENOENT if someone else is canceling @work, this state may persist + * for arbitrarily long + * + * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting + * interrupted while holding PENDING and @work off queue, irq must be + * disabled on entry. This, combined with delayed_work->timer being + * irqsafe, ensures that we return -EAGAIN for finite short period of time. + * + * On successful return, >= 0, irq is disabled and the caller is + * responsible for releasing it using local_irq_restore(*@flags). + * + * This function is safe to call from any context including IRQ handler. + */ +static int try_to_grab_pending(struct work_struct *work, bool is_dwork, + unsigned long *flags) +{ + struct global_cwq *gcwq; + + local_irq_save(*flags); + + /* try to steal the timer if it exists */ + if (is_dwork) { + struct delayed_work *dwork = to_delayed_work(work); + + /* + * dwork->timer is irqsafe. If del_timer() fails, it's + * guaranteed that the timer is not queued anywhere and not + * running on the local CPU. + */ + if (likely(del_timer(&dwork->timer))) + return 1; + } + + /* try to claim PENDING the normal way */ + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) + return 0; + + /* + * The queueing is in progress, or it is already queued. Try to + * steal it from ->worklist without clearing WORK_STRUCT_PENDING. + */ + gcwq = get_work_gcwq(work); + if (!gcwq) + goto fail; + + spin_lock(&gcwq->lock); + if (!list_empty(&work->entry)) { + /* + * This work is queued, but perhaps we locked the wrong gcwq. + * In that case we must see the new value after rmb(), see + * insert_work()->wmb(). 
+ */ + smp_rmb(); + if (gcwq == get_work_gcwq(work)) { + debug_work_deactivate(work); + + /* + * A delayed work item cannot be grabbed directly + * because it might have linked NO_COLOR work items + * which, if left on the delayed_list, will confuse + * cwq->nr_active management later on and cause + * stall. Make sure the work item is activated + * before grabbing. + */ + if (*work_data_bits(work) & WORK_STRUCT_DELAYED) + cwq_activate_delayed_work(work); + + list_del_init(&work->entry); + cwq_dec_nr_in_flight(get_work_cwq(work), + get_work_color(work)); + + spin_unlock(&gcwq->lock); + return 1; + } + } + spin_unlock(&gcwq->lock); +fail: + local_irq_restore(*flags); + if (work_is_canceling(work)) + return -ENOENT; + cpu_relax(); + return -EAGAIN; +} + +/** * insert_work - insert a work into gcwq * @cwq: cwq @work belongs to * @work: work to insert @@ -982,7 +1216,15 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, struct cpu_workqueue_struct *cwq; struct list_head *worklist; unsigned int work_flags; - unsigned long flags; + unsigned int req_cpu = cpu; + + /* + * While a work item is PENDING && off queue, a task trying to + * steal the PENDING will busy-loop waiting for it to either get + * queued or lose PENDING. Grabbing PENDING and queueing should + * happen with IRQ disabled. + */ + WARN_ON_ONCE(!irqs_disabled()); debug_work_activate(work); @@ -995,21 +1237,22 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, if (!(wq->flags & WQ_UNBOUND)) { struct global_cwq *last_gcwq; - if (unlikely(cpu == WORK_CPU_UNBOUND)) + if (cpu == WORK_CPU_UNBOUND) cpu = raw_smp_processor_id(); /* - * It's multi cpu. If @wq is non-reentrant and @work - * was previously on a different cpu, it might still - * be running there, in which case the work needs to - * be queued on that cpu to guarantee non-reentrance. + * It's multi cpu. 
If @work was previously on a different + * cpu, it might still be running there, in which case the + * work needs to be queued on that cpu to guarantee + * non-reentrancy. */ gcwq = get_gcwq(cpu); - if (wq->flags & WQ_NON_REENTRANT && - (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) { + last_gcwq = get_work_gcwq(work); + + if (last_gcwq && last_gcwq != gcwq) { struct worker *worker; - spin_lock_irqsave(&last_gcwq->lock, flags); + spin_lock(&last_gcwq->lock); worker = find_worker_executing_work(last_gcwq, work); @@ -1017,22 +1260,23 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, gcwq = last_gcwq; else { /* meh... not running there, queue here */ - spin_unlock_irqrestore(&last_gcwq->lock, flags); - spin_lock_irqsave(&gcwq->lock, flags); + spin_unlock(&last_gcwq->lock); + spin_lock(&gcwq->lock); } - } else - spin_lock_irqsave(&gcwq->lock, flags); + } else { + spin_lock(&gcwq->lock); + } } else { gcwq = get_gcwq(WORK_CPU_UNBOUND); - spin_lock_irqsave(&gcwq->lock, flags); + spin_lock(&gcwq->lock); } /* gcwq determined, get cwq and queue */ cwq = get_cwq(gcwq->cpu, wq); - trace_workqueue_queue_work(cpu, cwq, work); + trace_workqueue_queue_work(req_cpu, cwq, work); if (WARN_ON(!list_empty(&work->entry))) { - spin_unlock_irqrestore(&gcwq->lock, flags); + spin_unlock(&gcwq->lock); return; } @@ -1050,79 +1294,110 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, insert_work(cwq, work, worklist, work_flags); - spin_unlock_irqrestore(&gcwq->lock, flags); + spin_unlock(&gcwq->lock); } /** - * queue_work - queue work on a workqueue + * queue_work_on - queue work on specific cpu + * @cpu: CPU number to execute work on * @wq: workqueue to use * @work: work to queue * - * Returns 0 if @work was already on a queue, non-zero otherwise. + * Returns %false if @work was already on a queue, %true otherwise. * - * We queue the work to the CPU on which it was submitted, but if the CPU dies - * it can be processed by another CPU. 
+ * We queue the work to a specific CPU, the caller must ensure it + * can't go away. */ -int queue_work(struct workqueue_struct *wq, struct work_struct *work) +bool queue_work_on(int cpu, struct workqueue_struct *wq, + struct work_struct *work) { - int ret; + bool ret = false; + unsigned long flags; - ret = queue_work_on(get_cpu(), wq, work); - put_cpu(); + local_irq_save(flags); + + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + __queue_work(cpu, wq, work); + ret = true; + } + local_irq_restore(flags); return ret; } -EXPORT_SYMBOL_GPL(queue_work); +EXPORT_SYMBOL_GPL(queue_work_on); /** - * queue_work_on - queue work on specific cpu - * @cpu: CPU number to execute work on + * queue_work - queue work on a workqueue * @wq: workqueue to use * @work: work to queue * - * Returns 0 if @work was already on a queue, non-zero otherwise. + * Returns %false if @work was already on a queue, %true otherwise. * - * We queue the work to a specific CPU, the caller must ensure it - * can't go away. + * We queue the work to the CPU on which it was submitted, but if the CPU dies + * it can be processed by another CPU. 
*/ -int -queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) +bool queue_work(struct workqueue_struct *wq, struct work_struct *work) { - int ret = 0; - - if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { - __queue_work(cpu, wq, work); - ret = 1; - } - return ret; + return queue_work_on(WORK_CPU_UNBOUND, wq, work); } -EXPORT_SYMBOL_GPL(queue_work_on); +EXPORT_SYMBOL_GPL(queue_work); -static void delayed_work_timer_fn(unsigned long __data) +void delayed_work_timer_fn(unsigned long __data) { struct delayed_work *dwork = (struct delayed_work *)__data; struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work); - __queue_work(smp_processor_id(), cwq->wq, &dwork->work); + /* should have been called from irqsafe timer with irq already off */ + __queue_work(dwork->cpu, cwq->wq, &dwork->work); } +EXPORT_SYMBOL_GPL(delayed_work_timer_fn); -/** - * queue_delayed_work - queue work on a workqueue after delay - * @wq: workqueue to use - * @dwork: delayable work to queue - * @delay: number of jiffies to wait before queueing - * - * Returns 0 if @work was already on a queue, non-zero otherwise. - */ -int queue_delayed_work(struct workqueue_struct *wq, - struct delayed_work *dwork, unsigned long delay) +static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, + struct delayed_work *dwork, unsigned long delay) { - if (delay == 0) - return queue_work(wq, &dwork->work); + struct timer_list *timer = &dwork->timer; + struct work_struct *work = &dwork->work; + unsigned int lcpu; + + WARN_ON_ONCE(timer->function != delayed_work_timer_fn || + timer->data != (unsigned long)dwork); + BUG_ON(timer_pending(timer)); + BUG_ON(!list_empty(&work->entry)); + + timer_stats_timer_set_start_info(&dwork->timer); + + /* + * This stores cwq for the moment, for the timer_fn. Note that the + * work's gcwq is preserved to allow reentrance detection for + * delayed works. 
+ */ + if (!(wq->flags & WQ_UNBOUND)) { + struct global_cwq *gcwq = get_work_gcwq(work); - return queue_delayed_work_on(-1, wq, dwork, delay); + /* + * If we cannot get the last gcwq from @work directly, + * select the last CPU such that it avoids unnecessarily + * triggering non-reentrancy check in __queue_work(). + */ + lcpu = cpu; + if (gcwq) + lcpu = gcwq->cpu; + if (lcpu == WORK_CPU_UNBOUND) + lcpu = raw_smp_processor_id(); + } else { + lcpu = WORK_CPU_UNBOUND; + } + + set_work_cwq(work, get_cwq(lcpu, wq), 0); + + dwork->cpu = cpu; + timer->expires = jiffies + delay; + + if (unlikely(cpu != WORK_CPU_UNBOUND)) + add_timer_on(timer, cpu); + else + add_timer(timer); } -EXPORT_SYMBOL_GPL(queue_delayed_work); /** * queue_delayed_work_on - queue work on specific CPU after delay @@ -1131,53 +1406,100 @@ EXPORT_SYMBOL_GPL(queue_delayed_work); * @dwork: work to queue * @delay: number of jiffies to wait before queueing * - * Returns 0 if @work was already on a queue, non-zero otherwise. + * Returns %false if @work was already on a queue, %true otherwise. If + * @delay is zero and @dwork is idle, it will be scheduled for immediate + * execution. 
*/ -int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, - struct delayed_work *dwork, unsigned long delay) +bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, + struct delayed_work *dwork, unsigned long delay) { - int ret = 0; - struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; + bool ret = false; + unsigned long flags; - if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { - unsigned int lcpu; + if (!delay) + return queue_work_on(cpu, wq, &dwork->work); - BUG_ON(timer_pending(timer)); - BUG_ON(!list_empty(&work->entry)); + /* read the comment in __queue_work() */ + local_irq_save(flags); - timer_stats_timer_set_start_info(&dwork->timer); + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + __queue_delayed_work(cpu, wq, dwork, delay); + ret = true; + } - /* - * This stores cwq for the moment, for the timer_fn. - * Note that the work's gcwq is preserved to allow - * reentrance detection for delayed works. - */ - if (!(wq->flags & WQ_UNBOUND)) { - struct global_cwq *gcwq = get_work_gcwq(work); + local_irq_restore(flags); + return ret; +} +EXPORT_SYMBOL_GPL(queue_delayed_work_on); - if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND) - lcpu = gcwq->cpu; - else - lcpu = raw_smp_processor_id(); - } else - lcpu = WORK_CPU_UNBOUND; +/** + * queue_delayed_work - queue work on a workqueue after delay + * @wq: workqueue to use + * @dwork: delayable work to queue + * @delay: number of jiffies to wait before queueing + * + * Equivalent to queue_delayed_work_on() but tries to use the local CPU. 
+ */ +bool queue_delayed_work(struct workqueue_struct *wq, + struct delayed_work *dwork, unsigned long delay) +{ + return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); +} +EXPORT_SYMBOL_GPL(queue_delayed_work); - set_work_cwq(work, get_cwq(lcpu, wq), 0); +/** + * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU + * @cpu: CPU number to execute work on + * @wq: workqueue to use + * @dwork: work to queue + * @delay: number of jiffies to wait before queueing + * + * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise, + * modify @dwork's timer so that it expires after @delay. If @delay is + * zero, @work is guaranteed to be scheduled immediately regardless of its + * current state. + * + * Returns %false if @dwork was idle and queued, %true if @dwork was + * pending and its timer was modified. + * + * This function is safe to call from any context including IRQ handler. + * See try_to_grab_pending() for details. + */ +bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, + struct delayed_work *dwork, unsigned long delay) +{ + unsigned long flags; + int ret; - timer->expires = jiffies + delay; - timer->data = (unsigned long)dwork; - timer->function = delayed_work_timer_fn; + do { + ret = try_to_grab_pending(&dwork->work, true, &flags); + } while (unlikely(ret == -EAGAIN)); - if (unlikely(cpu >= 0)) - add_timer_on(timer, cpu); - else - add_timer(timer); - ret = 1; + if (likely(ret >= 0)) { + __queue_delayed_work(cpu, wq, dwork, delay); + local_irq_restore(flags); } + + /* -ENOENT from try_to_grab_pending() becomes %true */ return ret; } -EXPORT_SYMBOL_GPL(queue_delayed_work_on); +EXPORT_SYMBOL_GPL(mod_delayed_work_on); + +/** + * mod_delayed_work - modify delay of or queue a delayed work + * @wq: workqueue to use + * @dwork: work to queue + * @delay: number of jiffies to wait before queueing + * + * mod_delayed_work_on() on local CPU. 
+ */ +bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, + unsigned long delay) +{ + return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); +} +EXPORT_SYMBOL_GPL(mod_delayed_work); /** * worker_enter_idle - enter idle state @@ -1305,37 +1627,21 @@ __acquires(&gcwq->lock) } } -struct idle_rebind { - int cnt; /* # workers to be rebound */ - struct completion done; /* all workers rebound */ -}; - /* - * Rebind an idle @worker to its CPU. During CPU onlining, this has to - * happen synchronously for idle workers. worker_thread() will test - * %WORKER_REBIND before leaving idle and call this function. + * Rebind an idle @worker to its CPU. worker_thread() will test + * list_empty(@worker->entry) before leaving idle and call this function. */ static void idle_worker_rebind(struct worker *worker) { struct global_cwq *gcwq = worker->pool->gcwq; - /* CPU must be online at this point */ - WARN_ON(!worker_maybe_bind_and_lock(worker)); - if (!--worker->idle_rebind->cnt) - complete(&worker->idle_rebind->done); - spin_unlock_irq(&worker->pool->gcwq->lock); + /* CPU may go down again inbetween, clear UNBOUND only on success */ + if (worker_maybe_bind_and_lock(worker)) + worker_clr_flags(worker, WORKER_UNBOUND); - /* we did our part, wait for rebind_workers() to finish up */ - wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND)); - - /* - * rebind_workers() shouldn't finish until all workers passed the - * above WORKER_REBIND wait. Tell it when done. 
- */ - spin_lock_irq(&worker->pool->gcwq->lock); - if (!--worker->idle_rebind->cnt) - complete(&worker->idle_rebind->done); - spin_unlock_irq(&worker->pool->gcwq->lock); + /* rebind complete, become available again */ + list_add(&worker->entry, &worker->pool->idle_list); + spin_unlock_irq(&gcwq->lock); } /* @@ -1349,16 +1655,8 @@ static void busy_worker_rebind_fn(struct work_struct *work) struct worker *worker = container_of(work, struct worker, rebind_work); struct global_cwq *gcwq = worker->pool->gcwq; - worker_maybe_bind_and_lock(worker); - - /* - * %WORKER_REBIND must be cleared even if the above binding failed; - * otherwise, we may confuse the next CPU_UP cycle or oops / get - * stuck by calling idle_worker_rebind() prematurely. If CPU went - * down again inbetween, %WORKER_UNBOUND would be set, so clearing - * %WORKER_REBIND is always safe. - */ - worker_clr_flags(worker, WORKER_REBIND); + if (worker_maybe_bind_and_lock(worker)) + worker_clr_flags(worker, WORKER_UNBOUND); spin_unlock_irq(&gcwq->lock); } @@ -1370,123 +1668,74 @@ static void busy_worker_rebind_fn(struct work_struct *work) * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding * is different for idle and busy ones. * - * The idle ones should be rebound synchronously and idle rebinding should - * be complete before any worker starts executing work items with - * concurrency management enabled; otherwise, scheduler may oops trying to - * wake up non-local idle worker from wq_worker_sleeping(). + * Idle ones will be removed from the idle_list and woken up. They will + * add themselves back after completing rebind. This ensures that the + * idle_list doesn't contain any unbound workers when re-bound busy workers + * try to perform local wake-ups for concurrency management. 
* - * This is achieved by repeatedly requesting rebinding until all idle - * workers are known to have been rebound under @gcwq->lock and holding all - * idle workers from becoming busy until idle rebinding is complete. + * Busy workers can rebind after they finish their current work items. + * Queueing the rebind work item at the head of the scheduled list is + * enough. Note that nr_running will be properly bumped as busy workers + * rebind. * - * Once idle workers are rebound, busy workers can be rebound as they - * finish executing their current work items. Queueing the rebind work at - * the head of their scheduled lists is enough. Note that nr_running will - * be properbly bumped as busy workers rebind. - * - * On return, all workers are guaranteed to either be bound or have rebind - * work item scheduled. + * On return, all non-manager workers are scheduled for rebind - see + * manage_workers() for the manager special case. Any idle worker + * including the manager will not appear on @idle_list until rebind is + * complete, making local wake-ups safe. */ static void rebind_workers(struct global_cwq *gcwq) - __releases(&gcwq->lock) __acquires(&gcwq->lock) { - struct idle_rebind idle_rebind; struct worker_pool *pool; - struct worker *worker; + struct worker *worker, *n; struct hlist_node *pos; int i; lockdep_assert_held(&gcwq->lock); for_each_worker_pool(pool, gcwq) - lockdep_assert_held(&pool->manager_mutex); + lockdep_assert_held(&pool->assoc_mutex); - /* - * Rebind idle workers. Interlocked both ways. We wait for - * workers to rebind via @idle_rebind.done. Workers will wait for - * us to finish up by watching %WORKER_REBIND. 
- */ - init_completion(&idle_rebind.done); -retry: - idle_rebind.cnt = 1; - INIT_COMPLETION(idle_rebind.done); - - /* set REBIND and kick idle ones, we'll wait for these later */ + /* dequeue and kick idle ones */ for_each_worker_pool(pool, gcwq) { - list_for_each_entry(worker, &pool->idle_list, entry) { - unsigned long worker_flags = worker->flags; - - if (worker->flags & WORKER_REBIND) - continue; - - /* morph UNBOUND to REBIND atomically */ - worker_flags &= ~WORKER_UNBOUND; - worker_flags |= WORKER_REBIND; - ACCESS_ONCE(worker->flags) = worker_flags; - - idle_rebind.cnt++; - worker->idle_rebind = &idle_rebind; + list_for_each_entry_safe(worker, n, &pool->idle_list, entry) { + /* + * idle workers should be off @pool->idle_list + * until rebind is complete to avoid receiving + * premature local wake-ups. + */ + list_del_init(&worker->entry); - /* worker_thread() will call idle_worker_rebind() */ + /* + * worker_thread() will see the above dequeuing + * and call idle_worker_rebind(). + */ wake_up_process(worker->task); } } - if (--idle_rebind.cnt) { - spin_unlock_irq(&gcwq->lock); - wait_for_completion(&idle_rebind.done); - spin_lock_irq(&gcwq->lock); - /* busy ones might have become idle while waiting, retry */ - goto retry; - } - - /* all idle workers are rebound, rebind busy workers */ + /* rebind busy workers */ for_each_busy_worker(worker, i, pos, gcwq) { struct work_struct *rebind_work = &worker->rebind_work; - unsigned long worker_flags = worker->flags; - - /* morph UNBOUND to REBIND atomically */ - worker_flags &= ~WORKER_UNBOUND; - worker_flags |= WORKER_REBIND; - ACCESS_ONCE(worker->flags) = worker_flags; + struct workqueue_struct *wq; if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(rebind_work))) continue; - /* wq doesn't matter, use the default one */ debug_work_activate(rebind_work); - insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, - worker->scheduled.next, - work_color_to_flags(WORK_NO_COLOR)); - } - - /* - * All idle workers 
are rebound and waiting for %WORKER_REBIND to - * be cleared inside idle_worker_rebind(). Clear and release. - * Clearing %WORKER_REBIND from this foreign context is safe - * because these workers are still guaranteed to be idle. - * - * We need to make sure all idle workers passed WORKER_REBIND wait - * in idle_worker_rebind() before returning; otherwise, workers can - * get stuck at the wait if hotplug cycle repeats. - */ - idle_rebind.cnt = 1; - INIT_COMPLETION(idle_rebind.done); - - for_each_worker_pool(pool, gcwq) { - list_for_each_entry(worker, &pool->idle_list, entry) { - worker->flags &= ~WORKER_REBIND; - idle_rebind.cnt++; - } - } - wake_up_all(&gcwq->rebind_hold); + /* + * wq doesn't really matter but let's keep @worker->pool + * and @cwq->pool consistent for sanity. + */ + if (worker_pool_pri(worker->pool)) + wq = system_highpri_wq; + else + wq = system_wq; - if (--idle_rebind.cnt) { - spin_unlock_irq(&gcwq->lock); - wait_for_completion(&idle_rebind.done); - spin_lock_irq(&gcwq->lock); + insert_work(get_cwq(gcwq->cpu, wq), rebind_work, + worker->scheduled.next, + work_color_to_flags(WORK_NO_COLOR)); } } @@ -1844,22 +2093,22 @@ static bool manage_workers(struct worker *worker) * grab %POOL_MANAGING_WORKERS to achieve this because that can * lead to idle worker depletion (all become busy thinking someone * else is managing) which in turn can result in deadlock under - * extreme circumstances. Use @pool->manager_mutex to synchronize + * extreme circumstances. Use @pool->assoc_mutex to synchronize * manager against CPU hotplug. * - * manager_mutex would always be free unless CPU hotplug is in + * assoc_mutex would always be free unless CPU hotplug is in * progress. trylock first without dropping @gcwq->lock. 
*/ - if (unlikely(!mutex_trylock(&pool->manager_mutex))) { + if (unlikely(!mutex_trylock(&pool->assoc_mutex))) { spin_unlock_irq(&pool->gcwq->lock); - mutex_lock(&pool->manager_mutex); + mutex_lock(&pool->assoc_mutex); /* * CPU hotplug could have happened while we were waiting - * for manager_mutex. Hotplug itself can't handle us + * for assoc_mutex. Hotplug itself can't handle us * because manager isn't either on idle or busy list, and * @gcwq's state and ours could have deviated. * - * As hotplug is now excluded via manager_mutex, we can + * As hotplug is now excluded via assoc_mutex, we can * simply try to bind. It will succeed or fail depending * on @gcwq's current state. Try it and adjust * %WORKER_UNBOUND accordingly. @@ -1882,112 +2131,11 @@ static bool manage_workers(struct worker *worker) ret |= maybe_create_worker(pool); pool->flags &= ~POOL_MANAGING_WORKERS; - mutex_unlock(&pool->manager_mutex); + mutex_unlock(&pool->assoc_mutex); return ret; } /** - * move_linked_works - move linked works to a list - * @work: start of series of works to be scheduled - * @head: target list to append @work to - * @nextp: out paramter for nested worklist walking - * - * Schedule linked works starting from @work to @head. Work series to - * be scheduled starts at @work and includes any consecutive work with - * WORK_STRUCT_LINKED set in its predecessor. - * - * If @nextp is not NULL, it's updated to point to the next work of - * the last scheduled work. This allows move_linked_works() to be - * nested inside outer list_for_each_entry_safe(). - * - * CONTEXT: - * spin_lock_irq(gcwq->lock). - */ -static void move_linked_works(struct work_struct *work, struct list_head *head, - struct work_struct **nextp) -{ - struct work_struct *n; - - /* - * Linked worklist will always end before the end of the list, - * use NULL for list head. 
- */ - list_for_each_entry_safe_from(work, n, NULL, entry) { - list_move_tail(&work->entry, head); - if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) - break; - } - - /* - * If we're already inside safe list traversal and have moved - * multiple works to the scheduled queue, the next position - * needs to be updated. - */ - if (nextp) - *nextp = n; -} - -static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) -{ - struct work_struct *work = list_first_entry(&cwq->delayed_works, - struct work_struct, entry); - - trace_workqueue_activate_work(work); - move_linked_works(work, &cwq->pool->worklist, NULL); - __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); - cwq->nr_active++; -} - -/** - * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight - * @cwq: cwq of interest - * @color: color of work which left the queue - * @delayed: for a delayed work - * - * A work either has completed or is removed from pending queue, - * decrement nr_in_flight of its cwq and handle workqueue flushing. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock). - */ -static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color, - bool delayed) -{ - /* ignore uncolored works */ - if (color == WORK_NO_COLOR) - return; - - cwq->nr_in_flight[color]--; - - if (!delayed) { - cwq->nr_active--; - if (!list_empty(&cwq->delayed_works)) { - /* one down, submit a delayed one */ - if (cwq->nr_active < cwq->max_active) - cwq_activate_first_delayed(cwq); - } - } - - /* is flush in progress and are we at the flushing tip? */ - if (likely(cwq->flush_color != color)) - return; - - /* are there still in-flight works? */ - if (cwq->nr_in_flight[color]) - return; - - /* this cwq is done, clear flush_color */ - cwq->flush_color = -1; - - /* - * If this was the last cwq, wake up the first flusher. It - * will handle the rest. 
- */ - if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush)) - complete(&cwq->wq->first_flusher->done); -} - -/** * process_one_work - process single work * @worker: self * @work: work to process @@ -2030,7 +2178,7 @@ __acquires(&gcwq->lock) * necessary to avoid spurious warnings from rescuers servicing the * unbound or a disassociated gcwq. */ - WARN_ON_ONCE(!(worker->flags & (WORKER_UNBOUND | WORKER_REBIND)) && + WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) && !(gcwq->flags & GCWQ_DISASSOCIATED) && raw_smp_processor_id() != gcwq->cpu); @@ -2046,15 +2194,13 @@ __acquires(&gcwq->lock) return; } - /* claim and process */ + /* claim and dequeue */ debug_work_deactivate(work); hlist_add_head(&worker->hentry, bwh); worker->current_work = work; worker->current_cwq = cwq; work_color = get_work_color(work); - /* record the current cpu number in the work data and dequeue */ - set_work_cpu(work, gcwq->cpu); list_del_init(&work->entry); /* @@ -2071,9 +2217,16 @@ __acquires(&gcwq->lock) if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) wake_up_worker(pool); + /* + * Record the last CPU and clear PENDING which should be the last + * update to @work. Also, do this inside @gcwq->lock so that + * PENDING and queued state changes happen together while IRQ is + * disabled. 
+ */ + set_work_cpu_and_clear_pending(work, gcwq->cpu); + spin_unlock_irq(&gcwq->lock); - work_clear_pending(work); lock_map_acquire_read(&cwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); trace_workqueue_execute_start(work); @@ -2087,11 +2240,9 @@ __acquires(&gcwq->lock) lock_map_release(&cwq->wq->lockdep_map); if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { - printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " - "%s/0x%08x/%d\n", - current->comm, preempt_count(), task_pid_nr(current)); - printk(KERN_ERR " last function: "); - print_symbol("%s\n", (unsigned long)f); + pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" + " last function: %pf\n", + current->comm, preempt_count(), task_pid_nr(current), f); debug_show_held_locks(current); dump_stack(); } @@ -2106,7 +2257,7 @@ __acquires(&gcwq->lock) hlist_del_init(&worker->hentry); worker->current_work = NULL; worker->current_cwq = NULL; - cwq_dec_nr_in_flight(cwq, work_color, false); + cwq_dec_nr_in_flight(cwq, work_color); } /** @@ -2151,18 +2302,17 @@ static int worker_thread(void *__worker) woke_up: spin_lock_irq(&gcwq->lock); - /* - * DIE can be set only while idle and REBIND set while busy has - * @worker->rebind_work scheduled. Checking here is enough. 
- */ - if (unlikely(worker->flags & (WORKER_REBIND | WORKER_DIE))) { + /* we are off idle list if destruction or rebind is requested */ + if (unlikely(list_empty(&worker->entry))) { spin_unlock_irq(&gcwq->lock); + /* if DIE is set, destruction is requested */ if (worker->flags & WORKER_DIE) { worker->task->flags &= ~PF_WQ_WORKER; return 0; } + /* otherwise, rebind */ idle_worker_rebind(worker); goto woke_up; } @@ -2645,8 +2795,8 @@ reflush: if (++flush_cnt == 10 || (flush_cnt % 100 == 0 && flush_cnt <= 1000)) - pr_warning("workqueue %s: flush on destruction isn't complete after %u tries\n", - wq->name, flush_cnt); + pr_warn("workqueue %s: flush on destruction isn't complete after %u tries\n", + wq->name, flush_cnt); goto reflush; } @@ -2657,8 +2807,7 @@ reflush: } EXPORT_SYMBOL_GPL(drain_workqueue); -static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, - bool wait_executing) +static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) { struct worker *worker = NULL; struct global_cwq *gcwq; @@ -2680,13 +2829,12 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, cwq = get_work_cwq(work); if (unlikely(!cwq || gcwq != cwq->pool->gcwq)) goto already_gone; - } else if (wait_executing) { + } else { worker = find_worker_executing_work(gcwq, work); if (!worker) goto already_gone; cwq = worker->current_cwq; - } else - goto already_gone; + } insert_wq_barrier(cwq, barr, work, worker); spin_unlock_irq(&gcwq->lock); @@ -2713,15 +2861,8 @@ already_gone: * flush_work - wait for a work to finish executing the last queueing instance * @work: the work to flush * - * Wait until @work has finished execution. This function considers - * only the last queueing instance of @work. If @work has been - * enqueued across different CPUs on a non-reentrant workqueue or on - * multiple workqueues, @work might still be executing on return on - * some of the CPUs from earlier queueing. 
- * - * If @work was queued only on a non-reentrant, ordered or unbound - * workqueue, @work is guaranteed to be idle on return if it hasn't - * been requeued since flush started. + * Wait until @work has finished execution. @work is guaranteed to be idle + * on return if it hasn't been requeued since flush started. * * RETURNS: * %true if flush_work() waited for the work to finish execution, @@ -2734,140 +2875,36 @@ bool flush_work(struct work_struct *work) lock_map_acquire(&work->lockdep_map); lock_map_release(&work->lockdep_map); - if (start_flush_work(work, &barr, true)) { + if (start_flush_work(work, &barr)) { wait_for_completion(&barr.done); destroy_work_on_stack(&barr.work); return true; - } else - return false; -} -EXPORT_SYMBOL_GPL(flush_work); - -static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work) -{ - struct wq_barrier barr; - struct worker *worker; - - spin_lock_irq(&gcwq->lock); - - worker = find_worker_executing_work(gcwq, work); - if (unlikely(worker)) - insert_wq_barrier(worker->current_cwq, &barr, work, worker); - - spin_unlock_irq(&gcwq->lock); - - if (unlikely(worker)) { - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); - return true; - } else + } else { return false; -} - -static bool wait_on_work(struct work_struct *work) -{ - bool ret = false; - int cpu; - - might_sleep(); - - lock_map_acquire(&work->lockdep_map); - lock_map_release(&work->lockdep_map); - - for_each_gcwq_cpu(cpu) - ret |= wait_on_cpu_work(get_gcwq(cpu), work); - return ret; -} - -/** - * flush_work_sync - wait until a work has finished execution - * @work: the work to flush - * - * Wait until @work has finished execution. On return, it's - * guaranteed that all queueing instances of @work which happened - * before this function is called are finished. In other words, if - * @work hasn't been requeued since this function was called, @work is - * guaranteed to be idle on return. 
- * - * RETURNS: - * %true if flush_work_sync() waited for the work to finish execution, - * %false if it was already idle. - */ -bool flush_work_sync(struct work_struct *work) -{ - struct wq_barrier barr; - bool pending, waited; - - /* we'll wait for executions separately, queue barr only if pending */ - pending = start_flush_work(work, &barr, false); - - /* wait for executions to finish */ - waited = wait_on_work(work); - - /* wait for the pending one */ - if (pending) { - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); } - - return pending || waited; -} -EXPORT_SYMBOL_GPL(flush_work_sync); - -/* - * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, - * so this work can't be re-armed in any way. - */ -static int try_to_grab_pending(struct work_struct *work) -{ - struct global_cwq *gcwq; - int ret = -1; - - if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) - return 0; - - /* - * The queueing is in progress, or it is already queued. Try to - * steal it from ->worklist without clearing WORK_STRUCT_PENDING. - */ - gcwq = get_work_gcwq(work); - if (!gcwq) - return ret; - - spin_lock_irq(&gcwq->lock); - if (!list_empty(&work->entry)) { - /* - * This work is queued, but perhaps we locked the wrong gcwq. - * In that case we must see the new value after rmb(), see - * insert_work()->wmb(). 
- */ - smp_rmb(); - if (gcwq == get_work_gcwq(work)) { - debug_work_deactivate(work); - list_del_init(&work->entry); - cwq_dec_nr_in_flight(get_work_cwq(work), - get_work_color(work), - *work_data_bits(work) & WORK_STRUCT_DELAYED); - ret = 1; - } - } - spin_unlock_irq(&gcwq->lock); - - return ret; } +EXPORT_SYMBOL_GPL(flush_work); -static bool __cancel_work_timer(struct work_struct *work, - struct timer_list* timer) +static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) { + unsigned long flags; int ret; do { - ret = (timer && likely(del_timer(timer))); - if (!ret) - ret = try_to_grab_pending(work); - wait_on_work(work); + ret = try_to_grab_pending(work, is_dwork, &flags); + /* + * If someone else is canceling, wait for the same event it + * would be waiting for before retrying. + */ + if (unlikely(ret == -ENOENT)) + flush_work(work); } while (unlikely(ret < 0)); + /* tell other tasks trying to grab @work to back off */ + mark_work_canceling(work); + local_irq_restore(flags); + + flush_work(work); clear_work_data(work); return ret; } @@ -2892,7 +2929,7 @@ static bool __cancel_work_timer(struct work_struct *work, */ bool cancel_work_sync(struct work_struct *work) { - return __cancel_work_timer(work, NULL); + return __cancel_work_timer(work, false); } EXPORT_SYMBOL_GPL(cancel_work_sync); @@ -2910,33 +2947,44 @@ EXPORT_SYMBOL_GPL(cancel_work_sync); */ bool flush_delayed_work(struct delayed_work *dwork) { + local_irq_disable(); if (del_timer_sync(&dwork->timer)) - __queue_work(raw_smp_processor_id(), + __queue_work(dwork->cpu, get_work_cwq(&dwork->work)->wq, &dwork->work); + local_irq_enable(); return flush_work(&dwork->work); } EXPORT_SYMBOL(flush_delayed_work); /** - * flush_delayed_work_sync - wait for a dwork to finish - * @dwork: the delayed work to flush + * cancel_delayed_work - cancel a delayed work + * @dwork: delayed_work to cancel * - * Delayed timer is cancelled and the pending work is queued for - * execution immediately. 
Other than timer handling, its behavior - * is identical to flush_work_sync(). + * Kill off a pending delayed_work. Returns %true if @dwork was pending + * and canceled; %false if wasn't pending. Note that the work callback + * function may still be running on return, unless it returns %true and the + * work doesn't re-arm itself. Explicitly flush or use + * cancel_delayed_work_sync() to wait on it. * - * RETURNS: - * %true if flush_work_sync() waited for the work to finish execution, - * %false if it was already idle. + * This function is safe to call from any context including IRQ handler. */ -bool flush_delayed_work_sync(struct delayed_work *dwork) +bool cancel_delayed_work(struct delayed_work *dwork) { - if (del_timer_sync(&dwork->timer)) - __queue_work(raw_smp_processor_id(), - get_work_cwq(&dwork->work)->wq, &dwork->work); - return flush_work_sync(&dwork->work); + unsigned long flags; + int ret; + + do { + ret = try_to_grab_pending(&dwork->work, true, &flags); + } while (unlikely(ret == -EAGAIN)); + + if (unlikely(ret < 0)) + return false; + + set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work)); + local_irq_restore(flags); + return true; } -EXPORT_SYMBOL(flush_delayed_work_sync); +EXPORT_SYMBOL(cancel_delayed_work); /** * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish @@ -2949,54 +2997,39 @@ EXPORT_SYMBOL(flush_delayed_work_sync); */ bool cancel_delayed_work_sync(struct delayed_work *dwork) { - return __cancel_work_timer(&dwork->work, &dwork->timer); + return __cancel_work_timer(&dwork->work, true); } EXPORT_SYMBOL(cancel_delayed_work_sync); /** - * schedule_work - put work task in global workqueue - * @work: job to be done - * - * Returns zero if @work was already on the kernel-global workqueue and - * non-zero otherwise. - * - * This puts a job in the kernel-global workqueue if it was not already - * queued and leaves it in the same position on the kernel-global - * workqueue otherwise. 
- */ -int schedule_work(struct work_struct *work) -{ - return queue_work(system_wq, work); -} -EXPORT_SYMBOL(schedule_work); - -/* * schedule_work_on - put work task on a specific cpu * @cpu: cpu to put the work task on * @work: job to be done * * This puts a job on a specific cpu */ -int schedule_work_on(int cpu, struct work_struct *work) +bool schedule_work_on(int cpu, struct work_struct *work) { return queue_work_on(cpu, system_wq, work); } EXPORT_SYMBOL(schedule_work_on); /** - * schedule_delayed_work - put work task in global workqueue after delay - * @dwork: job to be done - * @delay: number of jiffies to wait or 0 for immediate execution + * schedule_work - put work task in global workqueue + * @work: job to be done * - * After waiting for a given time this puts a job in the kernel-global - * workqueue. + * Returns %false if @work was already on the kernel-global workqueue and + * %true otherwise. + * + * This puts a job in the kernel-global workqueue if it was not already + * queued and leaves it in the same position on the kernel-global + * workqueue otherwise. */ -int schedule_delayed_work(struct delayed_work *dwork, - unsigned long delay) +bool schedule_work(struct work_struct *work) { - return queue_delayed_work(system_wq, dwork, delay); + return queue_work(system_wq, work); } -EXPORT_SYMBOL(schedule_delayed_work); +EXPORT_SYMBOL(schedule_work); /** * schedule_delayed_work_on - queue work in global workqueue on CPU after delay @@ -3007,14 +3040,28 @@ EXPORT_SYMBOL(schedule_delayed_work); * After waiting for a given time this puts a job in the kernel-global * workqueue on the specified CPU. 
*/ -int schedule_delayed_work_on(int cpu, - struct delayed_work *dwork, unsigned long delay) +bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, + unsigned long delay) { return queue_delayed_work_on(cpu, system_wq, dwork, delay); } EXPORT_SYMBOL(schedule_delayed_work_on); /** + * schedule_delayed_work - put work task in global workqueue after delay + * @dwork: job to be done + * @delay: number of jiffies to wait or 0 for immediate execution + * + * After waiting for a given time this puts a job in the kernel-global + * workqueue. + */ +bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) +{ + return queue_delayed_work(system_wq, dwork, delay); +} +EXPORT_SYMBOL(schedule_delayed_work); + +/** * schedule_on_each_cpu - execute a function synchronously on each online CPU * @func: the function to call * @@ -3161,9 +3208,8 @@ static int wq_clamp_max_active(int max_active, unsigned int flags, int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE; if (max_active < 1 || max_active > lim) - printk(KERN_WARNING "workqueue: max_active %d requested for %s " - "is out of range, clamping between %d and %d\n", - max_active, name, 1, lim); + pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n", + max_active, name, 1, lim); return clamp_val(max_active, 1, lim); } @@ -3319,6 +3365,26 @@ void destroy_workqueue(struct workqueue_struct *wq) EXPORT_SYMBOL_GPL(destroy_workqueue); /** + * cwq_set_max_active - adjust max_active of a cwq + * @cwq: target cpu_workqueue_struct + * @max_active: new max_active value. + * + * Set @cwq->max_active to @max_active and activate delayed works if + * increased. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). 
+ */ +static void cwq_set_max_active(struct cpu_workqueue_struct *cwq, int max_active) +{ + cwq->max_active = max_active; + + while (!list_empty(&cwq->delayed_works) && + cwq->nr_active < cwq->max_active) + cwq_activate_first_delayed(cwq); +} + +/** * workqueue_set_max_active - adjust max_active of a workqueue * @wq: target workqueue * @max_active: new max_active value. @@ -3345,7 +3411,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) if (!(wq->flags & WQ_FREEZABLE) || !(gcwq->flags & GCWQ_FREEZING)) - get_cwq(gcwq->cpu, wq)->max_active = max_active; + cwq_set_max_active(get_cwq(gcwq->cpu, wq), max_active); spin_unlock_irq(&gcwq->lock); } @@ -3440,23 +3506,23 @@ EXPORT_SYMBOL_GPL(work_busy); */ /* claim manager positions of all pools */ -static void gcwq_claim_management_and_lock(struct global_cwq *gcwq) +static void gcwq_claim_assoc_and_lock(struct global_cwq *gcwq) { struct worker_pool *pool; for_each_worker_pool(pool, gcwq) - mutex_lock_nested(&pool->manager_mutex, pool - gcwq->pools); + mutex_lock_nested(&pool->assoc_mutex, pool - gcwq->pools); spin_lock_irq(&gcwq->lock); } /* release manager positions */ -static void gcwq_release_management_and_unlock(struct global_cwq *gcwq) +static void gcwq_release_assoc_and_unlock(struct global_cwq *gcwq) { struct worker_pool *pool; spin_unlock_irq(&gcwq->lock); for_each_worker_pool(pool, gcwq) - mutex_unlock(&pool->manager_mutex); + mutex_unlock(&pool->assoc_mutex); } static void gcwq_unbind_fn(struct work_struct *work) @@ -3469,7 +3535,7 @@ static void gcwq_unbind_fn(struct work_struct *work) BUG_ON(gcwq->cpu != smp_processor_id()); - gcwq_claim_management_and_lock(gcwq); + gcwq_claim_assoc_and_lock(gcwq); /* * We've claimed all manager positions. 
Make all workers unbound @@ -3486,7 +3552,7 @@ static void gcwq_unbind_fn(struct work_struct *work) gcwq->flags |= GCWQ_DISASSOCIATED; - gcwq_release_management_and_unlock(gcwq); + gcwq_release_assoc_and_unlock(gcwq); /* * Call schedule() so that we cross rq->lock and thus can guarantee @@ -3514,7 +3580,7 @@ static void gcwq_unbind_fn(struct work_struct *work) * Workqueues should be brought up before normal priority CPU notifiers. * This will be registered high priority CPU notifier. */ -static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, +static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -3542,10 +3608,10 @@ static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, case CPU_DOWN_FAILED: case CPU_ONLINE: - gcwq_claim_management_and_lock(gcwq); + gcwq_claim_assoc_and_lock(gcwq); gcwq->flags &= ~GCWQ_DISASSOCIATED; rebind_workers(gcwq); - gcwq_release_management_and_unlock(gcwq); + gcwq_release_assoc_and_unlock(gcwq); break; } return NOTIFY_OK; @@ -3555,7 +3621,7 @@ static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, * Workqueues should be brought down after normal priority CPU notifiers. * This will be registered as low priority CPU notifier. 
*/ -static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, +static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -3566,7 +3632,7 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, case CPU_DOWN_PREPARE: /* unbinding should happen on the local CPU */ INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn); - schedule_work_on(cpu, &unbind_work); + queue_work_on(cpu, system_highpri_wq, &unbind_work); flush_work(&unbind_work); break; } @@ -3735,11 +3801,7 @@ void thaw_workqueues(void) continue; /* restore max_active and repopulate worklist */ - cwq->max_active = wq->saved_max_active; - - while (!list_empty(&cwq->delayed_works) && - cwq->nr_active < cwq->max_active) - cwq_activate_first_delayed(cwq); + cwq_set_max_active(cwq, wq->saved_max_active); } for_each_worker_pool(pool, gcwq) @@ -3759,8 +3821,12 @@ static int __init init_workqueues(void) unsigned int cpu; int i; + /* make sure we have enough bits for OFFQ CPU number */ + BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_CPU_SHIFT)) < + WORK_CPU_LAST); + cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); - cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); + hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); /* initialize gcwqs */ for_each_gcwq_cpu(cpu) { @@ -3786,11 +3852,9 @@ static int __init init_workqueues(void) setup_timer(&pool->mayday_timer, gcwq_mayday_timeout, (unsigned long)pool); - mutex_init(&pool->manager_mutex); + mutex_init(&pool->assoc_mutex); ida_init(&pool->worker_ida); } - - init_waitqueue_head(&gcwq->rebind_hold); } /* create the initial worker */ @@ -3813,17 +3877,14 @@ static int __init init_workqueues(void) } system_wq = alloc_workqueue("events", 0, 0); + system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0); system_long_wq = alloc_workqueue("events_long", 0, 0); - system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 
0); system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_UNBOUND_MAX_ACTIVE); system_freezable_wq = alloc_workqueue("events_freezable", WQ_FREEZABLE, 0); - system_nrt_freezable_wq = alloc_workqueue("events_nrt_freezable", - WQ_NON_REENTRANT | WQ_FREEZABLE, 0); - BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || - !system_unbound_wq || !system_freezable_wq || - !system_nrt_freezable_wq); + BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq || + !system_unbound_wq || !system_freezable_wq); return 0; } early_initcall(init_workqueues); |