From 864c2357ca898c6171fe5284f5ecc795c8ce27a8 Mon Sep 17 00:00:00 2001 From: David Carrillo-Cisneros Date: Tue, 1 Nov 2016 11:52:58 -0700 Subject: perf/core: Do not set cpuctx->cgrp for unscheduled cgroups Commit: db4a835601b7 ("perf/core: Set cgroup in CPU contexts for new cgroup events") failed to verify that event->cgrp is actually the scheduled cgroup in a CPU before setting cpuctx->cgrp. This patch fixes that. Now that there is a different path for scheduled and unscheduled cgroup, add a warning to catch when cpuctx->cgrp is still set after the last cgroup event has been unsheduled. To verify the bug: # Create 2 cgroups. mkdir /dev/cgroups/devices/g1 mkdir /dev/cgroups/devices/g2 # launch a task, bind it to a cpu and move it to g1 CPU=2 while :; do : ; done & P=$! taskset -pc $CPU $P echo $P > /dev/cgroups/devices/g1/tasks # monitor g2 (it runs no tasks) and observe output perf stat -e cycles -I 1000 -C $CPU -G g2 # time counts unit events 1.000091408 7,579,527 cycles g2 2.000350111 cycles g2 3.000589181 cycles g2 4.000771428 cycles g2 # note first line that displays that a task run in g2, despite # g2 having no tasks. This is because cpuctx->cgrp was wrongly # set when context of new event was installed. # After applying the fix we obtain the right output: perf stat -e cycles -I 1000 -C $CPU -G g2 # time counts unit events 1.000119615 cycles g2 2.000389430 cycles g2 3.000590962 cycles g2 Signed-off-by: David Carrillo-Cisneros Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Nilay Vaish Cc: Paul Turner Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vegard Nossum Link: http://lkml.kernel.org/r/1478026378-86083-1-git-send-email-davidcc@google.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 0e292132efac..ff230bb4a02e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -902,6 +902,17 @@ list_update_cgroup_event(struct perf_event *event, * this will always be called from the right CPU. */ cpuctx = __get_cpu_context(ctx); + + /* Only set/clear cpuctx->cgrp if current task uses event->cgrp. */ + if (perf_cgroup_from_task(current, ctx) != event->cgrp) { + /* + * We are removing the last cpu event in this context. + * If that event is not active in this cpu, cpuctx->cgrp + * should've been cleared by perf_cgroup_switch. + */ + WARN_ON_ONCE(!add && cpuctx->cgrp); + return; + } cpuctx->cgrp = add ? event->cgrp : NULL; } -- cgit v1.2.3 From e96271f3ed7e702fa36dd0605c0c5b5f065af816 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 18 Nov 2016 13:38:43 +0200 Subject: perf/core: Fix address filter parser The token table passed into match_token() must be null-terminated, which it currently is not in the perf's address filter string parser, as caught by Vince's perf_fuzzer and KASAN. It doesn't blow up otherwise because of the alignment padding of the table to the next element in the .rodata, which is luck. Fixing by adding a null-terminator to the token table. Reported-by: Vince Weaver Tested-by: Vince Weaver Signed-off-by: Alexander Shishkin Acked-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Thomas Gleixner Cc: dvyukov@google.com Cc: stable@vger.kernel.org # v4.7+ Fixes: 375637bc524 ("perf/core: Introduce address range filtering") Link: http://lkml.kernel.org/r/877f81f264.fsf@ashishki-desk.ger.corp.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index ff230bb4a02e..6ee1febdf6ff 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8029,6 +8029,7 @@ restart: * if is not specified, the range is treated as a single address. */ enum { + IF_ACT_NONE = -1, IF_ACT_FILTER, IF_ACT_START, IF_ACT_STOP, @@ -8052,6 +8053,7 @@ static const match_table_t if_tokens = { { IF_SRC_KERNEL, "%u/%u" }, { IF_SRC_FILEADDR, "%u@%s" }, { IF_SRC_KERNELADDR, "%u" }, + { IF_ACT_NONE, NULL }, }; /* -- cgit v1.2.3 From 18f649ef344127ef6de23a5a4272dbe2fdb73dde Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 14 Nov 2016 19:46:09 +0100 Subject: sched/autogroup: Fix autogroup_move_group() to never skip sched_move_task() The PF_EXITING check in task_wants_autogroup() is no longer needed. Remove it, but see the next patch. However the comment is correct in that autogroup_move_group() must always change task_group() for every thread so the sysctl_ check is very wrong; we can race with cgroups and even sys_setsid() is not safe because a task running with task_group() == ag->tg must participate in refcounting: int main(void) { int sctl = open("/proc/sys/kernel/sched_autogroup_enabled", O_WRONLY); assert(sctl > 0); if (fork()) { wait(NULL); // destroy the child's ag/tg pause(); } assert(pwrite(sctl, "1\n", 2, 0) == 2); assert(setsid() > 0); if (fork()) pause(); kill(getppid(), SIGKILL); sleep(1); // The child has gone, the grandchild runs with kref == 1 assert(pwrite(sctl, "0\n", 2, 0) == 2); assert(setsid() > 0); // runs with the freed ag/tg for (;;) sleep(1); return 0; } crashes the kernel. It doesn't really need sleep(1), it doesn't matter if autogroup_move_group() actually frees the task_group or this happens later. Reported-by: Vern Lovejoy Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: hartsjc@redhat.com Cc: vbendel@redhat.com Link: http://lkml.kernel.org/r/20161114184609.GA15965@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/auto_group.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index a5d966cb8891..ad2b19ad6ca0 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -111,14 +111,11 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) { if (tg != &root_task_group) return false; - /* - * We can only assume the task group can't go away on us if - * autogroup_move_group() can see us on ->thread_group list. + * If we race with autogroup_move_group() the caller can use the old + * value of signal->autogroup but in this case sched_move_task() will + * be called again before autogroup_kref_put(). */ - if (p->flags & PF_EXITING) - return false; - return true; } @@ -138,13 +135,17 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) } p->signal->autogroup = autogroup_kref_get(ag); - - if (!READ_ONCE(sysctl_sched_autogroup_enabled)) - goto out; - + /* + * We can't avoid sched_move_task() after we changed signal->autogroup, + * this process can already run with task_group() == prev->tg or we can + * race with cgroup code which can read autogroup = prev under rq->lock. + * In the latter case for_each_thread() can not miss a migrating thread, + * cpu_cgroup_attach() must not be possible after cgroup_exit() and it + * can't be removed from thread list, we hold ->siglock. + */ for_each_thread(p, t) sched_move_task(t); -out: + unlock_task_sighand(p, &flags); autogroup_kref_put(prev); } -- cgit v1.2.3 From 8e5bfa8c1f8471aa4a2d30be631ef2b50e10abaf Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 14 Nov 2016 19:46:12 +0100 Subject: sched/autogroup: Do not use autogroup->tg in zombie threads Exactly because for_each_thread() in autogroup_move_group() can't see it and update its ->sched_task_group before _put() and possibly free(). So the exiting task needs another sched_move_task() before exit_notify() and we need to re-introduce the PF_EXITING (or similar) check removed by the previous change for another reason. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: hartsjc@redhat.com Cc: vbendel@redhat.com Cc: vlovejoy@redhat.com Link: http://lkml.kernel.org/r/20161114184612.GA15968@redhat.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ kernel/exit.c | 1 + kernel/sched/auto_group.c | 19 +++++++++++++++++++ 3 files changed, 22 insertions(+) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 348f51b0ec92..e9c009dc3a4a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2567,6 +2567,7 @@ extern void sched_autogroup_create_attach(struct task_struct *p); extern void sched_autogroup_detach(struct task_struct *p); extern void sched_autogroup_fork(struct signal_struct *sig); extern void sched_autogroup_exit(struct signal_struct *sig); +extern void sched_autogroup_exit_task(struct task_struct *p); #ifdef CONFIG_PROC_FS extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m); extern int proc_sched_autogroup_set_nice(struct task_struct *p, int nice); @@ -2576,6 +2577,7 @@ static inline void sched_autogroup_create_attach(struct task_struct *p) { } static inline void sched_autogroup_detach(struct task_struct *p) { } static inline void sched_autogroup_fork(struct signal_struct *sig) { } static inline void sched_autogroup_exit(struct signal_struct *sig) { } +static inline void sched_autogroup_exit_task(struct task_struct *p) { } #endif extern int yield_to(struct task_struct *p, bool preempt); diff --git a/kernel/exit.c b/kernel/exit.c index 9d68c45ebbe3..3076f3089919 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -836,6 +836,7 @@ void __noreturn do_exit(long code) */ perf_event_exit_task(tsk); + sched_autogroup_exit_task(tsk); cgroup_exit(tsk); /* diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index ad2b19ad6ca0..f1c8fd566246 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -115,10 +115,26 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) * If we race with autogroup_move_group() the caller can use the old * value of signal->autogroup but in this case sched_move_task() will * be called again before autogroup_kref_put(). + * + * However, there is no way sched_autogroup_exit_task() could tell us + * to avoid autogroup->tg, so we abuse PF_EXITING flag for this case. */ + if (p->flags & PF_EXITING) + return false; + return true; } +void sched_autogroup_exit_task(struct task_struct *p) +{ + /* + * We are going to call exit_notify() and autogroup_move_group() can't + * see this thread after that: we can no longer use signal->autogroup. + * See the PF_EXITING check in task_wants_autogroup(). + */ + sched_move_task(p); +} + static void autogroup_move_group(struct task_struct *p, struct autogroup *ag) { @@ -142,6 +158,9 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) * In the latter case for_each_thread() can not miss a migrating thread, * cpu_cgroup_attach() must not be possible after cgroup_exit() and it * can't be removed from thread list, we hold ->siglock. + * + * If an exiting thread was already removed from thread list we rely on + * sched_autogroup_exit_task(). */ for_each_thread(p, t) sched_move_task(t); -- cgit v1.2.3