diff options
-rw-r--r-- | Documentation/kernel-parameters.txt | 5 | ||||
-rw-r--r-- | Documentation/sysctl/kernel.txt | 8 | ||||
-rw-r--r-- | include/linux/latencytop.h | 3 | ||||
-rw-r--r-- | include/linux/sched.h | 4 | ||||
-rw-r--r-- | include/linux/sched/sysctl.h | 4 | ||||
-rw-r--r-- | kernel/latencytop.c | 14 | ||||
-rw-r--r-- | kernel/profile.c | 1 | ||||
-rw-r--r-- | kernel/sched/core.c | 70 | ||||
-rw-r--r-- | kernel/sched/debug.c | 102 | ||||
-rw-r--r-- | kernel/sched/fair.c | 113 | ||||
-rw-r--r-- | kernel/sched/sched.h | 1 | ||||
-rw-r--r-- | kernel/sched/stats.h | 8 | ||||
-rw-r--r-- | kernel/sysctl.c | 13 |
13 files changed, 256 insertions, 90 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 551ecf09c8dd..ed47b609530b 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3528,6 +3528,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. sched_debug [KNL] Enables verbose scheduler debug messages. + schedstats= [KNL,X86] Enable or disable scheduled statistics. + Allowed values are enable and disable. This feature + incurs a small amount of overhead in the scheduler + but is useful for debugging and performance tuning. + skew_tick= [KNL] Offset the periodic timer tick per cpu to mitigate xtime_lock contention on larger systems, and/or RCU lock contention on all systems with CONFIG_MAXSMP set. diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index a93b414672a7..87119dc9bc64 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -760,6 +760,14 @@ rtsig-nr shows the number of RT signals currently queued. ============================================================== +sched_schedstats: + +Enables/disables scheduler statistics. Enabling this feature +incurs a small amount of overhead in the scheduler but is +useful for debugging and performance tuning. + +============================================================== + sg-big-buff: This file shows the size of the generic SCSI (sg) buffer. diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h index e23121f9d82a..59ccab297ae0 100644 --- a/include/linux/latencytop.h +++ b/include/linux/latencytop.h @@ -37,6 +37,9 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter) void clear_all_latency_tracing(struct task_struct *p); +extern int sysctl_latencytop(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); + #else static inline void diff --git a/include/linux/sched.h b/include/linux/sched.h index a10494a94cc3..a292c4b7e94c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -920,6 +920,10 @@ static inline int sched_info_on(void) #endif } +#ifdef CONFIG_SCHEDSTATS +void force_schedstat_enabled(void); +#endif + enum cpu_idle_type { CPU_IDLE, CPU_NOT_IDLE, diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index c9e4731cf10b..4f080ab4f2cd 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -95,4 +95,8 @@ extern int sysctl_numa_balancing(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +extern int sysctl_schedstats(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); + #endif /* _SCHED_SYSCTL_H */ diff --git a/kernel/latencytop.c b/kernel/latencytop.c index a02812743a7e..b5c30d9f46c5 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -47,12 +47,12 @@ * of times) */ -#include <linux/latencytop.h> #include <linux/kallsyms.h> #include <linux/seq_file.h> #include <linux/notifier.h> #include <linux/spinlock.h> #include <linux/proc_fs.h> +#include <linux/latencytop.h> #include <linux/export.h> #include <linux/sched.h> #include <linux/list.h> @@ -289,4 +289,16 @@ static int __init init_lstats_procfs(void) proc_create("latency_stats", 0644, NULL, &lstats_fops); return 0; } + +int sysctl_latencytop(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int err; + + err = proc_dointvec(table, write, buffer, lenp, ppos); + if (latencytop_enabled) + force_schedstat_enabled(); + + return err; +} device_initcall(init_lstats_procfs); diff --git a/kernel/profile.c b/kernel/profile.c index 99513e1160e5..51369697466e 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -59,6 +59,7 @@ int profile_setup(char *str) if (!strncmp(str, sleepstr, strlen(sleepstr))) { #ifdef CONFIG_SCHEDSTATS + force_schedstat_enabled(); prof_on = SLEEP_PROFILING; if (str[strlen(sleepstr)] == ',') str += strlen(sleepstr) + 1; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 24fcdbf28b18..7e548bde67ee 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2093,7 +2093,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ttwu_queue(p, cpu); stat: - ttwu_stat(p, cpu, wake_flags); + if (schedstat_enabled()) + ttwu_stat(p, cpu, wake_flags); out: raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -2141,7 +2142,8 @@ static void try_to_wake_up_local(struct task_struct *p) ttwu_activate(rq, p, ENQUEUE_WAKEUP); ttwu_do_wakeup(rq, p, 0); - ttwu_stat(p, smp_processor_id(), 0); + if (schedstat_enabled()) + ttwu_stat(p, smp_processor_id(), 0); out: raw_spin_unlock(&p->pi_lock); } @@ -2210,6 +2212,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #endif #ifdef CONFIG_SCHEDSTATS + /* Even if schedstat is disabled, there should not be garbage */ memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif @@ -2281,6 +2284,69 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, #endif #endif +DEFINE_STATIC_KEY_FALSE(sched_schedstats); + +#ifdef CONFIG_SCHEDSTATS +static void set_schedstats(bool enabled) +{ + if (enabled) + static_branch_enable(&sched_schedstats); + else + static_branch_disable(&sched_schedstats); +} + +void force_schedstat_enabled(void) +{ + if (!schedstat_enabled()) { + pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); + static_branch_enable(&sched_schedstats); + } +} + +static int __init setup_schedstats(char *str) +{ + int ret = 0; + if (!str) + goto out; + + if (!strcmp(str, "enable")) { + set_schedstats(true); + ret = 1; + } else if (!strcmp(str, "disable")) { + set_schedstats(false); + ret = 1; + } +out: + if (!ret) + pr_warn("Unable to parse schedstats=\n"); + + return ret; +} +__setup("schedstats=", setup_schedstats); + +#ifdef CONFIG_PROC_SYSCTL +int sysctl_schedstats(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int err; + int state = static_branch_likely(&sched_schedstats); + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &state; + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + if (write) + set_schedstats(state); + return err; +} +#endif +#endif + /* * fork()/clone()-time setup: */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 641511771ae6..7cfa87bd8b89 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -75,16 +75,18 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group PN(se->vruntime); PN(se->sum_exec_runtime); #ifdef CONFIG_SCHEDSTATS - PN(se->statistics.wait_start); - PN(se->statistics.sleep_start); - PN(se->statistics.block_start); - PN(se->statistics.sleep_max); - PN(se->statistics.block_max); - PN(se->statistics.exec_max); - PN(se->statistics.slice_max); - PN(se->statistics.wait_max); - PN(se->statistics.wait_sum); - P(se->statistics.wait_count); + if (schedstat_enabled()) { + PN(se->statistics.wait_start); + PN(se->statistics.sleep_start); + PN(se->statistics.block_start); + PN(se->statistics.sleep_max); + PN(se->statistics.block_max); + PN(se->statistics.exec_max); + PN(se->statistics.slice_max); + PN(se->statistics.wait_max); + PN(se->statistics.wait_sum); + P(se->statistics.wait_count); + } #endif P(se->load.weight); #ifdef CONFIG_SMP @@ -122,10 +124,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) (long long)(p->nvcsw + p->nivcsw), p->prio); #ifdef CONFIG_SCHEDSTATS - SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", - SPLIT_NS(p->se.statistics.wait_sum), - SPLIT_NS(p->se.sum_exec_runtime), - SPLIT_NS(p->se.statistics.sum_sleep_runtime)); + if (schedstat_enabled()) { + SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", + SPLIT_NS(p->se.statistics.wait_sum), + SPLIT_NS(p->se.sum_exec_runtime), + SPLIT_NS(p->se.statistics.sum_sleep_runtime)); + } #else SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", 0LL, 0L, @@ -313,17 +317,18 @@ do { \ #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); #define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); - P(yld_count); - - P(sched_count); - P(sched_goidle); #ifdef CONFIG_SMP P64(avg_idle); P64(max_idle_balance_cost); #endif - P(ttwu_count); - P(ttwu_local); + if (schedstat_enabled()) { + P(yld_count); + P(sched_count); + P(sched_goidle); + P(ttwu_count); + P(ttwu_local); + } #undef P #undef P64 @@ -569,38 +574,39 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) nr_switches = p->nvcsw + p->nivcsw; #ifdef CONFIG_SCHEDSTATS - PN(se.statistics.sum_sleep_runtime); - PN(se.statistics.wait_start); - PN(se.statistics.sleep_start); - PN(se.statistics.block_start); - PN(se.statistics.sleep_max); - PN(se.statistics.block_max); - PN(se.statistics.exec_max); - PN(se.statistics.slice_max); - PN(se.statistics.wait_max); - PN(se.statistics.wait_sum); - P(se.statistics.wait_count); - PN(se.statistics.iowait_sum); - P(se.statistics.iowait_count); P(se.nr_migrations); - P(se.statistics.nr_migrations_cold); - P(se.statistics.nr_failed_migrations_affine); - P(se.statistics.nr_failed_migrations_running); - P(se.statistics.nr_failed_migrations_hot); - P(se.statistics.nr_forced_migrations); - P(se.statistics.nr_wakeups); - P(se.statistics.nr_wakeups_sync); - P(se.statistics.nr_wakeups_migrate); - P(se.statistics.nr_wakeups_local); - P(se.statistics.nr_wakeups_remote); - P(se.statistics.nr_wakeups_affine); - P(se.statistics.nr_wakeups_affine_attempts); - P(se.statistics.nr_wakeups_passive); - P(se.statistics.nr_wakeups_idle); - { + if (schedstat_enabled()) { u64 avg_atom, avg_per_cpu; + PN(se.statistics.sum_sleep_runtime); + PN(se.statistics.wait_start); + PN(se.statistics.sleep_start); + PN(se.statistics.block_start); + PN(se.statistics.sleep_max); + PN(se.statistics.block_max); + PN(se.statistics.exec_max); + PN(se.statistics.slice_max); + PN(se.statistics.wait_max); + PN(se.statistics.wait_sum); + P(se.statistics.wait_count); + PN(se.statistics.iowait_sum); + P(se.statistics.iowait_count); + P(se.statistics.nr_migrations_cold); + P(se.statistics.nr_failed_migrations_affine); + P(se.statistics.nr_failed_migrations_running); + P(se.statistics.nr_failed_migrations_hot); + P(se.statistics.nr_forced_migrations); + P(se.statistics.nr_wakeups); + P(se.statistics.nr_wakeups_sync); + P(se.statistics.nr_wakeups_migrate); + P(se.statistics.nr_wakeups_local); + P(se.statistics.nr_wakeups_remote); + P(se.statistics.nr_wakeups_affine); + P(se.statistics.nr_wakeups_affine_attempts); + P(se.statistics.nr_wakeups_passive); + P(se.statistics.nr_wakeups_idle); + avg_atom = p->se.sum_exec_runtime; if (nr_switches) avg_atom = div64_ul(avg_atom, nr_switches); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 56b7d4b83947..51a45502d8a6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -20,8 +20,8 @@ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ -#include <linux/latencytop.h> #include <linux/sched.h> +#include <linux/latencytop.h> #include <linux/cpumask.h> #include <linux/cpuidle.h> #include <linux/slab.h> @@ -755,7 +755,9 @@ static void update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct task_struct *p; - u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; + u64 delta; + + delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; if (entity_is_task(se)) { p = task_of(se); @@ -776,22 +778,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) se->statistics.wait_sum += delta; se->statistics.wait_start = 0; } -#else -static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -} - -static inline void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -} -#endif /* * Task is being enqueued - update stats: */ -static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +static inline void +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { /* * Are we enqueueing a waiting task? (for current tasks @@ -802,7 +794,7 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) } static inline void -update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { /* * Mark the end of the wait period if dequeueing a @@ -810,7 +802,40 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) */ if (se != cfs_rq->curr) update_stats_wait_end(cfs_rq, se); + + if (flags & DEQUEUE_SLEEP) { + if (entity_is_task(se)) { + struct task_struct *tsk = task_of(se); + + if (tsk->state & TASK_INTERRUPTIBLE) + se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); + if (tsk->state & TASK_UNINTERRUPTIBLE) + se->statistics.block_start = rq_clock(rq_of(cfs_rq)); + } + } + +} +#else +static inline void +update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} + +static inline void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} + +static inline void +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} + +static inline void +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +{ } +#endif /* * We are picking a new current task - update its stats: @@ -3102,6 +3127,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) static void check_enqueue_throttle(struct cfs_rq *cfs_rq); +static inline void check_schedstat_required(void) +{ +#ifdef CONFIG_SCHEDSTATS + if (schedstat_enabled()) + return; + + /* Force schedstat enabled if a dependent tracepoint is active */ + if (trace_sched_stat_wait_enabled() || + trace_sched_stat_sleep_enabled() || + trace_sched_stat_iowait_enabled() || + trace_sched_stat_blocked_enabled() || + trace_sched_stat_runtime_enabled()) { + pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, " + "stat_blocked and stat_runtime require the " + "kernel parameter schedstats=enabled or " + "kernel.sched_schedstats=1\n"); + } +#endif +} + static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { @@ -3122,11 +3167,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); - enqueue_sleeper(cfs_rq, se); + if (schedstat_enabled()) + enqueue_sleeper(cfs_rq, se); } - update_stats_enqueue(cfs_rq, se); - check_spread(cfs_rq, se); + check_schedstat_required(); + if (schedstat_enabled()) { + update_stats_enqueue(cfs_rq, se); + check_spread(cfs_rq, se); + } if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; @@ -3193,19 +3242,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_curr(cfs_rq); dequeue_entity_load_avg(cfs_rq, se); - update_stats_dequeue(cfs_rq, se); - if (flags & DEQUEUE_SLEEP) { -#ifdef CONFIG_SCHEDSTATS - if (entity_is_task(se)) { - struct task_struct *tsk = task_of(se); - - if (tsk->state & TASK_INTERRUPTIBLE) - se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); - if (tsk->state & TASK_UNINTERRUPTIBLE) - se->statistics.block_start = rq_clock(rq_of(cfs_rq)); - } -#endif - } + if (schedstat_enabled()) + update_stats_dequeue(cfs_rq, se, flags); clear_buddies(cfs_rq, se); @@ -3279,7 +3317,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * a CPU. So account for the time it spent waiting on the * runqueue. */ - update_stats_wait_end(cfs_rq, se); + if (schedstat_enabled()) + update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); update_load_avg(se, 1); } @@ -3292,7 +3331,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * least twice that of our own weight (i.e. dont track it * when there are only lesser-weight tasks around): */ - if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { + if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { se->statistics.slice_max = max(se->statistics.slice_max, se->sum_exec_runtime - se->prev_sum_exec_runtime); } @@ -3375,9 +3414,13 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* throttle cfs_rqs exceeding runtime */ check_cfs_rq_runtime(cfs_rq); - check_spread(cfs_rq, prev); + if (schedstat_enabled()) { + check_spread(cfs_rq, prev); + if (prev->on_rq) + update_stats_wait_start(cfs_rq, prev); + } + if (prev->on_rq) { - update_stats_wait_start(cfs_rq, prev); /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 10f16374df7f..1d583870e1a6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1022,6 +1022,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ extern struct static_key_false sched_numa_balancing; +extern struct static_key_false sched_schedstats; static inline u64 global_rt_period(void) { diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index b0fbc7632de5..70b3b6a20fb0 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -29,9 +29,10 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) if (rq) rq->rq_sched_info.run_delay += delta; } -# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) -# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) -# define schedstat_set(var, val) do { var = (val); } while (0) +# define schedstat_enabled() static_branch_unlikely(&sched_schedstats) +# define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0) +# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) +# define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) #else /* !CONFIG_SCHEDSTATS */ static inline void rq_sched_info_arrive(struct rq *rq, unsigned long long delta) @@ -42,6 +43,7 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) static inline void rq_sched_info_depart(struct rq *rq, unsigned long long delta) {} +# define schedstat_enabled() 0 # define schedstat_inc(rq, field) do { } while (0) # define schedstat_add(rq, field, amt) do { } while (0) # define schedstat_set(var, val) do { } while (0) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 97715fd9e790..f5102fabef7f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -350,6 +350,17 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_SCHEDSTATS + { + .procname = "sched_schedstats", + .data = NULL, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_schedstats, + .extra1 = &zero, + .extra2 = &one, + }, +#endif /* CONFIG_SCHEDSTATS */ #endif /* CONFIG_SMP */ #ifdef CONFIG_NUMA_BALANCING { @@ -505,7 +516,7 @@ static struct ctl_table kern_table[] = { .data = &latencytop_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = sysctl_latencytop, }, #endif #ifdef CONFIG_BLK_DEV_INITRD |