diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-05-01 19:12:53 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-05-01 19:12:53 -0700 |
commit | 3527d3e9514f013f361fba29fd71858d9361049d (patch) | |
tree | 6c46190e29a05c66b6efdaa9ba7ab2453c4bb51e | |
parent | 3711c94fd6593318146348c940d81040acf9e877 (diff) | |
parent | 21173d0b4d2a0b9e9e5f3155cf2cfc5781a6f4b1 (diff) |
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
"The main changes in this cycle were:
- another round of rq-clock handling debugging, robustization and
fixes
- PELT accounting improvements
- CPU hotplug related ->cpus_allowed affinity handling fixes all
around the tree
- ... plus misc fixes, cleanups and updates"
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (35 commits)
sched/x86: Update reschedule warning text
crypto: N2 - Replace racy task affinity logic
cpufreq/sparc-us2e: Replace racy task affinity logic
cpufreq/sparc-us3: Replace racy task affinity logic
cpufreq/sh: Replace racy task affinity logic
cpufreq/ia64: Replace racy task affinity logic
ACPI/processor: Replace racy task affinity logic
ACPI/processor: Fix error handling in __acpi_processor_start()
sparc/sysfs: Replace racy task affinity logic
powerpc/smp: Replace open coded task affinity logic
ia64/sn/hwperf: Replace racy task affinity logic
ia64/salinfo: Replace racy task affinity logic
workqueue: Provide work_on_cpu_safe()
ia64/topology: Remove cpus_allowed manipulation
sched/fair: Move the PELT constants into a generated header
sched/fair: Increase PELT accuracy for small tasks
sched/fair: Fix comments
sched/Documentation: Add 'sched-pelt' tool
sched/fair: Fix corner case in __accumulate_sum()
sched/core: Remove 'task' parameter and rename tsk_restore_flags() to current_restore_flags()
...
29 files changed, 847 insertions, 544 deletions
diff --git a/Documentation/scheduler/sched-pelt.c b/Documentation/scheduler/sched-pelt.c new file mode 100644 index 000000000000..e4219139386a --- /dev/null +++ b/Documentation/scheduler/sched-pelt.c @@ -0,0 +1,108 @@ +/* + * The following program is used to generate the constants for + * computing sched averages. + * + * ============================================================== + * C program (compile with -lm) + * ============================================================== + */ + +#include <math.h> +#include <stdio.h> + +#define HALFLIFE 32 +#define SHIFT 32 + +double y; + +void calc_runnable_avg_yN_inv(void) +{ + int i; + unsigned int x; + + printf("static const u32 runnable_avg_yN_inv[] = {"); + for (i = 0; i < HALFLIFE; i++) { + x = ((1UL<<32)-1)*pow(y, i); + + if (i % 6 == 0) printf("\n\t"); + printf("0x%8x, ", x); + } + printf("\n};\n\n"); +} + +int sum = 1024; + +void calc_runnable_avg_yN_sum(void) +{ + int i; + + printf("static const u32 runnable_avg_yN_sum[] = {\n\t 0,"); + for (i = 1; i <= HALFLIFE; i++) { + if (i == 1) + sum *= y; + else + sum = sum*y + 1024*y; + + if (i % 11 == 0) + printf("\n\t"); + + printf("%5d,", sum); + } + printf("\n};\n\n"); +} + +int n = -1; +/* first period */ +long max = 1024; + +void calc_converged_max(void) +{ + long last = 0, y_inv = ((1UL<<32)-1)*y; + + for (; ; n++) { + if (n > -1) + max = ((max*y_inv)>>SHIFT) + 1024; + /* + * This is the same as: + * max = max*y + 1024; + */ + + if (last == max) + break; + + last = max; + } + n--; + printf("#define LOAD_AVG_PERIOD %d\n", HALFLIFE); + printf("#define LOAD_AVG_MAX %ld\n", max); +// printf("#define LOAD_AVG_MAX_N %d\n\n", n); +} + +void calc_accumulated_sum_32(void) +{ + int i, x = sum; + + printf("static const u32 __accumulated_sum_N32[] = {\n\t 0,"); + for (i = 1; i <= n/HALFLIFE+1; i++) { + if (i > 1) + x = x/2 + sum; + + if (i % 6 == 0) + printf("\n\t"); + + printf("%6d,", x); + } + printf("\n};\n\n"); +} + +void main(void) +{ + printf("/* Generated by Documentation/scheduler/sched-pelt; do not modify. */\n\n"); + + y = pow(0.5, 1/(double)HALFLIFE); + + calc_runnable_avg_yN_inv(); +// calc_runnable_avg_yN_sum(); + calc_converged_max(); +// calc_accumulated_sum_32(); +} diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c index d194d5c83d32..63dc9cdc95c5 100644 --- a/arch/ia64/kernel/salinfo.c +++ b/arch/ia64/kernel/salinfo.c @@ -179,14 +179,14 @@ struct salinfo_platform_oemdata_parms { const u8 *efi_guid; u8 **oemdata; u64 *oemdata_size; - int ret; }; -static void +static long salinfo_platform_oemdata_cpu(void *context) { struct salinfo_platform_oemdata_parms *parms = context; - parms->ret = salinfo_platform_oemdata(parms->efi_guid, parms->oemdata, parms->oemdata_size); + + return salinfo_platform_oemdata(parms->efi_guid, parms->oemdata, parms->oemdata_size); } static void @@ -380,16 +380,7 @@ salinfo_log_release(struct inode *inode, struct file *file) return 0; } -static void -call_on_cpu(int cpu, void (*fn)(void *), void *arg) -{ - cpumask_t save_cpus_allowed = current->cpus_allowed; - set_cpus_allowed_ptr(current, cpumask_of(cpu)); - (*fn)(arg); - set_cpus_allowed_ptr(current, &save_cpus_allowed); -} - -static void +static long salinfo_log_read_cpu(void *context) { struct salinfo_data *data = context; @@ -399,6 +390,7 @@ salinfo_log_read_cpu(void *context) /* Clear corrected errors as they are read from SAL */ if (rh->severity == sal_log_severity_corrected) ia64_sal_clear_state_info(data->type); + return 0; } static void @@ -430,7 +422,7 @@ retry: spin_unlock_irqrestore(&data_saved_lock, flags); if (!data->saved_num) - call_on_cpu(cpu, salinfo_log_read_cpu, data); + work_on_cpu_safe(cpu, salinfo_log_read_cpu, data); if (!data->log_size) { data->state = STATE_NO_DATA; cpumask_clear_cpu(cpu, &data->cpu_event); @@ -459,11 +451,13 @@ salinfo_log_read(struct file *file, char __user *buffer, size_t count, loff_t *p return simple_read_from_buffer(buffer, count, ppos, buf, bufsize); } -static void +static long salinfo_log_clear_cpu(void *context) { struct salinfo_data *data = context; + ia64_sal_clear_state_info(data->type); + return 0; } static int @@ -486,7 +480,7 @@ salinfo_log_clear(struct salinfo_data *data, int cpu) rh = (sal_log_record_header_t *)(data->log_buffer); /* Corrected errors have already been cleared from SAL */ if (rh->severity != sal_log_severity_corrected) - call_on_cpu(cpu, salinfo_log_clear_cpu, data); + work_on_cpu_safe(cpu, salinfo_log_clear_cpu, data); /* clearing a record may make a new record visible */ salinfo_log_new_read(cpu, data); if (data->state == STATE_LOG_RECORD) { @@ -531,9 +525,8 @@ salinfo_log_write(struct file *file, const char __user *buffer, size_t count, lo .oemdata = &data->oemdata, .oemdata_size = &data->oemdata_size }; - call_on_cpu(cpu, salinfo_platform_oemdata_cpu, &parms); - if (parms.ret) - count = parms.ret; + count = work_on_cpu_safe(cpu, salinfo_platform_oemdata_cpu, + &parms); } else data->oemdata_size = 0; } else diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c index 1a68f012a6dc..d76529cbff20 100644 --- a/arch/ia64/kernel/topology.c +++ b/arch/ia64/kernel/topology.c @@ -355,18 +355,12 @@ static int cache_add_dev(unsigned int cpu) unsigned long i, j; struct cache_info *this_object; int retval = 0; - cpumask_t oldmask; if (all_cpu_cache_info[cpu].kobj.parent) return 0; - oldmask = current->cpus_allowed; - retval = set_cpus_allowed_ptr(current, cpumask_of(cpu)); - if (unlikely(retval)) - return retval; retval = cpu_cache_sysfs_init(cpu); - set_cpus_allowed_ptr(current, &oldmask); if (unlikely(retval < 0)) return retval; diff --git a/arch/ia64/sn/kernel/sn2/sn_hwperf.c b/arch/ia64/sn/kernel/sn2/sn_hwperf.c index 52704f199dd6..55febd65911a 100644 --- a/arch/ia64/sn/kernel/sn2/sn_hwperf.c +++ b/arch/ia64/sn/kernel/sn2/sn_hwperf.c @@ -598,12 +598,17 @@ static void sn_hwperf_call_sal(void *info) op_info->ret = r; } +static long sn_hwperf_call_sal_work(void *info) +{ + sn_hwperf_call_sal(info); + return 0; +} + static int sn_hwperf_op_cpu(struct sn_hwperf_op_info *op_info) { u32 cpu; u32 use_ipi; int r = 0; - cpumask_t save_allowed; cpu = (op_info->a->arg & SN_HWPERF_ARG_CPU_MASK) >> 32; use_ipi = op_info->a->arg & SN_HWPERF_ARG_USE_IPI_MASK; @@ -629,13 +634,9 @@ static int sn_hwperf_op_cpu(struct sn_hwperf_op_info *op_info) /* use an interprocessor interrupt to call SAL */ smp_call_function_single(cpu, sn_hwperf_call_sal, op_info, 1); - } - else { - /* migrate the task before calling SAL */ - save_allowed = current->cpus_allowed; - set_cpus_allowed_ptr(current, cpumask_of(cpu)); - sn_hwperf_call_sal(op_info); - set_cpus_allowed_ptr(current, &save_allowed); + } else { + /* Call on the target CPU */ + work_on_cpu_safe(cpu, sn_hwperf_call_sal_work, op_info); } } r = op_info->ret; diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 46f89e66a273..d68ed1f004a3 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -787,24 +787,21 @@ static struct sched_domain_topology_level powerpc_topology[] = { { NULL, }, }; -void __init smp_cpus_done(unsigned int max_cpus) +static __init long smp_setup_cpu_workfn(void *data __always_unused) { - cpumask_var_t old_mask; + smp_ops->setup_cpu(boot_cpuid); + return 0; +} - /* We want the setup_cpu() here to be called from CPU 0, but our - * init thread may have been "borrowed" by another CPU in the meantime - * se we pin us down to CPU 0 for a short while +void __init smp_cpus_done(unsigned int max_cpus) +{ + /* + * We want the setup_cpu() here to be called on the boot CPU, but + * init might run on any CPU, so make sure it's invoked on the boot + * CPU. */ - alloc_cpumask_var(&old_mask, GFP_NOWAIT); - cpumask_copy(old_mask, ¤t->cpus_allowed); - set_cpus_allowed_ptr(current, cpumask_of(boot_cpuid)); - if (smp_ops && smp_ops->setup_cpu) - smp_ops->setup_cpu(boot_cpuid); - - set_cpus_allowed_ptr(current, old_mask); - - free_cpumask_var(old_mask); + work_on_cpu_safe(boot_cpuid, smp_setup_cpu_workfn, NULL); if (smp_ops && smp_ops->bringup_done) smp_ops->bringup_done(); @@ -812,7 +809,6 @@ void __init smp_cpus_done(unsigned int max_cpus) dump_numa_cpu_topology(); set_sched_topology(powerpc_topology); - } #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/sparc/kernel/sysfs.c b/arch/sparc/kernel/sysfs.c index d63fc613e7a9..5fd352b759af 100644 --- a/arch/sparc/kernel/sysfs.c +++ b/arch/sparc/kernel/sysfs.c @@ -98,27 +98,7 @@ static struct attribute_group mmu_stat_group = { .name = "mmu_stats", }; -/* XXX convert to rusty's on_one_cpu */ -static unsigned long run_on_cpu(unsigned long cpu, - unsigned long (*func)(unsigned long), - unsigned long arg) -{ - cpumask_t old_affinity; - unsigned long ret; - - cpumask_copy(&old_affinity, ¤t->cpus_allowed); - /* should return -EINVAL to userspace */ - if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) - return 0; - - ret = func(arg); - - set_cpus_allowed_ptr(current, &old_affinity); - - return ret; -} - -static unsigned long read_mmustat_enable(unsigned long junk) +static long read_mmustat_enable(void *data __maybe_unused) { unsigned long ra = 0; @@ -127,11 +107,11 @@ static unsigned long read_mmustat_enable(unsigned long junk) return ra != 0; } -static unsigned long write_mmustat_enable(unsigned long val) +static long write_mmustat_enable(void *data) { - unsigned long ra, orig_ra; + unsigned long ra, orig_ra, *val = data; - if (val) + if (*val) ra = __pa(&per_cpu(mmu_stats, smp_processor_id())); else ra = 0UL; @@ -142,7 +122,8 @@ static unsigned long write_mmustat_enable(unsigned long val) static ssize_t show_mmustat_enable(struct device *s, struct device_attribute *attr, char *buf) { - unsigned long val = run_on_cpu(s->id, read_mmustat_enable, 0); + long val = work_on_cpu(s->id, read_mmustat_enable, NULL); + return sprintf(buf, "%lx\n", val); } @@ -150,13 +131,15 @@ static ssize_t store_mmustat_enable(struct device *s, struct device_attribute *attr, const char *buf, size_t count) { - unsigned long val, err; - int ret = sscanf(buf, "%lu", &val); + unsigned long val; + long err; + int ret; + ret = sscanf(buf, "%lu", &val); if (ret != 1) return -EINVAL; - err = run_on_cpu(s->id, write_mmustat_enable, val); + err = work_on_cpu(s->id, write_mmustat_enable, &val); if (err) return -EIO; diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index d3c66a15bbde..3cab8415389a 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -124,7 +124,7 @@ static bool smp_no_nmi_ipi = false; static void native_smp_send_reschedule(int cpu) { if (unlikely(cpu_is_offline(cpu))) { - WARN_ON(1); + WARN(1, "sched: Unexpected reschedule of offline CPU#%d!\n", cpu); return; } apic->send_IPI(cpu, RESCHEDULE_VECTOR); diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c index 9d5f0c7ed3f7..8697a82bd465 100644 --- a/drivers/acpi/processor_driver.c +++ b/drivers/acpi/processor_driver.c @@ -251,6 +251,9 @@ static int __acpi_processor_start(struct acpi_device *device) if (ACPI_SUCCESS(status)) return 0; + result = -ENODEV; + acpi_pss_perf_exit(pr, device); + err_power_exit: acpi_processor_power_exit(pr); return result; @@ -259,11 +262,16 @@ err_power_exit: static int acpi_processor_start(struct device *dev) { struct acpi_device *device = ACPI_COMPANION(dev); + int ret; if (!device) return -ENODEV; - return __acpi_processor_start(device); + /* Protect against concurrent CPU hotplug operations */ + get_online_cpus(); + ret = __acpi_processor_start(device); + put_online_cpus(); + return ret; } static int acpi_processor_stop(struct device *dev) diff --git a/drivers/acpi/processor_throttling.c b/drivers/acpi/processor_throttling.c index a12f96cc93ff..3de34633f7f9 100644 --- a/drivers/acpi/processor_throttling.c +++ b/drivers/acpi/processor_throttling.c @@ -62,8 +62,8 @@ struct acpi_processor_throttling_arg { #define THROTTLING_POSTCHANGE (2) static int acpi_processor_get_throttling(struct acpi_processor *pr); -int acpi_processor_set_throttling(struct acpi_processor *pr, - int state, bool force); +static int __acpi_processor_set_throttling(struct acpi_processor *pr, + int state, bool force, bool direct); static int acpi_processor_update_tsd_coord(void) { @@ -891,7 +891,8 @@ static int acpi_processor_get_throttling_ptc(struct acpi_processor *pr) ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Invalid throttling state, reset\n")); state = 0; - ret = acpi_processor_set_throttling(pr, state, true); + ret = __acpi_processor_set_throttling(pr, state, true, + true); if (ret) return ret; } @@ -901,36 +902,31 @@ static int acpi_processor_get_throttling_ptc(struct acpi_processor *pr) return 0; } -static int acpi_processor_get_throttling(struct acpi_processor *pr) +static long __acpi_processor_get_throttling(void *data) { - cpumask_var_t saved_mask; - int ret; + struct acpi_processor *pr = data; + + return pr->throttling.acpi_processor_get_throttling(pr); +} +static int acpi_processor_get_throttling(struct acpi_processor *pr) +{ if (!pr) return -EINVAL; if (!pr->flags.throttling) return -ENODEV; - if (!alloc_cpumask_var(&saved_mask, GFP_KERNEL)) - return -ENOMEM; - /* - * Migrate task to the cpu pointed by pr. + * This is either called from the CPU hotplug callback of + * processor_driver or via the ACPI probe function. In the latter + * case the CPU is not guaranteed to be online. Both call sites are + * protected against CPU hotplug. */ - cpumask_copy(saved_mask, ¤t->cpus_allowed); - /* FIXME: use work_on_cpu() */ - if (set_cpus_allowed_ptr(current, cpumask_of(pr->id))) { - /* Can't migrate to the target pr->id CPU. Exit */ - free_cpumask_var(saved_mask); + if (!cpu_online(pr->id)) return -ENODEV; - } - ret = pr->throttling.acpi_processor_get_throttling(pr); - /* restore the previous state */ - set_cpus_allowed_ptr(current, saved_mask); - free_cpumask_var(saved_mask); - return ret; + return work_on_cpu(pr->id, __acpi_processor_get_throttling, pr); } static int acpi_processor_get_fadt_info(struct acpi_processor *pr) @@ -1080,8 +1076,15 @@ static long acpi_processor_throttling_fn(void *data) arg->target_state, arg->force); } -int acpi_processor_set_throttling(struct acpi_processor *pr, - int state, bool force) +static int call_on_cpu(int cpu, long (*fn)(void *), void *arg, bool direct) +{ + if (direct) + return fn(arg); + return work_on_cpu(cpu, fn, arg); +} + +static int __acpi_processor_set_throttling(struct acpi_processor *pr, + int state, bool force, bool direct) { int ret = 0; unsigned int i; @@ -1130,7 +1133,8 @@ int acpi_processor_set_throttling(struct acpi_processor *pr, arg.pr = pr; arg.target_state = state; arg.force = force; - ret = work_on_cpu(pr->id, acpi_processor_throttling_fn, &arg); + ret = call_on_cpu(pr->id, acpi_processor_throttling_fn, &arg, + direct); } else { /* * When the T-state coordination is SW_ALL or HW_ALL, @@ -1163,8 +1167,8 @@ int acpi_processor_set_throttling(struct acpi_processor *pr, arg.pr = match_pr; arg.target_state = state; arg.force = force; - ret = work_on_cpu(pr->id, acpi_processor_throttling_fn, - &arg); + ret = call_on_cpu(pr->id, acpi_processor_throttling_fn, + &arg, direct); } } /* @@ -1182,6 +1186,12 @@ int acpi_processor_set_throttling(struct acpi_processor *pr, return ret; } +int acpi_processor_set_throttling(struct acpi_processor *pr, int state, + bool force) +{ + return __acpi_processor_set_throttling(pr, state, force, false); +} + int acpi_processor_get_throttling_info(struct acpi_processor *pr) { int result = 0; diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index ac376b9b852d..56efb0444b4d 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -381,7 +381,7 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send, *sent += result; } while (msg_data_left(&msg)); - tsk_restore_flags(current, pflags, PF_MEMALLOC); + current_restore_flags(pflags, PF_MEMALLOC); return result; } diff --git a/drivers/cpufreq/ia64-acpi-cpufreq.c b/drivers/cpufreq/ia64-acpi-cpufreq.c index e28a31a40829..a757c0a1e7b5 100644 --- a/drivers/cpufreq/ia64-acpi-cpufreq.c +++ b/drivers/cpufreq/ia64-acpi-cpufreq.c @@ -34,6 +34,11 @@ struct cpufreq_acpi_io { unsigned int resume; }; +struct cpufreq_acpi_req { + unsigned int cpu; + unsigned int state; +}; + static struct cpufreq_acpi_io *acpi_io_data[NR_CPUS]; static struct cpufreq_driver acpi_cpufreq_driver; @@ -83,8 +88,7 @@ processor_get_pstate ( static unsigned extract_clock ( struct cpufreq_acpi_io *data, - unsigned value, - unsigned int cpu) + unsigned value) { unsigned long i; @@ -98,60 +102,43 @@ extract_clock ( } -static unsigned int +static long processor_get_freq ( - struct cpufreq_acpi_io *data, - unsigned int cpu) + void *arg) { - int ret = 0; - u32 value = 0; - cpumask_t saved_mask; - unsigned long clock_freq; + struct cpufreq_acpi_req *req = arg; + unsigned int cpu = req->cpu; + struct cpufreq_acpi_io *data = acpi_io_data[cpu]; + u32 value; + int ret; pr_debug("processor_get_freq\n"); - - saved_mask = current->cpus_allowed; - set_cpus_allowed_ptr(current, cpumask_of(cpu)); if (smp_processor_id() != cpu) - goto migrate_end; + return -EAGAIN; /* processor_get_pstate gets the instantaneous frequency */ ret = processor_get_pstate(&value); - if (ret) { - set_cpus_allowed_ptr(current, &saved_mask); pr_warn("get performance failed with error %d\n", ret); - ret = 0; - goto migrate_end; + return ret; } - clock_freq = extract_clock(data, value, cpu); - ret = (clock_freq*1000); - -migrate_end: - set_cpus_allowed_ptr(current, &saved_mask); - return ret; + return 1000 * extract_clock(data, value); } -static int +static long processor_set_freq ( - struct cpufreq_acpi_io *data, - struct cpufreq_policy *policy, - int state) + void *arg) { - int ret = 0; - u32 value = 0; - cpumask_t saved_mask; - int retval; + struct cpufreq_acpi_req *req = arg; + unsigned int cpu = req->cpu; + struct cpufreq_acpi_io *data = acpi_io_data[cpu]; + int ret, state = req->state; + u32 value; pr_debug("processor_set_freq\n"); - - saved_mask = current->cpus_allowed; - set_cpus_allowed_ptr(current, cpumask_of(policy->cpu)); - if (smp_processor_id() != policy->cpu) { - retval = -EAGAIN; - goto migrate_end; - } + if (smp_processor_id() != cpu) + return -EAGAIN; if (state == data->acpi_data.state) { if (unlikely(data->resume)) { @@ -159,8 +146,7 @@ processor_set_freq ( data->resume = 0; } else { pr_debug("Already at target state (P%d)\n", state); - retval = 0; - goto migrate_end; + return 0; } } @@ -171,7 +157,6 @@ processor_set_freq ( * First we write the target state's 'control' value to the * control_register. */ - value = (u32) data->acpi_data.states[state].control; pr_debug("Transitioning to state: 0x%08x\n", value); @@ -179,17 +164,11 @@ processor_set_freq ( ret = processor_set_pstate(value); if (ret) { pr_warn("Transition failed with error %d\n", ret); - retval = -ENODEV; - goto migrate_end; + return -ENODEV; } data->acpi_data.state = state; - - retval = 0; - -migrate_end: - set_cpus_allowed_ptr(current, &saved_mask); - return (retval); + return 0; } @@ -197,11 +176,13 @@ static unsigned int acpi_cpufreq_get ( unsigned int cpu) { - struct cpufreq_acpi_io *data = acpi_io_data[cpu]; + struct cpufreq_acpi_req req; + long ret; - pr_debug("acpi_cpufreq_get\n"); + req.cpu = cpu; + ret = work_on_cpu(cpu, processor_get_freq, &req); - return processor_get_freq(data, cpu); + return ret > 0 ? (unsigned int) ret : 0; } @@ -210,7 +191,12 @@ acpi_cpufreq_target ( struct cpufreq_policy *policy, unsigned int index) { - return processor_set_freq(acpi_io_data[policy->cpu], policy, index); + struct cpufreq_acpi_req req; + + req.cpu = policy->cpu; + req.state = index; + + return work_on_cpu(req.cpu, processor_set_freq, &req); } static int diff --git a/drivers/cpufreq/sh-cpufreq.c b/drivers/cpufreq/sh-cpufreq.c index 86628e22b2a3..719c3d9f07fb 100644 --- a/drivers/cpufreq/sh-cpufreq.c +++ b/drivers/cpufreq/sh-cpufreq.c @@ -30,54 +30,63 @@ static DEFINE_PER_CPU(struct clk, sh_cpuclk); +struct cpufreq_target { + struct cpufreq_policy *policy; + unsigned int freq; +}; + static unsigned int sh_cpufreq_get(unsigned int cpu) { return (clk_get_rate(&per_cpu(sh_cpuclk, cpu)) + 500) / 1000; } -/* - * Here we notify other drivers of the proposed change and the final change. - */ -static int sh_cpufreq_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) +static long __sh_cpufreq_target(void *arg) { - unsigned int cpu = policy->cpu; + struct cpufreq_target *target = arg; + struct cpufreq_policy *policy = target->policy; + int cpu = policy->cpu; struct clk *cpuclk = &per_cpu(sh_cpuclk, cpu); - cpumask_t cpus_allowed; struct cpufreq_freqs freqs; struct device *dev; long freq; - cpus_allowed = current->cpus_allowed; - set_cpus_allowed_ptr(current, cpumask_of(cpu)); - - BUG_ON(smp_processor_id() != cpu); + if (smp_processor_id() != cpu) + return -ENODEV; dev = get_cpu_device(cpu); /* Convert target_freq from kHz to Hz */ - freq = clk_round_rate(cpuclk, target_freq * 1000); + freq = clk_round_rate(cpuclk, target->freq * 1000); if (freq < (policy->min * 1000) || freq > (policy->max * 1000)) return -EINVAL; - dev_dbg(dev, "requested frequency %u Hz\n", target_freq * 1000); + dev_dbg(dev, "requested frequency %u Hz\n", target->freq * 1000); freqs.old = sh_cpufreq_get(cpu); freqs.new = (freq + 500) / 1000; freqs.flags = 0; - cpufreq_freq_transition_begin(policy, &freqs); - set_cpus_allowed_ptr(current, &cpus_allowed); + cpufreq_freq_transition_begin(target->policy, &freqs); clk_set_rate(cpuclk, freq); - cpufreq_freq_transition_end(policy, &freqs, 0); + cpufreq_freq_transition_end(target->policy, &freqs, 0); dev_dbg(dev, "set frequency %lu Hz\n", freq); - return 0; } +/* + * Here we notify other drivers of the proposed change and the final change. + */ +static int sh_cpufreq_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + struct cpufreq_target data = { .policy = policy, .freq = target_freq }; + + return work_on_cpu(policy->cpu, __sh_cpufreq_target, &data); +} + static int sh_cpufreq_verify(struct cpufreq_policy *policy) { struct clk *cpuclk = &per_cpu(sh_cpuclk, policy->cpu); diff --git a/drivers/cpufreq/sparc-us2e-cpufreq.c b/drivers/cpufreq/sparc-us2e-cpufreq.c index 35ddb6da93aa..90f33efee5fc 100644 --- a/drivers/cpufreq/sparc-us2e-cpufreq.c +++ b/drivers/cpufreq/sparc-us2e-cpufreq.c @@ -118,10 +118,6 @@ static void us2e_transition(unsigned long estar, unsigned long new_bits, unsigned long clock_tick, unsigned long old_divisor, unsigned long divisor) { - unsigned long flags; - - local_irq_save(flags); - estar &= ~ESTAR_MODE_DIV_MASK; /* This is based upon the state transition diagram in the IIe manual. */ @@ -152,8 +148,6 @@ static void us2e_transition(unsigned long estar, unsigned long new_bits, } else { BUG(); } - - local_irq_restore(flags); } static unsigned long index_to_estar_mode(unsigned int index) @@ -229,48 +223,51 @@ static unsigned long estar_to_divisor(unsigned long estar) return ret; } +static void __us2e_freq_get(void *arg) +{ + unsigned long *estar = arg; + + *estar = read_hbreg(HBIRD_ESTAR_MODE_ADDR); +} + static unsigned int us2e_freq_get(unsigned int cpu) { - cpumask_t cpus_allowed; unsigned long clock_tick, estar; - cpumask_copy(&cpus_allowed, ¤t->cpus_allowed); - set_cpus_allowed_ptr(current, cpumask_of(cpu)); - clock_tick = sparc64_get_clock_tick(cpu) / 1000; - estar = read_hbreg(HBIRD_ESTAR_MODE_ADDR); - - set_cpus_allowed_ptr(current, &cpus_allowed); + if (smp_call_function_single(cpu, __us2e_freq_get, &estar, 1)) + return 0; return clock_tick / estar_to_divisor(estar); } -static int us2e_freq_target(struct cpufreq_policy *policy, unsigned int index) +static void __us2e_freq_target(void *arg) { - unsigned int cpu = policy->cpu; + unsigned int cpu = smp_processor_id(); + unsigned int *index = arg; unsigned long new_bits, new_freq; unsigned long clock_tick, divisor, old_divisor, estar; - cpumask_t cpus_allowed; - - cpumask_copy(&cpus_allowed, ¤t->cpus_allowed); - set_cpus_allowed_ptr(current, cpumask_of(cpu)); new_freq = clock_tick = sparc64_get_clock_tick(cpu) / 1000; - new_bits = index_to_estar_mode(index); - divisor = index_to_divisor(index); + new_bits = index_to_estar_mode(*index); + divisor = index_to_divisor(*index); new_freq /= divisor; estar = read_hbreg(HBIRD_ESTAR_MODE_ADDR); old_divisor = estar_to_divisor(estar); - if (old_divisor != divisor) + if (old_divisor != divisor) { us2e_transition(estar, new_bits, clock_tick * 1000, old_divisor, divisor); + } +} - set_cpus_allowed_ptr(current, &cpus_allowed); +static int us2e_freq_target(struct cpufreq_policy *policy, unsigned int index) +{ + unsigned int cpu = policy->cpu; - return 0; + return smp_call_function_single(cpu, __us2e_freq_target, &index, 1); } static int __init us2e_freq_cpu_init(struct cpufreq_policy *policy) diff --git a/drivers/cpufreq/sparc-us3-cpufreq.c b/drivers/cpufreq/sparc-us3-cpufreq.c index a8d86a449ca1..30645b0118f9 100644 --- a/drivers/cpufreq/sparc-us3-cpufreq.c +++ b/drivers/cpufreq/sparc-us3-cpufreq.c @@ -35,22 +35,28 @@ static struct us3_freq_percpu_info *us3_freq_table; #define SAFARI_CFG_DIV_32 0x0000000080000000UL #define SAFARI_CFG_DIV_MASK 0x00000000C0000000UL -static unsigned long read_safari_cfg(void) +static void read_safari_cfg(void *arg) { - unsigned long ret; + unsigned long ret, *val = arg; __asm__ __volatile__("ldxa [%%g0] %1, %0" : "=&r" (ret) : "i" (ASI_SAFARI_CONFIG)); - return ret; + *val = ret; } -static void write_safari_cfg(unsigned long val) +static void update_safari_cfg(void *arg) { + unsigned long reg, *new_bits = arg; + + read_safari_cfg(®); + reg &= ~SAFARI_CFG_DIV_MASK; + reg |= *new_bits; + __asm__ __volatile__("stxa %0, [%%g0] %1\n\t" "membar #Sync" : /* no outputs */ - : "r" (val), "i" (ASI_SAFARI_CONFIG) + : "r" (reg), "i" (ASI_SAFARI_CONFIG) : "memory"); } @@ -78,29 +84,17 @@ static unsigned long get_current_freq(unsigned int cpu, unsigned long safari_cfg static unsigned int us3_freq_get(unsigned int cpu) { - cpumask_t cpus_allowed; unsigned long reg; - unsigned int ret; - - cpumask_copy(&cpus_allowed, ¤t->cpus_allowed); - set_cpus_allowed_ptr(current, cpumask_of(cpu)); - - reg = read_safari_cfg(); - ret = get_current_freq(cpu, reg); - - set_cpus_allowed_ptr(current, &cpus_allowed); - return ret; + if (smp_call_function_single(cpu, read_safari_cfg, ®, 1)) + return 0; + return get_current_freq(cpu, reg); } static int us3_freq_target(struct cpufreq_policy *policy, unsigned int index) { unsigned int cpu = policy->cpu; - unsigned long new_bits, new_freq, reg; - cpumask_t cpus_allowed; - - cpumask_copy(&cpus_allowed, ¤t->cpus_allowed); - set_cpus_allowed_ptr(current, cpumask_of(cpu)); + unsigned long new_bits, new_freq; new_freq = sparc64_get_clock_tick(cpu) / 1000; switch (index) { @@ -121,15 +115,7 @@ static int us3_freq_target(struct cpufreq_policy *policy, unsigned int index) BUG(); } - reg = read_safari_cfg(); - - reg &= ~SAFARI_CFG_DIV_MASK; - reg |= new_bits; - write_safari_cfg(reg); - - set_cpus_allowed_ptr(current, &cpus_allowed); - - return 0; + return smp_call_function_single(cpu, update_safari_cfg, &new_bits, 1); } static int __init us3_freq_cpu_init(struct cpufreq_policy *policy) diff --git a/drivers/crypto/n2_core.c b/drivers/crypto/n2_core.c index c5aac25a5738..4ecb77aa60e1 100644 --- a/drivers/crypto/n2_core.c +++ b/drivers/crypto/n2_core.c @@ -65,6 +65,11 @@ struct spu_queue { struct list_head list; }; +struct spu_qreg { + struct spu_queue *queue; + unsigned long type; +}; + static struct spu_queue **cpu_to_cwq; static struct spu_queue **cpu_to_mau; @@ -1631,31 +1636,27 @@ static void queue_cache_destroy(void) kmem_cache_destroy(queue_cache[HV_NCS_QTYPE_CWQ - 1]); } -static int spu_queue_register(struct spu_queue *p, unsigned long q_type) +static long spu_queue_register_workfn(void *arg) { - cpumask_var_t old_allowed; + struct spu_qreg *qr = arg; + struct spu_queue *p = qr->queue; + unsigned long q_type = qr->type; unsigned long hv_ret; - if (cpumask_empty(&p->sharing)) - return -EINVAL; - - if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL)) - return -ENOMEM; - - cpumask_copy(old_allowed, ¤t->cpus_allowed); - - set_cpus_allowed_ptr(current, &p->sharing); - hv_ret = sun4v_ncs_qconf(q_type, __pa(p->q), CWQ_NUM_ENTRIES, &p->qhandle); if (!hv_ret) sun4v_ncs_sethead_marker(p->qhandle, 0); - set_cpus_allowed_ptr(current, old_allowed); + return hv_ret ? -EINVAL : 0; +} - free_cpumask_var(old_allowed); +static int spu_queue_register(struct spu_queue *p, unsigned long q_type) +{ + int cpu = cpumask_any_and(&p->sharing, cpu_online_mask); + struct spu_qreg qr = { .queue = p, .type = q_type }; - return (hv_ret ? -EINVAL : 0); + return work_on_cpu_safe(cpu, spu_queue_register_workfn, &qr); } static int spu_queue_setup(struct spu_queue *p) diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c index 4228aba1f654..bbea8eac9abb 100644 --- a/drivers/scsi/iscsi_tcp.c +++ b/drivers/scsi/iscsi_tcp.c @@ -387,7 +387,7 @@ static int iscsi_sw_tcp_pdu_xmit(struct iscsi_task *task) rc = 0; } - tsk_restore_flags(current, pflags, PF_MEMALLOC); + current_restore_flags(pflags, PF_MEMALLOC); return rc; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 19d50f600e8d..9aaf6ca77569 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1004,7 +1004,7 @@ out_nfserr: else err = nfserrno(host_err); if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) - tsk_restore_flags(current, pflags, PF_LESS_THROTTLE); + current_restore_flags(pflags, PF_LESS_THROTTLE); return err; } diff --git a/include/linux/sched.h b/include/linux/sched.h index 4cf9a59a4d08..b9a421f1f76f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1290,10 +1290,10 @@ TASK_PFA_TEST(LMK_WAITING, lmk_waiting) TASK_PFA_SET(LMK_WAITING, lmk_waiting) static inline void -tsk_restore_flags(struct task_struct *task, unsigned long orig_flags, unsigned long flags) +current_restore_flags(unsigned long orig_flags, unsigned long flags) { - task->flags &= ~flags; - task->flags |= orig_flags & flags; + current->flags &= ~flags; + current->flags |= orig_flags & flags; } extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index bde063cefd04..c102ef65cb64 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -608,8 +608,13 @@ static inline long work_on_cpu(int cpu, long (*fn)(void *), void *arg) { return fn(arg); } +static inline long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg) +{ + return fn(arg); +} #else long work_on_cpu(int cpu, long (*fn)(void *), void *arg); +long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg); #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 430b0460db89..d16db7067d6a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -86,21 +86,6 @@ int sysctl_sched_rt_runtime = 950000; cpumask_var_t cpu_isolated_map; /* - * this_rq_lock - lock this runqueue and disable interrupts. - */ -static struct rq *this_rq_lock(void) - __acquires(rq->lock) -{ - struct rq *rq; - - local_irq_disable(); - rq = this_rq(); - raw_spin_lock(&rq->lock); - - return rq; -} - -/* * __task_rq_lock - lock the rq @p resides on. */ struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) @@ -233,8 +218,11 @@ void update_rq_clock(struct rq *rq) return; #ifdef CONFIG_SCHED_DEBUG + if (sched_feat(WARN_DOUBLE_CLOCK)) + SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED); rq->clock_update_flags |= RQCF_UPDATED; #endif + delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; if (delta < 0) return; @@ -261,13 +249,14 @@ static void hrtick_clear(struct rq *rq) static enum hrtimer_restart hrtick(struct hrtimer *timer) { struct rq *rq = container_of(timer, struct rq, hrtick_timer); + struct rq_flags rf; WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); update_rq_clock(rq); rq->curr->sched_class->task_tick(rq, rq->curr, 1); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); return HRTIMER_NORESTART; } @@ -287,11 +276,12 @@ static void __hrtick_restart(struct rq *rq) static void __hrtick_start(void *arg) { struct rq *rq = arg; + struct rq_flags rf; - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); __hrtick_restart(rq); rq->hrtick_csd_pending = 0; - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); } /* @@ -762,17 +752,23 @@ static void set_load_weight(struct task_struct *p) static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { - update_rq_clock(rq); + if (!(flags & ENQUEUE_NOCLOCK)) + update_rq_clock(rq); + if (!(flags & ENQUEUE_RESTORE)) sched_info_queued(rq, p); + p->sched_class->enqueue_task(rq, p, flags); } static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { - update_rq_clock(rq); + if (!(flags & DEQUEUE_NOCLOCK)) + update_rq_clock(rq); + if (!(flags & DEQUEUE_SAVE)) sched_info_dequeued(rq, p); + p->sched_class->dequeue_task(rq, p, flags); } @@ -946,18 +942,19 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) * * Returns (locked) new rq. Old rq's lock is released. */ -static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu) +static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, + struct task_struct *p, int new_cpu) { lockdep_assert_held(&rq->lock); p->on_rq = TASK_ON_RQ_MIGRATING; - dequeue_task(rq, p, 0); + dequeue_task(rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, new_cpu); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, rf); rq = cpu_rq(new_cpu); - raw_spin_lock(&rq->lock); + rq_lock(rq, rf); BUG_ON(task_cpu(p) != new_cpu); enqueue_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; @@ -980,7 +977,8 @@ struct migration_arg { * So we race with normal scheduler movements, but that's OK, as long * as the task is no longer on this CPU. */ -static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu) +static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, + struct task_struct *p, int dest_cpu) { if (unlikely(!cpu_active(dest_cpu))) return rq; @@ -989,7 +987,8 @@ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_ if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) return rq; - rq = move_queued_task(rq, p, dest_cpu); + update_rq_clock(rq); + rq = move_queued_task(rq, rf, p, dest_cpu); return rq; } @@ -1004,6 +1003,7 @@ static int migration_cpu_stop(void *data) struct migration_arg *arg = data; struct task_struct *p = arg->task; struct rq *rq = this_rq(); + struct rq_flags rf; /* * The original target CPU might have gone down and we might @@ -1018,7 +1018,7 @@ static int migration_cpu_stop(void *data) sched_ttwu_pending(); raw_spin_lock(&p->pi_lock); - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); /* * If task_rq(p) != rq, it cannot be migrated here, because we're * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because @@ -1026,11 +1026,11 @@ static int migration_cpu_stop(void *data) */ if (task_rq(p) == rq) { if (task_on_rq_queued(p)) - rq = __migrate_task(rq, p, arg->dest_cpu); + rq = __migrate_task(rq, &rf, p, arg->dest_cpu); else p->wake_cpu = arg->dest_cpu; } - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); raw_spin_unlock(&p->pi_lock); local_irq_enable(); @@ -1063,7 +1063,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) * holding rq->lock. */ lockdep_assert_held(&rq->lock); - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); } if (running) put_prev_task(rq, p); @@ -1071,7 +1071,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) p->sched_class->set_cpus_allowed(p, new_mask); if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE); + enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); if (running) set_curr_task(rq, p); } @@ -1150,9 +1150,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, * OK, since we're going to drop the lock immediately * afterwards anyway. */ - rq_unpin_lock(rq, &rf); - rq = move_queued_task(rq, p, dest_cpu); - rq_repin_lock(rq, &rf); + rq = move_queued_task(rq, &rf, p, dest_cpu); } out: task_rq_unlock(rq, p, &rf); @@ -1217,16 +1215,24 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) { if (task_on_rq_queued(p)) { struct rq *src_rq, *dst_rq; + struct rq_flags srf, drf; src_rq = task_rq(p); dst_rq = cpu_rq(cpu); + rq_pin_lock(src_rq, &srf); + rq_pin_lock(dst_rq, &drf); + p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(src_rq, p, 0); set_task_cpu(p, cpu); activate_task(dst_rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(dst_rq, p, 0); + + rq_unpin_lock(dst_rq, &drf); + rq_unpin_lock(src_rq, &srf); + } else { /* * Task isn't running anymore; make it appear like we migrated @@ -1680,7 +1686,7 @@ static void ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, struct rq_flags *rf) { - int en_flags = ENQUEUE_WAKEUP; + int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; lockdep_assert_held(&rq->lock); @@ -1726,14 +1732,13 @@ void sched_ttwu_pending(void) struct rq *rq = this_rq(); struct llist_node *llist = llist_del_all(&rq->wake_list); struct task_struct *p; - unsigned long flags; struct rq_flags rf; if (!llist) return; - raw_spin_lock_irqsave(&rq->lock, flags); - rq_pin_lock(rq, &rf); + rq_lock_irqsave(rq, &rf); + update_rq_clock(rq); while (llist) { int wake_flags = 0; @@ -1747,8 +1752,7 @@ void sched_ttwu_pending(void) ttwu_do_activate(rq, p, wake_flags, &rf); } - rq_unpin_lock(rq, &rf); - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); } void scheduler_ipi(void) @@ -1806,7 +1810,7 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) void wake_up_if_idle(int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long flags; + struct rq_flags rf; rcu_read_lock(); @@ -1816,11 +1820,11 @@ void wake_up_if_idle(int cpu) if (set_nr_if_polling(rq->idle)) { trace_sched_wake_idle_without_ipi(cpu); } else { - raw_spin_lock_irqsave(&rq->lock, flags); + rq_lock_irqsave(rq, &rf); if (is_idle_task(rq->curr)) smp_send_reschedule(cpu); /* Else CPU is not idle, do nothing here: */ - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); } out: @@ -1846,11 +1850,10 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) } #endif - raw_spin_lock(&rq->lock); - rq_pin_lock(rq, &rf); + rq_lock(rq, &rf); + update_rq_clock(rq); ttwu_do_activate(rq, p, wake_flags, &rf); - rq_unpin_lock(rq, &rf); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); } /* @@ -2097,11 +2100,9 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) * disabled avoiding further scheduler activity on it and we've * not yet picked a replacement task. */ - rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, rf); raw_spin_lock(&p->pi_lock); - raw_spin_lock(&rq->lock); - rq_repin_lock(rq, rf); + rq_relock(rq, rf); } if (!(p->state & TASK_NORMAL)) @@ -2114,7 +2115,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) delayacct_blkio_end(); atomic_dec(&rq->nr_iowait); } - ttwu_activate(rq, p, ENQUEUE_WAKEUP); + ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK); } ttwu_do_wakeup(rq, p, 0, rf); @@ -2555,7 +2556,7 @@ void wake_up_new_task(struct task_struct *p) update_rq_clock(rq); post_init_entity_util_avg(&p->se); - activate_task(rq, p, 0); + activate_task(rq, p, ENQUEUE_NOCLOCK); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); @@ -3093,15 +3094,18 @@ void scheduler_tick(void) int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; + struct rq_flags rf; sched_clock_tick(); - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); + update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); cpu_load_update_active(rq); calc_global_load_tick(rq); - raw_spin_unlock(&rq->lock); + + rq_unlock(rq, &rf); perf_event_task_tick(); @@ -3386,18 +3390,18 @@ static void __sched notrace __schedule(bool preempt) * done by the caller to avoid the race with signal_wake_up(). */ smp_mb__before_spinlock(); - raw_spin_lock(&rq->lock); - rq_pin_lock(rq, &rf); + rq_lock(rq, &rf); /* Promote REQ to ACT */ rq->clock_update_flags <<= 1; + update_rq_clock(rq); switch_count = &prev->nivcsw; if (!preempt && prev->state) { if (unlikely(signal_pending_state(prev->state, prev))) { prev->state = TASK_RUNNING; } else { - deactivate_task(rq, prev, DEQUEUE_SLEEP); + deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); prev->on_rq = 0; if (prev->in_iowait) { @@ -3421,9 +3425,6 @@ static void __sched notrace __schedule(bool preempt) switch_count = &prev->nvcsw; } - if (task_on_rq_queued(prev)) - update_rq_clock(rq); - next = pick_next_task(rq, prev, &rf); clear_tsk_need_resched(prev); clear_preempt_need_resched(); @@ -3439,8 +3440,7 @@ static void __sched notrace __schedule(bool preempt) rq = context_switch(rq, prev, next, &rf); } else { rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); - rq_unpin_lock(rq, &rf); - raw_spin_unlock_irq(&rq->lock); + rq_unlock_irq(rq, &rf); } balance_callback(rq); @@ -3684,7 +3684,8 @@ EXPORT_SYMBOL(default_wake_function); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE; + int oldprio, queued, running, queue_flag = + DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; const struct sched_class *prev_class; struct rq_flags rf; struct rq *rq; @@ -3805,7 +3806,7 @@ void set_user_nice(struct task_struct *p, long nice) queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); if (running) put_prev_task(rq, p); @@ -3816,7 +3817,7 @@ void set_user_nice(struct task_struct *p, long nice) delta = p->prio - old_prio; if (queued) { - enqueue_task(rq, p, ENQUEUE_RESTORE); + enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -4126,7 +4127,7 @@ static int __sched_setscheduler(struct task_struct *p, const struct sched_class *prev_class; struct rq_flags rf; int reset_on_fork; - int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; + int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; struct rq *rq; /* May grab non-irq protected spin_locks: */ @@ -4923,7 +4924,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, */ SYSCALL_DEFINE0(sched_yield) { - struct rq *rq = this_rq_lock(); + struct rq_flags rf; + struct rq *rq; + + local_irq_disable(); + rq = this_rq(); + rq_lock(rq, &rf); schedstat_inc(rq->yld_count); current->sched_class->yield_task(rq); @@ -4932,9 +4938,8 @@ SYSCALL_DEFINE0(sched_yield) * Since we are going to call schedule() anyway, there's * no need to preempt or enable interrupts: */ - __release(rq->lock); - spin_release(&rq->lock.dep_map, 1, _THIS_IP_); - do_raw_spin_unlock(&rq->lock); + preempt_disable(); + rq_unlock(rq, &rf); sched_preempt_enable_no_resched(); schedule(); @@ -5514,7 +5519,7 @@ void sched_setnuma(struct task_struct *p, int nid) p->numa_preferred_nid = nid; if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE); + enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); if (running) set_curr_task(rq, p); task_rq_unlock(rq, p, &rf); @@ -5579,11 +5584,11 @@ static struct task_struct fake_task = { * there's no concurrency possible, we hold the required locks anyway * because of lock validation efforts. */ -static void migrate_tasks(struct rq *dead_rq) +static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) { struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; - struct rq_flags rf; + struct rq_flags orf = *rf; int dest_cpu; /* @@ -5602,9 +5607,7 @@ static void migrate_tasks(struct rq *dead_rq) * class method both need to have an up-to-date * value of rq->clock[_task] */ - rq_pin_lock(rq, &rf); update_rq_clock(rq); - rq_unpin_lock(rq, &rf); for (;;) { /* @@ -5617,8 +5620,7 @@ static void migrate_tasks(struct rq *dead_rq) /* * pick_next_task() assumes pinned rq->lock: */ - rq_repin_lock(rq, &rf); - next = pick_next_task(rq, &fake_task, &rf); + next = pick_next_task(rq, &fake_task, rf); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); @@ -5631,10 +5633,9 @@ static void migrate_tasks(struct rq *dead_rq) * because !cpu_active at this point, which means load-balance * will not interfere. Also, stop-machine. */ - rq_unpin_lock(rq, &rf); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, rf); raw_spin_lock(&next->pi_lock); - raw_spin_lock(&rq->lock); + rq_relock(rq, rf); /* * Since we're inside stop-machine, _nothing_ should have @@ -5648,12 +5649,12 @@ static void migrate_tasks(struct rq *dead_rq) /* Find suitable destination for @next, with force if needed. */ dest_cpu = select_fallback_rq(dead_rq->cpu, next); - - rq = __migrate_task(rq, next, dest_cpu); + rq = __migrate_task(rq, rf, next, dest_cpu); if (rq != dead_rq) { - raw_spin_unlock(&rq->lock); + rq_unlock(rq, rf); rq = dead_rq; - raw_spin_lock(&rq->lock); + *rf = orf; + rq_relock(rq, rf); } raw_spin_unlock(&next->pi_lock); } @@ -5766,7 +5767,7 @@ static int cpuset_cpu_inactive(unsigned int cpu) int sched_cpu_activate(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long flags; + struct rq_flags rf; set_cpu_active(cpu, true); @@ -5784,12 +5785,12 @@ int sched_cpu_activate(unsigned int cpu) * 2) At runtime, if cpuset_cpu_active() fails to rebuild the * domains. */ - raw_spin_lock_irqsave(&rq->lock, flags); + rq_lock_irqsave(rq, &rf); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_online(rq); } - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); update_max_interval(); @@ -5847,18 +5848,20 @@ int sched_cpu_starting(unsigned int cpu) int sched_cpu_dying(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long flags; + struct rq_flags rf; /* Handle pending wakeups and then migrate everything off */ sched_ttwu_pending(); - raw_spin_lock_irqsave(&rq->lock, flags); + + rq_lock_irqsave(rq, &rf); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } - migrate_tasks(rq); + migrate_tasks(rq, &rf); BUG_ON(rq->nr_running != 1); - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); + calc_load_migrate(rq); update_max_interval(); nohz_balance_exit_idle(cpu); @@ -6412,7 +6415,8 @@ static void sched_change_group(struct task_struct *tsk, int type) */ void sched_move_task(struct task_struct *tsk) { - int queued, running; + int queued, running, queue_flags = + DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; struct rq_flags rf; struct rq *rq; @@ -6423,14 +6427,14 @@ void sched_move_task(struct task_struct *tsk) queued = task_on_rq_queued(tsk); if (queued) - dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); + dequeue_task(rq, tsk, queue_flags); if (running) put_prev_task(rq, tsk); sched_change_group(tsk, TASK_MOVE_GROUP); if (queued) - enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); + enqueue_task(rq, tsk, queue_flags); if (running) set_curr_task(rq, tsk); @@ -7008,14 +7012,15 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) for_each_online_cpu(i) { struct cfs_rq *cfs_rq = tg->cfs_rq[i]; struct rq *rq = cfs_rq->rq; + struct rq_flags rf; - raw_spin_lock_irq(&rq->lock); + rq_lock_irq(rq, &rf); cfs_rq->runtime_enabled = runtime_enabled; cfs_rq->runtime_remaining = 0; if (cfs_rq->throttled) unthrottle_cfs_rq(cfs_rq); - raw_spin_unlock_irq(&rq->lock); + rq_unlock_irq(rq, &rf); } if (runtime_was_enabled && !runtime_enabled) cfs_bandwidth_usage_dec(); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dea138964b91..a903276fcb62 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -717,18 +717,12 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_SMP + +#include "sched-pelt.h" + static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); -/* - * We choose a half-life close to 1 scheduling period. - * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are - * dependent on this value. - */ -#define LOAD_AVG_PERIOD 32 -#define LOAD_AVG_MAX 47742 /* maximum possible load avg */ -#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */ - /* Give new sched_entity start runnable values to heavy its load in infant time */ void init_entity_runnable_average(struct sched_entity *se) { @@ -2733,47 +2727,15 @@ static inline void update_cfs_shares(struct sched_entity *se) #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_SMP -/* Precomputed fixed inverse multiplies for multiplication by y^n */ -static const u32 runnable_avg_yN_inv[] = { - 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, - 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, - 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, - 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9, - 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80, - 0x85aac367, 0x82cd8698, -}; - -/* - * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent - * over-estimates when re-combining. - */ -static const u32 runnable_avg_yN_sum[] = { - 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103, - 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082, - 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371, -}; - -/* - * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to - * lower integers. See Documentation/scheduler/sched-avg.txt how these - * were generated: - */ -static const u32 __accumulated_sum_N32[] = { - 0, 23371, 35056, 40899, 43820, 45281, - 46011, 46376, 46559, 46650, 46696, 46719, -}; - /* * Approximate: * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) */ -static __always_inline u64 decay_load(u64 val, u64 n) +static u64 decay_load(u64 val, u64 n) { unsigned int local_n; - if (!n) - return val; - else if (unlikely(n > LOAD_AVG_PERIOD * 63)) + if (unlikely(n > LOAD_AVG_PERIOD * 63)) return 0; /* after bounds checking we can collapse to 32-bit */ @@ -2795,30 +2757,97 @@ static __always_inline u64 decay_load(u64 val, u64 n) return val; } +static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) +{ + u32 c1, c2, c3 = d3; /* y^0 == 1 */ + + /* + * c1 = d1 y^p + */ + c1 = decay_load((u64)d1, periods); + + /* + * p-1 + * c2 = 1024 \Sum y^n + * n=1 + * + * inf inf + * = 1024 ( \Sum y^n - \Sum y^n - y^0 ) + * n=0 n=p + */ + c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024; + + return c1 + c2 + c3; +} + +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) + /* - * For updates fully spanning n periods, the contribution to runnable - * average will be: \Sum 1024*y^n + * Accumulate the three separate parts of the sum; d1 the remainder + * of the last (incomplete) period, d2 the span of full periods and d3 + * the remainder of the (incomplete) current period. + * + * d1 d2 d3 + * ^ ^ ^ + * | | | + * |<->|<----------------->|<--->| + * ... |---x---|------| ... |------|-----x (now) + * + * p-1 + * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0 + * n=1 * - * We can compute this reasonably efficiently by combining: - * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD} + * = u y^p + (Step 1) + * + * p-1 + * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2) + * n=1 */ -static u32 __compute_runnable_contrib(u64 n) +static __always_inline u32 +accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, + unsigned long weight, int running, struct cfs_rq *cfs_rq) { - u32 contrib = 0; + unsigned long scale_freq, scale_cpu; + u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ + u64 periods; - if (likely(n <= LOAD_AVG_PERIOD)) - return runnable_avg_yN_sum[n]; - else if (unlikely(n >= LOAD_AVG_MAX_N)) - return LOAD_AVG_MAX; + scale_freq = arch_scale_freq_capacity(NULL, cpu); + scale_cpu = arch_scale_cpu_capacity(NULL, cpu); - /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */ - contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD]; - n %= LOAD_AVG_PERIOD; - contrib = decay_load(contrib, n); - return contrib + runnable_avg_yN_sum[n]; -} + delta += sa->period_contrib; + periods = delta / 1024; /* A period is 1024us (~1ms) */ -#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) + /* + * Step 1: decay old *_sum if we crossed period boundaries. + */ + if (periods) { + sa->load_sum = decay_load(sa->load_sum, periods); + if (cfs_rq) { + cfs_rq->runnable_load_sum = + decay_load(cfs_rq->runnable_load_sum, periods); + } + sa->util_sum = decay_load((u64)(sa->util_sum), periods); + + /* + * Step 2 + */ + delta %= 1024; + contrib = __accumulate_pelt_segments(periods, + 1024 - sa->period_contrib, delta); + } + sa->period_contrib = delta; + + contrib = cap_scale(contrib, scale_freq); + if (weight) { + sa->load_sum += weight * contrib; + if (cfs_rq) + cfs_rq->runnable_load_sum += weight * contrib; + } + if (running) + sa->util_sum += contrib * scale_cpu; + + return periods; +} /* * We can represent the historical contribution to runnable average as the @@ -2849,13 +2878,10 @@ static u32 __compute_runnable_contrib(u64 n) * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] */ static __always_inline int -__update_load_avg(u64 now, int cpu, struct sched_avg *sa, +___update_load_avg(u64 now, int cpu, struct sched_avg *sa, unsigned long weight, int running, struct cfs_rq *cfs_rq) { - u64 delta, scaled_delta, periods; - u32 contrib; - unsigned int delta_w, scaled_delta_w, decayed = 0; - unsigned long scale_freq, scale_cpu; + u64 delta; delta = now - sa->last_update_time; /* @@ -2874,83 +2900,52 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, delta >>= 10; if (!delta) return 0; - sa->last_update_time = now; - - scale_freq = arch_scale_freq_capacity(NULL, cpu); - scale_cpu = arch_scale_cpu_capacity(NULL, cpu); - - /* delta_w is the amount already accumulated against our next period */ - delta_w = sa->period_contrib; - if (delta + delta_w >= 1024) { - decayed = 1; - /* how much left for next period will start over, we don't know yet */ - sa->period_contrib = 0; + sa->last_update_time += delta << 10; - /* - * Now that we know we're crossing a period boundary, figure - * out how much from delta we need to complete the current - * period and accrue it. - */ - delta_w = 1024 - delta_w; - scaled_delta_w = cap_scale(delta_w, scale_freq); - if (weight) { - sa->load_sum += weight * scaled_delta_w; - if (cfs_rq) { - cfs_rq->runnable_load_sum += - weight * scaled_delta_w; - } - } - if (running) - sa->util_sum += scaled_delta_w * scale_cpu; - - delta -= delta_w; - - /* Figure out how many additional periods this update spans */ - periods = delta / 1024; - delta %= 1024; + /* + * Now we know we crossed measurement unit boundaries. The *_avg + * accrues by two steps: + * + * Step 1: accumulate *_sum since last_update_time. If we haven't + * crossed period boundaries, finish. + */ + if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq)) + return 0; - sa->load_sum = decay_load(sa->load_sum, periods + 1); - if (cfs_rq) { - cfs_rq->runnable_load_sum = - decay_load(cfs_rq->runnable_load_sum, periods + 1); - } - sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1); - - /* Efficiently calculate \sum (1..n_period) 1024*y^i */ - contrib = __compute_runnable_contrib(periods); - contrib = cap_scale(contrib, scale_freq); - if (weight) { - sa->load_sum += weight * contrib; - if (cfs_rq) - cfs_rq->runnable_load_sum += weight * contrib; - } - if (running) - sa->util_sum += contrib * scale_cpu; + /* + * Step 2: update *_avg. + */ + sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); + if (cfs_rq) { + cfs_rq->runnable_load_avg = + div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); } + sa->util_avg = sa->util_sum / LOAD_AVG_MAX; - /* Remainder of delta accrued against u_0` */ - scaled_delta = cap_scale(delta, scale_freq); - if (weight) { - sa->load_sum += weight * scaled_delta; - if (cfs_rq) - cfs_rq->runnable_load_sum += weight * scaled_delta; - } - if (running) - sa->util_sum += scaled_delta * scale_cpu; + return 1; +} - sa->period_contrib += delta; +static int +__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) +{ + return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL); +} - if (decayed) { - sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); - if (cfs_rq) { - cfs_rq->runnable_load_avg = - div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); - } - sa->util_avg = sa->util_sum / LOAD_AVG_MAX; - } +static int +__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + return ___update_load_avg(now, cpu, &se->avg, + se->on_rq * scale_load_down(se->load.weight), + cfs_rq->curr == se, NULL); +} - return decayed; +static int +__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) +{ + return ___update_load_avg(now, cpu, &cfs_rq->avg, + scale_load_down(cfs_rq->load.weight), + cfs_rq->curr != NULL, cfs_rq); } /* @@ -3014,6 +3009,9 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next) { + u64 p_last_update_time; + u64 n_last_update_time; + if (!sched_feat(ATTACH_AGE_LOAD)) return; @@ -3024,11 +3022,11 @@ void set_task_rq_fair(struct sched_entity *se, * time. This will result in the wakee task is less decayed, but giving * the wakee more load sounds not bad. */ - if (se->avg.last_update_time && prev) { - u64 p_last_update_time; - u64 n_last_update_time; + if (!(se->avg.last_update_time && prev)) + return; #ifndef CONFIG_64BIT + { u64 p_last_update_time_copy; u64 n_last_update_time_copy; @@ -3043,14 +3041,13 @@ void set_task_rq_fair(struct sched_entity *se, } while (p_last_update_time != p_last_update_time_copy || n_last_update_time != n_last_update_time_copy); + } #else - p_last_update_time = prev->avg.last_update_time; - n_last_update_time = next->avg.last_update_time; + p_last_update_time = prev->avg.last_update_time; + n_last_update_time = next->avg.last_update_time; #endif - __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)), - &se->avg, 0, 0, NULL); - se->avg.last_update_time = n_last_update_time; - } + __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se); + se->avg.last_update_time = n_last_update_time; } /* Take into account change of utilization of a child task group */ @@ -3173,6 +3170,36 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) return 1; } +/* + * Check if we need to update the load and the utilization of a blocked + * group_entity: + */ +static inline bool skip_blocked_update(struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + + /* + * If sched_entity still have not zero load or utilization, we have to + * decay it: + */ + if (se->avg.load_avg || se->avg.util_avg) + return false; + + /* + * If there is a pending propagation, we have to update the load and + * the utilization of the sched_entity: + */ + if (gcfs_rq->propagate_avg) + return false; + + /* + * Otherwise, the load and the utilization of the sched_entity is + * already zero and there is no pending propagation, so it will be a + * waste of time to try to decay it: + */ + return true; +} + #else /* CONFIG_FAIR_GROUP_SCHED */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} @@ -3265,8 +3292,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) set_tg_cfs_propagate(cfs_rq); } - decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, - scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq); + decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq); #ifndef CONFIG_64BIT smp_wmb(); @@ -3298,11 +3324,8 @@ static inline void update_load_avg(struct sched_entity *se, int flags) * Track task load average for carrying it to new CPU after migrated, and * track group sched_entity load average for task_h_load calc in migration */ - if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) { - __update_load_avg(now, cpu, &se->avg, - se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); - } + if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) + __update_load_avg_se(now, cpu, cfs_rq, se); decayed = update_cfs_rq_load_avg(now, cfs_rq, true); decayed |= propagate_entity_load_avg(se); @@ -3407,7 +3430,7 @@ void sync_entity_load_avg(struct sched_entity *se) u64 last_update_time; last_update_time = cfs_rq_last_update_time(cfs_rq); - __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); + __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se); } /* @@ -4271,8 +4294,9 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, throttled_list) { struct rq *rq = rq_of(cfs_rq); + struct rq_flags rf; - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); if (!cfs_rq_throttled(cfs_rq)) goto next; @@ -4289,7 +4313,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, unthrottle_cfs_rq(cfs_rq); next: - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); if (!remaining) break; @@ -5097,15 +5121,16 @@ void cpu_load_update_nohz_stop(void) unsigned long curr_jiffies = READ_ONCE(jiffies); struct rq *this_rq = this_rq(); unsigned long load; + struct rq_flags rf; if (curr_jiffies == this_rq->last_load_update_tick) return; load = weighted_cpuload(cpu_of(this_rq)); - raw_spin_lock(&this_rq->lock); + rq_lock(this_rq, &rf); update_rq_clock(this_rq); cpu_load_update_nohz(this_rq, curr_jiffies, load); - raw_spin_unlock(&this_rq->lock); + rq_unlock(this_rq, &rf); } #else /* !CONFIG_NO_HZ_COMMON */ static inline void cpu_load_update_nohz(struct rq *this_rq, @@ -6769,7 +6794,7 @@ static void detach_task(struct task_struct *p, struct lb_env *env) lockdep_assert_held(&env->src_rq->lock); p->on_rq = TASK_ON_RQ_MIGRATING; - deactivate_task(env->src_rq, p, 0); + deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, env->dst_cpu); } @@ -6902,7 +6927,7 @@ static void attach_task(struct rq *rq, struct task_struct *p) lockdep_assert_held(&rq->lock); BUG_ON(task_rq(p) != rq); - activate_task(rq, p, 0); + activate_task(rq, p, ENQUEUE_NOCLOCK); p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); } @@ -6913,9 +6938,12 @@ static void attach_task(struct rq *rq, struct task_struct *p) */ static void attach_one_task(struct rq *rq, struct task_struct *p) { - raw_spin_lock(&rq->lock); + struct rq_flags rf; + + rq_lock(rq, &rf); + update_rq_clock(rq); attach_task(rq, p); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); } /* @@ -6926,8 +6954,10 @@ static void attach_tasks(struct lb_env *env) { struct list_head *tasks = &env->tasks; struct task_struct *p; + struct rq_flags rf; - raw_spin_lock(&env->dst_rq->lock); + rq_lock(env->dst_rq, &rf); + update_rq_clock(env->dst_rq); while (!list_empty(tasks)) { p = list_first_entry(tasks, struct task_struct, se.group_node); @@ -6936,7 +6966,7 @@ static void attach_tasks(struct lb_env *env) attach_task(env->dst_rq, p); } - raw_spin_unlock(&env->dst_rq->lock); + rq_unlock(env->dst_rq, &rf); } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -6944,9 +6974,9 @@ static void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq; - unsigned long flags; + struct rq_flags rf; - raw_spin_lock_irqsave(&rq->lock, flags); + rq_lock_irqsave(rq, &rf); update_rq_clock(rq); /* @@ -6954,6 +6984,8 @@ static void update_blocked_averages(int cpu) * list_add_leaf_cfs_rq() for details. */ for_each_leaf_cfs_rq(rq, cfs_rq) { + struct sched_entity *se; + /* throttled entities do not contribute to load */ if (throttled_hierarchy(cfs_rq)) continue; @@ -6961,11 +6993,12 @@ static void update_blocked_averages(int cpu) if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) update_tg_load_avg(cfs_rq, 0); - /* Propagate pending load changes to the parent */ - if (cfs_rq->tg->se[cpu]) - update_load_avg(cfs_rq->tg->se[cpu], 0); + /* Propagate pending load changes to the parent, if any: */ + se = cfs_rq->tg->se[cpu]; + if (se && !skip_blocked_update(se)) + update_load_avg(se, 0); } - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); } /* @@ -7019,12 +7052,12 @@ static inline void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq = &rq->cfs; - unsigned long flags; + struct rq_flags rf; - raw_spin_lock_irqsave(&rq->lock, flags); + rq_lock_irqsave(rq, &rf); update_rq_clock(rq); update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true); - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); } static unsigned long task_h_load(struct task_struct *p) @@ -7525,6 +7558,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd { struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; + struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; bool overload = false; @@ -7541,7 +7575,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); if (local_group) { sds->local = sg; - sgs = &sds->local_stat; + sgs = local; if (env->idle != CPU_NEWLY_IDLE || time_after_eq(jiffies, sg->sgc->next_update)) @@ -7565,8 +7599,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd * the tasks on the system). */ if (prefer_sibling && sds->local && - group_has_capacity(env, &sds->local_stat) && - (sgs->sum_nr_running > 1)) { + group_has_capacity(env, local) && + (sgs->sum_nr_running > local->sum_nr_running + 1)) { sgs->group_no_capacity = 1; sgs->group_type = group_classify(sg, sgs); } @@ -8042,7 +8076,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd_parent = sd->parent; struct sched_group *group; struct rq *busiest; - unsigned long flags; + struct rq_flags rf; struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); struct lb_env env = { @@ -8105,7 +8139,7 @@ redo: env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); more_balance: - raw_spin_lock_irqsave(&busiest->lock, flags); + rq_lock_irqsave(busiest, &rf); update_rq_clock(busiest); /* @@ -8122,14 +8156,14 @@ more_balance: * See task_rq_lock() family for the details. */ - raw_spin_unlock(&busiest->lock); + rq_unlock(busiest, &rf); if (cur_ld_moved) { attach_tasks(&env); ld_moved += cur_ld_moved; } - local_irq_restore(flags); + local_irq_restore(rf.flags); if (env.flags & LBF_NEED_BREAK) { env.flags &= ~LBF_NEED_BREAK; @@ -8207,6 +8241,8 @@ more_balance: sd->nr_balance_failed++; if (need_active_balance(&env)) { + unsigned long flags; + raw_spin_lock_irqsave(&busiest->lock, flags); /* don't kick the active_load_balance_cpu_stop, @@ -8444,8 +8480,9 @@ static int active_load_balance_cpu_stop(void *data) struct rq *target_rq = cpu_rq(target_cpu); struct sched_domain *sd; struct task_struct *p = NULL; + struct rq_flags rf; - raw_spin_lock_irq(&busiest_rq->lock); + rq_lock_irq(busiest_rq, &rf); /* make sure the requested cpu hasn't gone down in the meantime */ if (unlikely(busiest_cpu != smp_processor_id() || @@ -8496,7 +8533,7 @@ static int active_load_balance_cpu_stop(void *data) rcu_read_unlock(); out_unlock: busiest_rq->active_balance = 0; - raw_spin_unlock(&busiest_rq->lock); + rq_unlock(busiest_rq, &rf); if (p) attach_one_task(target_rq, p); @@ -8794,10 +8831,13 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) * do the balance. */ if (time_after_eq(jiffies, rq->next_balance)) { - raw_spin_lock_irq(&rq->lock); + struct rq_flags rf; + + rq_lock_irq(rq, &rf); update_rq_clock(rq); cpu_load_update_idle(rq); - raw_spin_unlock_irq(&rq->lock); + rq_unlock_irq(rq, &rf); + rebalance_domains(rq, CPU_IDLE); } @@ -8988,8 +9028,9 @@ static void task_fork_fair(struct task_struct *p) struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se, *curr; struct rq *rq = this_rq(); + struct rq_flags rf; - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); update_rq_clock(rq); cfs_rq = task_cfs_rq(current); @@ -9010,7 +9051,7 @@ static void task_fork_fair(struct task_struct *p) } se->vruntime -= cfs_rq->min_vruntime; - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); } /* @@ -9372,7 +9413,6 @@ static DEFINE_MUTEX(shares_mutex); int sched_group_set_shares(struct task_group *tg, unsigned long shares) { int i; - unsigned long flags; /* * We can't change the weight of the root cgroup. @@ -9389,19 +9429,17 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) tg->shares = shares; for_each_possible_cpu(i) { struct rq *rq = cpu_rq(i); - struct sched_entity *se; + struct sched_entity *se = tg->se[i]; + struct rq_flags rf; - se = tg->se[i]; /* Propagate contribution to hierarchy */ - raw_spin_lock_irqsave(&rq->lock, flags); - - /* Possible calls to update_curr() need rq clock */ + rq_lock_irqsave(rq, &rf); update_rq_clock(rq); for_each_sched_entity(se) { update_load_avg(se, UPDATE_TG); update_cfs_shares(se); } - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); } done: diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 1b3c8189b286..11192e0cb122 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -56,6 +56,13 @@ SCHED_FEAT(TTWU_QUEUE, true) */ SCHED_FEAT(SIS_AVG_CPU, false) +/* + * Issue a WARN when we do multiple update_rq_clock() calls + * in a single rq->lock section. Default disabled because the + * annotations are not complete. + */ +SCHED_FEAT(WARN_DOUBLE_CLOCK, false) + #ifdef HAVE_RT_PUSH_IPI /* * In order to avoid a thundering herd attack of CPUs that are diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 9f3e40226dec..979b7341008a 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1927,6 +1927,87 @@ static int find_next_push_cpu(struct rq *rq) #define RT_PUSH_IPI_EXECUTING 1 #define RT_PUSH_IPI_RESTART 2 +/* + * When a high priority task schedules out from a CPU and a lower priority + * task is scheduled in, a check is made to see if there's any RT tasks + * on other CPUs that are waiting to run because a higher priority RT task + * is currently running on its CPU. In this case, the CPU with multiple RT + * tasks queued on it (overloaded) needs to be notified that a CPU has opened + * up that may be able to run one of its non-running queued RT tasks. + * + * On large CPU boxes, there's the case that several CPUs could schedule + * a lower priority task at the same time, in which case it will look for + * any overloaded CPUs that it could pull a task from. To do this, the runqueue + * lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting + * for a single overloaded CPU's runqueue lock can produce a large latency. + * (This has actually been observed on large boxes running cyclictest). + * Instead of taking the runqueue lock of the overloaded CPU, each of the + * CPUs that scheduled a lower priority task simply sends an IPI to the + * overloaded CPU. An IPI is much cheaper than taking an runqueue lock with + * lots of contention. The overloaded CPU will look to push its non-running + * RT task off, and if it does, it can then ignore the other IPIs coming + * in, and just pass those IPIs off to any other overloaded CPU. + * + * When a CPU schedules a lower priority task, it only sends an IPI to + * the "next" CPU that has overloaded RT tasks. This prevents IPI storms, + * as having 10 CPUs scheduling lower priority tasks and 10 CPUs with + * RT overloaded tasks, would cause 100 IPIs to go out at once. + * + * The overloaded RT CPU, when receiving an IPI, will try to push off its + * overloaded RT tasks and then send an IPI to the next CPU that has + * overloaded RT tasks. This stops when all CPUs with overloaded RT tasks + * have completed. Just because a CPU may have pushed off its own overloaded + * RT task does not mean it should stop sending the IPI around to other + * overloaded CPUs. There may be another RT task waiting to run on one of + * those CPUs that are of higher priority than the one that was just + * pushed. + * + * An optimization that could possibly be made is to make a CPU array similar + * to the cpupri array mask of all running RT tasks, but for the overloaded + * case, then the IPI could be sent to only the CPU with the highest priority + * RT task waiting, and that CPU could send off further IPIs to the CPU with + * the next highest waiting task. Since the overloaded case is much less likely + * to happen, the complexity of this implementation may not be worth it. + * Instead, just send an IPI around to all overloaded CPUs. + * + * The rq->rt.push_flags holds the status of the IPI that is going around. + * A run queue can only send out a single IPI at a time. The possible flags + * for rq->rt.push_flags are: + * + * (None or zero): No IPI is going around for the current rq + * RT_PUSH_IPI_EXECUTING: An IPI for the rq is being passed around + * RT_PUSH_IPI_RESTART: The priority of the running task for the rq + * has changed, and the IPI should restart + * circulating the overloaded CPUs again. + * + * rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated + * before sending to the next CPU. + * + * Instead of having all CPUs that schedule a lower priority task send + * an IPI to the same "first" CPU in the RT overload mask, they send it + * to the next overloaded CPU after their own CPU. This helps distribute + * the work when there's more than one overloaded CPU and multiple CPUs + * scheduling in lower priority tasks. + * + * When a rq schedules a lower priority task than what was currently + * running, the next CPU with overloaded RT tasks is examined first. + * That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower + * priority task, it will send an IPI first to CPU 5, then CPU 5 will + * send to CPU 1 if it is still overloaded. CPU 1 will clear the + * rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set. + * + * The first CPU to notice IPI_RESTART is set, will clear that flag and then + * send an IPI to the next overloaded CPU after the rq->cpu and not the next + * CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3 + * schedules a lower priority task, and the IPI_RESTART gets set while the + * handling is being done on CPU 5, it will clear the flag and send it back to + * CPU 4 instead of CPU 1. + * + * Note, the above logic can be disabled by turning off the sched_feature + * RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be + * taken by the CPU requesting a pull and the waiting RT task will be pulled + * by that CPU. This may be fine for machines with few CPUs. + */ static void tell_cpu_to_push(struct rq *rq) { int cpu; diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h new file mode 100644 index 000000000000..cd200d16529e --- /dev/null +++ b/kernel/sched/sched-pelt.h @@ -0,0 +1,13 @@ +/* Generated by Documentation/scheduler/sched-pelt; do not modify. */ + +static const u32 runnable_avg_yN_inv[] = { + 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, + 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, + 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, + 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9, + 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80, + 0x85aac367, 0x82cd8698, +}; + +#define LOAD_AVG_PERIOD 32 +#define LOAD_AVG_MAX 47742 diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 767aab3505a8..7808ab050599 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1331,15 +1331,17 @@ extern const u32 sched_prio_to_wmult[40]; #define DEQUEUE_SLEEP 0x01 #define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ #define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ +#define DEQUEUE_NOCLOCK 0x08 /* matches ENQUEUE_NOCLOCK */ #define ENQUEUE_WAKEUP 0x01 #define ENQUEUE_RESTORE 0x02 #define ENQUEUE_MOVE 0x04 +#define ENQUEUE_NOCLOCK 0x08 -#define ENQUEUE_HEAD 0x08 -#define ENQUEUE_REPLENISH 0x10 +#define ENQUEUE_HEAD 0x10 +#define ENQUEUE_REPLENISH 0x20 #ifdef CONFIG_SMP -#define ENQUEUE_MIGRATED 0x20 +#define ENQUEUE_MIGRATED 0x40 #else #define ENQUEUE_MIGRATED 0x00 #endif @@ -1624,6 +1626,7 @@ static inline void sched_avg_update(struct rq *rq) { } struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(rq->lock); + struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(p->pi_lock) __acquires(rq->lock); @@ -1645,6 +1648,62 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); } +static inline void +rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock_irqsave(&rq->lock, rf->flags); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock_irq(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock_irq(&rq->lock); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock(&rq->lock); + rq_pin_lock(rq, rf); +} + +static inline void +rq_relock(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock(&rq->lock); + rq_repin_lock(rq, rf); +} + +static inline void +rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock_irqrestore(&rq->lock, rf->flags); +} + +static inline void +rq_unlock_irq(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock_irq(&rq->lock); +} + +static inline void +rq_unlock(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock(&rq->lock); +} + #ifdef CONFIG_SMP #ifdef CONFIG_PREEMPT diff --git a/kernel/softirq.c b/kernel/softirq.c index 744fa611cae0..4e09821f9d9e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -309,7 +309,7 @@ restart: account_irq_exit_time(current); __local_bh_enable(SOFTIRQ_OFFSET); WARN_ON_ONCE(in_interrupt()); - tsk_restore_flags(current, old_flags, PF_MEMALLOC); + current_restore_flags(old_flags, PF_MEMALLOC); } asmlinkage __visible void do_softirq(void) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bbf46da28e9a..c74bf39ef764 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4734,6 +4734,29 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg) return wfc.ret; } EXPORT_SYMBOL_GPL(work_on_cpu); + +/** + * work_on_cpu_safe - run a function in thread context on a particular cpu + * @cpu: the cpu to run on + * @fn: the function to run + * @arg: the function argument + * + * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold + * any locks which would prevent @fn from completing. + * + * Return: The value @fn returns. + */ +long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg) +{ + long ret = -ENODEV; + + get_online_cpus(); + if (cpu_online(cpu)) + ret = work_on_cpu(cpu, fn, arg); + put_online_cpus(); + return ret; +} +EXPORT_SYMBOL_GPL(work_on_cpu_safe); #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER diff --git a/net/core/dev.c b/net/core/dev.c index 9b5875388c23..c57878be09d2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4243,7 +4243,7 @@ static int __netif_receive_skb(struct sk_buff *skb) */ current->flags |= PF_MEMALLOC; ret = __netif_receive_skb_core(skb, true); - tsk_restore_flags(current, pflags, PF_MEMALLOC); + current_restore_flags(pflags, PF_MEMALLOC); } else ret = __netif_receive_skb_core(skb, false); diff --git a/net/core/sock.c b/net/core/sock.c index 2c4f574168fb..b416a537bd0a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -325,7 +325,7 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) current->flags |= PF_MEMALLOC; ret = sk->sk_backlog_rcv(sk, skb); - tsk_restore_flags(current, pflags, PF_MEMALLOC); + current_restore_flags(pflags, PF_MEMALLOC); return ret; } |