From f9f4872df6e1801572949f8a370c886122d4b6da Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Sat, 8 Oct 2016 12:42:38 -0700 Subject: cpufreq: intel_pstate: Fix unsafe HWP MSR access This is a requirement that MSR MSR_PM_ENABLE must be set to 0x01 before reading MSR_HWP_CAPABILITIES on a given CPU. If cpufreq init() is scheduled on a CPU which is not same as policy->cpu or migrates to a different CPU before calling msr read for MSR_HWP_CAPABILITIES, it is possible that MSR_PM_ENABLE was not to set to 0x01 on that CPU. This will cause GP fault. So like other places in this path rdmsrl_on_cpu should be used instead of rdmsrl. Moreover the scope of MSR_HWP_CAPABILITIES is on per thread basis, so it should be read from the same CPU, for which MSR MSR_HWP_REQUEST is getting set. dmesg dump or warning: [ 22.014488] WARNING: CPU: 139 PID: 1 at arch/x86/mm/extable.c:50 ex_handler_rdmsr_unsafe+0x68/0x70 [ 22.014492] unchecked MSR access error: RDMSR from 0x771 [ 22.014493] Modules linked in: [ 22.014507] CPU: 139 PID: 1 Comm: swapper/0 Not tainted 4.7.5+ #1 ... ... [ 22.014516] Call Trace: [ 22.014542] [] dump_stack+0x63/0x82 [ 22.014558] [] __warn+0xcb/0xf0 [ 22.014561] [] warn_slowpath_fmt+0x4f/0x60 [ 22.014563] [] ex_handler_rdmsr_unsafe+0x68/0x70 [ 22.014564] [] fixup_exception+0x39/0x50 [ 22.014604] [] do_general_protection+0x80/0x150 [ 22.014610] [] general_protection+0x28/0x30 [ 22.014635] [] ? get_target_pstate_use_performance+0xb0/0xb0 [ 22.014642] [] ? native_read_msr+0x7/0x40 [ 22.014657] [] intel_pstate_hwp_set+0x23/0x130 [ 22.014660] [] intel_pstate_set_policy+0x1b6/0x340 [ 22.014662] [] cpufreq_set_policy+0xeb/0x2c0 [ 22.014664] [] cpufreq_init_policy+0x79/0xe0 [ 22.014666] [] ? cpufreq_update_policy+0x120/0x120 [ 22.014669] [] cpufreq_online+0x406/0x820 [ 22.014671] [] cpufreq_add_dev+0x5f/0x90 [ 22.014717] [] subsys_interface_register+0xb8/0x100 [ 22.014719] [] cpufreq_register_driver+0x14c/0x210 [ 22.014749] [] intel_pstate_init+0x39d/0x4d5 [ 22.014751] [] ? cpufreq_gov_dbs_init+0x12/0x12 Cc: 4.3+ # 4.3+ Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 806f2039571e..bc976a3db253 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -562,12 +562,12 @@ static void intel_pstate_hwp_set(const struct cpumask *cpumask) int min, hw_min, max, hw_max, cpu, range, adj_range; u64 value, cap; - rdmsrl(MSR_HWP_CAPABILITIES, cap); - hw_min = HWP_LOWEST_PERF(cap); - hw_max = HWP_HIGHEST_PERF(cap); - range = hw_max - hw_min; - for_each_cpu(cpu, cpumask) { + rdmsrl_on_cpu(cpu, MSR_HWP_CAPABILITIES, &cap); + hw_min = HWP_LOWEST_PERF(cap); + hw_max = HWP_HIGHEST_PERF(cap); + range = hw_max - hw_min; + rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value); adj_range = limits->min_perf_pct * range / 100; min = hw_min + adj_range; -- cgit v1.2.3 From f00593a4bdd22e0885db89df8ee8afcc6867994f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 30 Sep 2016 03:13:34 +0200 Subject: cpufreq: intel_pstate: Clarify comment in get_target_pstate_use_performance() Make the comment explaining the meaning of the perf_scaled variable in get_target_pstate_use_performance() more straightforward. Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index bc976a3db253..1c7b91c5805b 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1251,10 +1251,11 @@ static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu) u64 duration_ns; /* - * perf_scaled is the average performance during the last sampling - * period scaled by the ratio of the maximum P-state to the P-state - * requested last time (in percent). That measures the system's - * response to the previous P-state selection. + * perf_scaled is the ratio of the average P-state during the last + * sampling period to the P-state requested last time (in percent). + * + * That measures the system's response to the previous P-state + * selection. */ max_pstate = cpu->pstate.max_pstate_physical; current_pstate = cpu->pstate.current_pstate; -- cgit v1.2.3 From 0843e83c1a4aa67e08f424430526c948d591d5f1 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 6 Oct 2016 14:07:51 +0200 Subject: cpufreq: intel_pstate: Proportional algorithm for Atom The PID algorithm used by the intel_pstate driver tends to drive performance to the minimum for workloads with utilization below the setpoint, which is undesirable, so replace it with a modified "proportional" algorithm on Atom. The new algorithm will set the new P-state to be 1.25 times the available maximum times the (frequency-invariant) utilization during the previous sampling period except when the target P-state computed this way is lower than the average P-state during the previous sampling period. In the latter case, it will increase the target by 50% of the difference between it and the average P-state to prevent performance from dropping down too fast in some cases. Signed-off-by: Rafael J. Wysocki Tested-by: Srinivas Pandruvada --- drivers/cpufreq/intel_pstate.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 1c7b91c5805b..6c8c897d0a2d 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1232,6 +1232,7 @@ static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu) { struct sample *sample = &cpu->sample; int32_t busy_frac, boost; + int target, avg_pstate; busy_frac = div_fp(sample->mperf, sample->tsc); @@ -1242,7 +1243,26 @@ static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu) busy_frac = boost; sample->busy_scaled = busy_frac * 100; - return get_avg_pstate(cpu) - pid_calc(&cpu->pid, sample->busy_scaled); + + target = limits->no_turbo || limits->turbo_disabled ? + cpu->pstate.max_pstate : cpu->pstate.turbo_pstate; + target += target >> 2; + target = mul_fp(target, busy_frac); + if (target < cpu->pstate.min_pstate) + target = cpu->pstate.min_pstate; + + /* + * If the average P-state during the previous cycle was higher than the + * current target, add 50% of the difference to the target to reduce + * possible performance oscillations and offset possible performance + * loss related to moving the workload from one CPU to another within + * a package/module. + */ + avg_pstate = get_avg_pstate(cpu); + if (avg_pstate > target) + target += (avg_pstate - target) >> 1; + + return target; } static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu) -- cgit v1.2.3 From 3954517e2f083c8aeb52bfd467b7b2c164232ffc Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 11 Oct 2016 23:07:38 +0200 Subject: cpufreq: intel_pstate: Fix struct pstate_adjust_policy kerneldoc It looks like the name of struct pstate_adjust_policy was updated without updating its kerneldoc comment accordingly, so fix that mistake. Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 6c8c897d0a2d..f535f8123258 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -225,7 +225,7 @@ struct cpudata { static struct cpudata **all_cpu_data; /** - * struct pid_adjust_policy - Stores static PID configuration data + * struct pstate_adjust_policy - Stores static PID configuration data * @sample_rate_ms: PID calculation sample rate in ms * @sample_rate_ns: Sample rate calculation in ns * @deadband: PID deadband -- cgit v1.2.3 From abb6627910a1e783c8e034b35b7c80e5e7f98f41 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 12 Oct 2016 21:47:03 +0200 Subject: cpufreq: conservative: Fix next frequency selection Commit d352cf47d93e (cpufreq: conservative: Do not use transition notifications) overlooked the case when the "frequency step" used by the conservative governor is small relative to the distances between the available frequencies and broke the algorithm by using policy->cur instead of the previously requested frequency when computing the next one. As a result, the governor may not be able to go outside of a narrow range between two consecutive available frequencies. Fix the problem by making the governor save the previously requested frequency and select the next one relative that value (unless it is out of range, in which case policy->cur will be used instead). Fixes: d352cf47d93e (cpufreq: conservative: Do not use transition notifications) Link: https://bugzilla.kernel.org/show_bug.cgi?id=177171 Reported-and-tested-by: Aleksey Rybalkin Acked-by: Viresh Kumar Cc: 4.8+ # 4.8+ Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_conservative.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 18da4f8051d3..13475890d792 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -17,6 +17,7 @@ struct cs_policy_dbs_info { struct policy_dbs_info policy_dbs; unsigned int down_skip; + unsigned int requested_freq; }; static inline struct cs_policy_dbs_info *to_dbs_info(struct policy_dbs_info *policy_dbs) @@ -61,6 +62,7 @@ static unsigned int cs_dbs_timer(struct cpufreq_policy *policy) { struct policy_dbs_info *policy_dbs = policy->governor_data; struct cs_policy_dbs_info *dbs_info = to_dbs_info(policy_dbs); + unsigned int requested_freq = dbs_info->requested_freq; struct dbs_data *dbs_data = policy_dbs->dbs_data; struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; unsigned int load = dbs_update(policy); @@ -72,10 +74,16 @@ static unsigned int cs_dbs_timer(struct cpufreq_policy *policy) if (cs_tuners->freq_step == 0) goto out; + /* + * If requested_freq is out of range, it is likely that the limits + * changed in the meantime, so fall back to current frequency in that + * case. + */ + if (requested_freq > policy->max || requested_freq < policy->min) + requested_freq = policy->cur; + /* Check for frequency increase */ if (load > dbs_data->up_threshold) { - unsigned int requested_freq = policy->cur; - dbs_info->down_skip = 0; /* if we are already at full speed then break out early */ @@ -83,8 +91,11 @@ static unsigned int cs_dbs_timer(struct cpufreq_policy *policy) goto out; requested_freq += get_freq_target(cs_tuners, policy); + if (requested_freq > policy->max) + requested_freq = policy->max; __cpufreq_driver_target(policy, requested_freq, CPUFREQ_RELATION_H); + dbs_info->requested_freq = requested_freq; goto out; } @@ -95,7 +106,7 @@ static unsigned int cs_dbs_timer(struct cpufreq_policy *policy) /* Check for frequency decrease */ if (load < cs_tuners->down_threshold) { - unsigned int freq_target, requested_freq = policy->cur; + unsigned int freq_target; /* * if we cannot reduce the frequency anymore, break out early */ @@ -109,6 +120,7 @@ static unsigned int cs_dbs_timer(struct cpufreq_policy *policy) requested_freq = policy->min; __cpufreq_driver_target(policy, requested_freq, CPUFREQ_RELATION_L); + dbs_info->requested_freq = requested_freq; } out: @@ -287,6 +299,7 @@ static void cs_start(struct cpufreq_policy *policy) struct cs_policy_dbs_info *dbs_info = to_dbs_info(policy->governor_data); dbs_info->down_skip = 0; + dbs_info->requested_freq = policy->cur; } static struct dbs_governor cs_governor = { -- cgit v1.2.3 From c197d758036d8c77923ae3f88571cf198283107e Mon Sep 17 00:00:00 2001 From: Hoan Tran Date: Thu, 13 Oct 2016 10:33:35 -0700 Subject: cpufreq: CPPC: Correct desired_perf calculation The desired_perf is an abstract performance number. Its value should be in the range of [lowest perf, highest perf] of CPPC. The correct calculation is desired_perf = freq * cppc_highest_perf / cppc_dmi_max_khz And cppc_cpufreq_set_target() returns if desired_perf is exactly the same with the old perf. Signed-off-by: Hoan Tran Reviewed-by: Prashanth Prakash Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cppc_cpufreq.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'drivers/cpufreq') diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 99db4227ae38..f7e98fc077c1 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -80,11 +80,17 @@ static int cppc_cpufreq_set_target(struct cpufreq_policy *policy, { struct cpudata *cpu; struct cpufreq_freqs freqs; + u32 desired_perf; int ret = 0; cpu = all_cpu_data[policy->cpu]; - cpu->perf_ctrls.desired_perf = (u64)target_freq * policy->max / cppc_dmi_max_khz; + desired_perf = (u64)target_freq * cpu->perf_caps.highest_perf / cppc_dmi_max_khz; + /* Return if it is exactly the same perf */ + if (desired_perf == cpu->perf_ctrls.desired_perf) + return ret; + + cpu->perf_ctrls.desired_perf = desired_perf; freqs.old = policy->cur; freqs.new = target_freq; -- cgit v1.2.3