diff options
Diffstat (limited to 'kernel/sched.c')
-rw-r--r-- | kernel/sched.c | 214 |
1 files changed, 119 insertions, 95 deletions
diff --git a/kernel/sched.c b/kernel/sched.c index fde6ff903525..ccacdbdecf45 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -75,6 +75,9 @@ #include <asm/tlb.h> #include <asm/irq_regs.h> #include <asm/mutex.h> +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#endif #include "sched_cpupri.h" #include "workqueue_sched.h" @@ -124,7 +127,7 @@ static inline int rt_policy(int policy) { - if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) + if (policy == SCHED_FIFO || policy == SCHED_RR) return 1; return 0; } @@ -422,6 +425,7 @@ struct rt_rq { */ struct root_domain { atomic_t refcount; + atomic_t rto_count; struct rcu_head rcu; cpumask_var_t span; cpumask_var_t online; @@ -431,7 +435,6 @@ struct root_domain { * one runnable RT task. */ cpumask_var_t rto_mask; - atomic_t rto_count; struct cpupri cpupri; }; @@ -528,6 +531,12 @@ struct rq { #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; #endif +#ifdef CONFIG_PARAVIRT + u64 prev_steal_time; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + u64 prev_steal_time_rq; +#endif /* calc_load related fields */ unsigned long calc_load_update; @@ -581,7 +590,6 @@ static inline int cpu_of(struct rq *rq) #define rcu_dereference_check_sched_domain(p) \ rcu_dereference_check((p), \ - rcu_read_lock_held() || \ lockdep_is_held(&sched_domains_mutex)) /* @@ -1568,38 +1576,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) return rq->avg_load_per_task; } -#ifdef CONFIG_FAIR_GROUP_SCHED - -/* - * Compute the cpu's hierarchical load factor for each task group. - * This needs to be done in a top-down fashion because the load of a child - * group is a fraction of its parents load. - */ -static int tg_load_down(struct task_group *tg, void *data) -{ - unsigned long load; - long cpu = (long)data; - - if (!tg->parent) { - load = cpu_rq(cpu)->load.weight; - } else { - load = tg->parent->cfs_rq[cpu]->h_load; - load *= tg->se[cpu]->load.weight; - load /= tg->parent->cfs_rq[cpu]->load.weight + 1; - } - - tg->cfs_rq[cpu]->h_load = load; - - return 0; -} - -static void update_h_load(long cpu) -{ - walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); -} - -#endif - #ifdef CONFIG_PREEMPT static void double_rq_lock(struct rq *rq1, struct rq *rq2); @@ -1953,10 +1929,28 @@ void account_system_vtime(struct task_struct *curr) } EXPORT_SYMBOL_GPL(account_system_vtime); -static void update_rq_clock_task(struct rq *rq, s64 delta) +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#ifdef CONFIG_PARAVIRT +static inline u64 steal_ticks(u64 steal) { - s64 irq_delta; + if (unlikely(steal > NSEC_PER_SEC)) + return div_u64(steal, TICK_NSEC); + + return __iter_div_u64_rem(steal, TICK_NSEC, &steal); +} +#endif +static void update_rq_clock_task(struct rq *rq, s64 delta) +{ +/* + * In theory, the compile should just see 0 here, and optimize out the call + * to sched_rt_avg_update. But I don't trust it... + */ +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) + s64 steal = 0, irq_delta = 0; +#endif +#ifdef CONFIG_IRQ_TIME_ACCOUNTING irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; /* @@ -1979,12 +1973,35 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->prev_irq_time += irq_delta; delta -= irq_delta; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + if (static_branch((¶virt_steal_rq_enabled))) { + u64 st; + + steal = paravirt_steal_clock(cpu_of(rq)); + steal -= rq->prev_steal_time_rq; + + if (unlikely(steal > delta)) + steal = delta; + + st = steal_ticks(steal); + steal = st * TICK_NSEC; + + rq->prev_steal_time_rq += steal; + + delta -= steal; + } +#endif + rq->clock_task += delta; - if (irq_delta && sched_feat(NONIRQ_POWER)) - sched_rt_avg_update(rq, irq_delta); +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) + if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) + sched_rt_avg_update(rq, irq_delta + steal); +#endif } +#ifdef CONFIG_IRQ_TIME_ACCOUNTING static int irqtime_account_hi_update(void) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; @@ -2019,12 +2036,7 @@ static int irqtime_account_si_update(void) #define sched_clock_irqtime (0) -static void update_rq_clock_task(struct rq *rq, s64 delta) -{ - rq->clock_task += delta; -} - -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ +#endif #include "sched_idletask.c" #include "sched_fair.c" @@ -2220,7 +2232,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (task_cpu(p) != new_cpu) { p->se.nr_migrations++; - perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); + perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); } __set_task_cpu(p, new_cpu); @@ -2497,7 +2509,7 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) if (p->sched_class->task_woken) p->sched_class->task_woken(rq, p); - if (unlikely(rq->idle_stamp)) { + if (rq->idle_stamp) { u64 delta = rq->clock - rq->idle_stamp; u64 max = 2*sysctl_sched_migration_cost; @@ -2886,7 +2898,7 @@ void sched_fork(struct task_struct *p) #if defined(CONFIG_SMP) p->on_cpu = 0; #endif -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPT_COUNT /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; #endif @@ -3877,6 +3889,25 @@ void account_idle_time(cputime_t cputime) cpustat->idle = cputime64_add(cpustat->idle, cputime64); } +static __always_inline bool steal_account_process_tick(void) +{ +#ifdef CONFIG_PARAVIRT + if (static_branch(¶virt_steal_enabled)) { + u64 steal, st = 0; + + steal = paravirt_steal_clock(smp_processor_id()); + steal -= this_rq()->prev_steal_time; + + st = steal_ticks(steal); + this_rq()->prev_steal_time += st * TICK_NSEC; + + account_steal_time(st); + return st; + } +#endif + return false; +} + #ifndef CONFIG_VIRT_CPU_ACCOUNTING #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -3908,6 +3939,9 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + if (steal_account_process_tick()) + return; + if (irqtime_account_hi_update()) { cpustat->irq = cputime64_add(cpustat->irq, tmp); } else if (irqtime_account_si_update()) { @@ -3961,6 +3995,9 @@ void account_process_tick(struct task_struct *p, int user_tick) return; } + if (steal_account_process_tick()) + return; + if (user_tick) account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) @@ -4338,11 +4375,8 @@ EXPORT_SYMBOL(schedule); static inline bool owner_running(struct mutex *lock, struct task_struct *owner) { - bool ret = false; - - rcu_read_lock(); if (lock->owner != owner) - goto fail; + return false; /* * Ensure we emit the owner->on_cpu, dereference _after_ checking @@ -4352,11 +4386,7 @@ static inline bool owner_running(struct mutex *lock, struct task_struct *owner) */ barrier(); - ret = owner->on_cpu; -fail: - rcu_read_unlock(); - - return ret; + return owner->on_cpu; } /* @@ -4368,21 +4398,21 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) if (!sched_feat(OWNER_SPIN)) return 0; + rcu_read_lock(); while (owner_running(lock, owner)) { if (need_resched()) - return 0; + break; arch_mutex_cpu_relax(); } + rcu_read_unlock(); /* - * If the owner changed to another task there is likely - * heavy contention, stop spinning. + * We break out the loop above on need_resched() and when the + * owner changed, which is a sign for heavy contention. Return + * success only when lock->owner is NULL. */ - if (lock->owner) - return 0; - - return 1; + return lock->owner == NULL; } #endif @@ -7898,17 +7928,10 @@ int in_sched_functions(unsigned long addr) && addr < (unsigned long)__sched_text_end); } -static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) +static void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT; INIT_LIST_HEAD(&cfs_rq->tasks); -#ifdef CONFIG_FAIR_GROUP_SCHED - cfs_rq->rq = rq; - /* allow initial update_cfs_load() to truncate */ -#ifdef CONFIG_SMP - cfs_rq->load_stamp = 1; -#endif -#endif cfs_rq->min_vruntime = (u64)(-(1LL << 20)); #ifndef CONFIG_64BIT cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; @@ -7928,27 +7951,18 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) /* delimiter for bitsearch: */ __set_bit(MAX_RT_PRIO, array->bitmap); -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED +#if defined CONFIG_SMP rt_rq->highest_prio.curr = MAX_RT_PRIO; -#ifdef CONFIG_SMP rt_rq->highest_prio.next = MAX_RT_PRIO; -#endif -#endif -#ifdef CONFIG_SMP rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; - plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock); + plist_head_init(&rt_rq->pushable_tasks); #endif rt_rq->rt_time = 0; rt_rq->rt_throttled = 0; rt_rq->rt_runtime = 0; raw_spin_lock_init(&rt_rq->rt_runtime_lock); - -#ifdef CONFIG_RT_GROUP_SCHED - rt_rq->rt_nr_boosted = 0; - rt_rq->rq = rq; -#endif } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -7957,11 +7971,17 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *parent) { struct rq *rq = cpu_rq(cpu); - tg->cfs_rq[cpu] = cfs_rq; - init_cfs_rq(cfs_rq, rq); + cfs_rq->tg = tg; + cfs_rq->rq = rq; +#ifdef CONFIG_SMP + /* allow initial update_cfs_load() to truncate */ + cfs_rq->load_stamp = 1; +#endif + tg->cfs_rq[cpu] = cfs_rq; tg->se[cpu] = se; + /* se could be NULL for root_task_group */ if (!se) return; @@ -7984,12 +8004,14 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, { struct rq *rq = cpu_rq(cpu); - tg->rt_rq[cpu] = rt_rq; - init_rt_rq(rt_rq, rq); + rt_rq->highest_prio.curr = MAX_RT_PRIO; + rt_rq->rt_nr_boosted = 0; + rt_rq->rq = rq; rt_rq->tg = tg; - rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; + tg->rt_rq[cpu] = rt_rq; tg->rt_se[cpu] = rt_se; + if (!rt_se) return; @@ -8071,7 +8093,7 @@ void __init sched_init(void) rq->nr_running = 0; rq->calc_load_active = 0; rq->calc_load_update = jiffies + LOAD_FREQ; - init_cfs_rq(&rq->cfs, rq); + init_cfs_rq(&rq->cfs); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.shares = root_task_group_load; @@ -8142,7 +8164,7 @@ void __init sched_init(void) #endif #ifdef CONFIG_RT_MUTEXES - plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock); + plist_head_init(&init_task.pi_waiters); #endif /* @@ -8185,7 +8207,7 @@ void __init sched_init(void) scheduler_running = 1; } -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP static inline int preempt_count_equals(int preempt_offset) { int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); @@ -8195,7 +8217,6 @@ static inline int preempt_count_equals(int preempt_offset) void __might_sleep(const char *file, int line, int preempt_offset) { -#ifdef in_atomic static unsigned long prev_jiffy; /* ratelimiting */ if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || @@ -8217,7 +8238,6 @@ void __might_sleep(const char *file, int line, int preempt_offset) if (irqs_disabled()) print_irqtrace_events(current); dump_stack(); -#endif } EXPORT_SYMBOL(__might_sleep); #endif @@ -8376,6 +8396,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) if (!se) goto err_free_rq; + init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); } @@ -8403,7 +8424,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); raw_spin_unlock_irqrestore(&rq->lock, flags); } -#else /* !CONFG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED */ static inline void free_fair_sched_group(struct task_group *tg) { } @@ -8424,7 +8445,8 @@ static void free_rt_sched_group(struct task_group *tg) { int i; - destroy_rt_bandwidth(&tg->rt_bandwidth); + if (tg->rt_se) + destroy_rt_bandwidth(&tg->rt_bandwidth); for_each_possible_cpu(i) { if (tg->rt_rq) @@ -8465,6 +8487,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) if (!rt_se) goto err_free_rq; + init_rt_rq(rt_rq, cpu_rq(i)); + rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); } |