summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/entry/entry_32.S4
-rw-r--r--arch/x86/entry/entry_64.S2
-rw-r--r--arch/x86/events/intel/bts.c6
-rw-r--r--arch/x86/events/intel/uncore.c12
-rw-r--r--arch/x86/hyperv/hv_init.c5
-rw-r--r--arch/x86/hyperv/mmu.c57
-rw-r--r--arch/x86/include/asm/alternative-asm.h4
-rw-r--r--arch/x86/include/asm/alternative.h6
-rw-r--r--arch/x86/include/asm/mce.h1
-rw-r--r--arch/x86/include/asm/mmu_context.h8
-rw-r--r--arch/x86/include/asm/mshyperv.h1
-rw-r--r--arch/x86/include/asm/tlbflush.h33
-rw-r--r--arch/x86/kernel/amd_nb.c41
-rw-r--r--arch/x86/kernel/apic/apic.c15
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c1
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h7
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c2
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c27
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c19
-rw-r--r--arch/x86/kernel/head32.c5
-rw-r--r--arch/x86/kernel/kprobes/common.h13
-rw-r--r--arch/x86/kernel/kprobes/core.c2
-rw-r--r--arch/x86/kernel/reboot.c4
-rw-r--r--arch/x86/kernel/unwind_frame.c38
-rw-r--r--arch/x86/kernel/unwind_orc.c29
-rw-r--r--arch/x86/kvm/mmu.c15
-rw-r--r--arch/x86/kvm/paging_tmpl.h3
-rw-r--r--arch/x86/kvm/vmx.c2
-rw-r--r--arch/x86/mm/Makefile11
-rw-r--r--arch/x86/mm/tlb.c105
-rw-r--r--arch/x86/xen/enlighten.c4
31 files changed, 357 insertions, 125 deletions
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 8a13d468635a..50e0d2bc4528 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -176,7 +176,7 @@
/*
* This is a sneaky trick to help the unwinder find pt_regs on the stack. The
* frame pointer is replaced with an encoded pointer to pt_regs. The encoding
- * is just setting the LSB, which makes it an invalid stack address and is also
+ * is just clearing the MSB, which makes it an invalid stack address and is also
* a signal to the unwinder that it's a pt_regs pointer in disguise.
*
* NOTE: This macro must be used *after* SAVE_ALL because it corrupts the
@@ -185,7 +185,7 @@
.macro ENCODE_FRAME_POINTER
#ifdef CONFIG_FRAME_POINTER
mov %esp, %ebp
- orl $0x1, %ebp
+ andl $0x7fffffff, %ebp
#endif
.endm
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 49167258d587..f6cdb7a1455e 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -808,7 +808,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
ENTRY(\sym)
- UNWIND_HINT_IRET_REGS offset=8
+ UNWIND_HINT_IRET_REGS offset=\has_error_code*8
/* Sanity check */
.if \shift_ist != -1 && \paranoid == 0
diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index 16076eb34699..141e07b06216 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -546,9 +546,6 @@ static int bts_event_init(struct perf_event *event)
if (event->attr.type != bts_pmu.type)
return -ENOENT;
- if (x86_add_exclusive(x86_lbr_exclusive_bts))
- return -EBUSY;
-
/*
* BTS leaks kernel addresses even when CPL0 tracing is
* disabled, so disallow intel_bts driver for unprivileged
@@ -562,6 +559,9 @@ static int bts_event_init(struct perf_event *event)
!capable(CAP_SYS_ADMIN))
return -EACCES;
+ if (x86_add_exclusive(x86_lbr_exclusive_bts))
+ return -EBUSY;
+
ret = x86_reserve_hardware();
if (ret) {
x86_del_exclusive(x86_lbr_exclusive_bts);
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 1c5390f1cf09..d45e06346f14 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -822,7 +822,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid)
pmus[i].type = type;
pmus[i].boxes = kzalloc(size, GFP_KERNEL);
if (!pmus[i].boxes)
- return -ENOMEM;
+ goto err;
}
type->pmus = pmus;
@@ -836,7 +836,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid)
attr_group = kzalloc(sizeof(struct attribute *) * (i + 1) +
sizeof(*attr_group), GFP_KERNEL);
if (!attr_group)
- return -ENOMEM;
+ goto err;
attrs = (struct attribute **)(attr_group + 1);
attr_group->name = "events";
@@ -849,7 +849,15 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid)
}
type->pmu_group = &uncore_pmu_attr_group;
+
return 0;
+
+err:
+ for (i = 0; i < type->num_boxes; i++)
+ kfree(pmus[i].boxes);
+ kfree(pmus);
+
+ return -ENOMEM;
}
static int __init
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 1a8eb550c40f..a5db63f728a2 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -85,6 +85,8 @@ EXPORT_SYMBOL_GPL(hyperv_cs);
u32 *hv_vp_index;
EXPORT_SYMBOL_GPL(hv_vp_index);
+u32 hv_max_vp_index;
+
static int hv_cpu_init(unsigned int cpu)
{
u64 msr_vp_index;
@@ -93,6 +95,9 @@ static int hv_cpu_init(unsigned int cpu)
hv_vp_index[smp_processor_id()] = msr_vp_index;
+ if (msr_vp_index > hv_max_vp_index)
+ hv_max_vp_index = msr_vp_index;
+
return 0;
}
diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
index 39e7f6e50919..9cc9e1c1e2db 100644
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -36,9 +36,9 @@ struct hv_flush_pcpu_ex {
/* Each gva in gva_list encodes up to 4096 pages to flush */
#define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE)
-static struct hv_flush_pcpu __percpu *pcpu_flush;
+static struct hv_flush_pcpu __percpu **pcpu_flush;
-static struct hv_flush_pcpu_ex __percpu *pcpu_flush_ex;
+static struct hv_flush_pcpu_ex __percpu **pcpu_flush_ex;
/*
* Fills in gva_list starting from offset. Returns the number of items added.
@@ -76,6 +76,18 @@ static inline int cpumask_to_vp_set(struct hv_flush_pcpu_ex *flush,
{
int cpu, vcpu, vcpu_bank, vcpu_offset, nr_bank = 1;
+ /* valid_bank_mask can represent up to 64 banks */
+ if (hv_max_vp_index / 64 >= 64)
+ return 0;
+
+ /*
+ * Clear all banks up to the maximum possible bank as hv_flush_pcpu_ex
+ * structs are not cleared between calls, we risk flushing unneeded
+ * vCPUs otherwise.
+ */
+ for (vcpu_bank = 0; vcpu_bank <= hv_max_vp_index / 64; vcpu_bank++)
+ flush->hv_vp_set.bank_contents[vcpu_bank] = 0;
+
/*
* Some banks may end up being empty but this is acceptable.
*/
@@ -83,11 +95,6 @@ static inline int cpumask_to_vp_set(struct hv_flush_pcpu_ex *flush,
vcpu = hv_cpu_number_to_vp_number(cpu);
vcpu_bank = vcpu / 64;
vcpu_offset = vcpu % 64;
-
- /* valid_bank_mask can represent up to 64 banks */
- if (vcpu_bank >= 64)
- return 0;
-
__set_bit(vcpu_offset, (unsigned long *)
&flush->hv_vp_set.bank_contents[vcpu_bank]);
if (vcpu_bank >= nr_bank)
@@ -102,6 +109,7 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
const struct flush_tlb_info *info)
{
int cpu, vcpu, gva_n, max_gvas;
+ struct hv_flush_pcpu **flush_pcpu;
struct hv_flush_pcpu *flush;
u64 status = U64_MAX;
unsigned long flags;
@@ -116,7 +124,17 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
local_irq_save(flags);
- flush = this_cpu_ptr(pcpu_flush);
+ flush_pcpu = this_cpu_ptr(pcpu_flush);
+
+ if (unlikely(!*flush_pcpu))
+ *flush_pcpu = page_address(alloc_page(GFP_ATOMIC));
+
+ flush = *flush_pcpu;
+
+ if (unlikely(!flush)) {
+ local_irq_restore(flags);
+ goto do_native;
+ }
if (info->mm) {
flush->address_space = virt_to_phys(info->mm->pgd);
@@ -173,6 +191,7 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
const struct flush_tlb_info *info)
{
int nr_bank = 0, max_gvas, gva_n;
+ struct hv_flush_pcpu_ex **flush_pcpu;
struct hv_flush_pcpu_ex *flush;
u64 status = U64_MAX;
unsigned long flags;
@@ -187,7 +206,17 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
local_irq_save(flags);
- flush = this_cpu_ptr(pcpu_flush_ex);
+ flush_pcpu = this_cpu_ptr(pcpu_flush_ex);
+
+ if (unlikely(!*flush_pcpu))
+ *flush_pcpu = page_address(alloc_page(GFP_ATOMIC));
+
+ flush = *flush_pcpu;
+
+ if (unlikely(!flush)) {
+ local_irq_restore(flags);
+ goto do_native;
+ }
if (info->mm) {
flush->address_space = virt_to_phys(info->mm->pgd);
@@ -222,18 +251,18 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
status = hv_do_rep_hypercall(
HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX,
- 0, nr_bank + 2, flush, NULL);
+ 0, nr_bank, flush, NULL);
} else if (info->end &&
((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) {
status = hv_do_rep_hypercall(
HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX,
- 0, nr_bank + 2, flush, NULL);
+ 0, nr_bank, flush, NULL);
} else {
gva_n = fill_gva_list(flush->gva_list, nr_bank,
info->start, info->end);
status = hv_do_rep_hypercall(
HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX,
- gva_n, nr_bank + 2, flush, NULL);
+ gva_n, nr_bank, flush, NULL);
}
local_irq_restore(flags);
@@ -266,7 +295,7 @@ void hyper_alloc_mmu(void)
return;
if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
- pcpu_flush = __alloc_percpu(PAGE_SIZE, PAGE_SIZE);
+ pcpu_flush = alloc_percpu(struct hv_flush_pcpu *);
else
- pcpu_flush_ex = __alloc_percpu(PAGE_SIZE, PAGE_SIZE);
+ pcpu_flush_ex = alloc_percpu(struct hv_flush_pcpu_ex *);
}
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index e7636bac7372..6c98821fef5e 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -62,8 +62,10 @@
#define new_len2 145f-144f
/*
- * max without conditionals. Idea adapted from:
+ * gas compatible max based on the idea from:
* http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
+ *
+ * The additional "-" is needed because gas uses a "true" value of -1.
*/
#define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b)))))
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index c096624137ae..ccbe24e697c4 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -103,12 +103,12 @@ static inline int alternatives_text_reserved(void *start, void *end)
alt_end_marker ":\n"
/*
- * max without conditionals. Idea adapted from:
+ * gas compatible max based on the idea from:
* http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
*
- * The additional "-" is needed because gas works with s32s.
+ * The additional "-" is needed because gas uses a "true" value of -1.
*/
-#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") - (" b ")))))"
+#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") < (" b ")))))"
/*
* Pad the second replacement alternative with additional NOPs if it is
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 181264989db5..8edac1de2e35 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -187,7 +187,6 @@ struct mca_msr_regs {
extern struct mce_vendor_flags mce_flags;
-extern struct mca_config mca_cfg;
extern struct mca_msr_regs msr_ops;
enum mce_notifier_prios {
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index c120b5db178a..3c856a15b98e 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -126,13 +126,7 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
DEBUG_LOCKS_WARN_ON(preemptible());
}
-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
- int cpu = smp_processor_id();
-
- if (cpumask_test_cpu(cpu, mm_cpumask(mm)))
- cpumask_clear_cpu(cpu, mm_cpumask(mm));
-}
+void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
static inline int init_new_context(struct task_struct *tsk,
struct mm_struct *mm)
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 738503e1f80c..530f448fddaf 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -289,6 +289,7 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
* to this information.
*/
extern u32 *hv_vp_index;
+extern u32 hv_max_vp_index;
/**
* hv_cpu_number_to_vp_number() - Map CPU to VP.
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 4893abf7f74f..c4aed0de565e 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -82,6 +82,22 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
#endif
+static inline bool tlb_defer_switch_to_init_mm(void)
+{
+ /*
+ * If we have PCID, then switching to init_mm is reasonably
+ * fast. If we don't have PCID, then switching to init_mm is
+ * quite slow, so we try to defer it in the hopes that we can
+ * avoid it entirely. The latter approach runs the risk of
+ * receiving otherwise unnecessary IPIs.
+ *
+ * This choice is just a heuristic. The tlb code can handle this
+ * function returning true or false regardless of whether we have
+ * PCID.
+ */
+ return !static_cpu_has(X86_FEATURE_PCID);
+}
+
/*
* 6 because 6 should be plenty and struct tlb_state will fit in
* two cache lines.
@@ -105,6 +121,23 @@ struct tlb_state {
u16 next_asid;
/*
+ * We can be in one of several states:
+ *
+ * - Actively using an mm. Our CPU's bit will be set in
+ * mm_cpumask(loaded_mm) and is_lazy == false;
+ *
+ * - Not using a real mm. loaded_mm == &init_mm. Our CPU's bit
+ * will not be set in mm_cpumask(&init_mm) and is_lazy == false.
+ *
+ * - Lazily using a real mm. loaded_mm != &init_mm, our bit
+ * is set in mm_cpumask(loaded_mm), but is_lazy == true.
+ * We're heuristically guessing that the CR3 load we
+ * skipped more than makes up for the overhead added by
+ * lazy mode.
+ */
+ bool is_lazy;
+
+ /*
* Access to this CR4 shadow and to H/W CR4 is protected by
* disabling interrupts when modifying either one.
*/
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 458da8509b75..6db28f17ff28 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -27,6 +27,8 @@ static const struct pci_device_id amd_root_ids[] = {
{}
};
+#define PCI_DEVICE_ID_AMD_CNB17H_F4 0x1704
+
const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
@@ -37,6 +39,7 @@ const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) },
{}
};
EXPORT_SYMBOL_GPL(amd_nb_misc_ids);
@@ -48,6 +51,7 @@ static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
{}
};
@@ -402,11 +406,48 @@ void amd_flush_garts(void)
}
EXPORT_SYMBOL_GPL(amd_flush_garts);
+static void __fix_erratum_688(void *info)
+{
+#define MSR_AMD64_IC_CFG 0xC0011021
+
+ msr_set_bit(MSR_AMD64_IC_CFG, 3);
+ msr_set_bit(MSR_AMD64_IC_CFG, 14);
+}
+
+/* Apply erratum 688 fix so machines without a BIOS fix work. */
+static __init void fix_erratum_688(void)
+{
+ struct pci_dev *F4;
+ u32 val;
+
+ if (boot_cpu_data.x86 != 0x14)
+ return;
+
+ if (!amd_northbridges.num)
+ return;
+
+ F4 = node_to_amd_nb(0)->link;
+ if (!F4)
+ return;
+
+ if (pci_read_config_dword(F4, 0x164, &val))
+ return;
+
+ if (val & BIT(2))
+ return;
+
+ on_each_cpu(__fix_erratum_688, NULL, 0);
+
+ pr_info("x86/cpu/AMD: CPU erratum 688 worked around\n");
+}
+
static __init int init_amd_nbs(void)
{
amd_cache_northbridges();
amd_cache_gart();
+ fix_erratum_688();
+
return 0;
}
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index d705c769f77d..ff891772c9f8 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -573,11 +573,21 @@ static u32 bdx_deadline_rev(void)
return ~0U;
}
+static u32 skx_deadline_rev(void)
+{
+ switch (boot_cpu_data.x86_mask) {
+ case 0x03: return 0x01000136;
+ case 0x04: return 0x02000014;
+ }
+
+ return ~0U;
+}
+
static const struct x86_cpu_id deadline_match[] = {
DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_HASWELL_X, hsx_deadline_rev),
DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_X, 0x0b000020),
DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_XEON_D, bdx_deadline_rev),
- DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_X, 0x02000014),
+ DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_SKYLAKE_X, skx_deadline_rev),
DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_CORE, 0x22),
DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_ULT, 0x20),
@@ -600,7 +610,8 @@ static void apic_check_deadline_errata(void)
const struct x86_cpu_id *m;
u32 rev;
- if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+ if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER) ||
+ boot_cpu_has(X86_FEATURE_HYPERVISOR))
return;
m = x86_match_cpu(deadline_match);
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 24f749324c0f..9990a71e311f 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -831,7 +831,6 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
} else if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
unsigned int apicid, nshared, first, last;
- this_leaf = this_cpu_ci->info_list + index;
nshared = base->eax.split.num_threads_sharing + 1;
apicid = cpu_data(cpu).apicid;
first = apicid - (apicid % nshared);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 098530a93bb7..debb974fd17d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -1,3 +1,6 @@
+#ifndef __X86_MCE_INTERNAL_H__
+#define __X86_MCE_INTERNAL_H__
+
#include <linux/device.h>
#include <asm/mce.h>
@@ -108,3 +111,7 @@ static inline void mce_work_trigger(void) { }
static inline void mce_register_injector_chain(struct notifier_block *nb) { }
static inline void mce_unregister_injector_chain(struct notifier_block *nb) { }
#endif
+
+extern struct mca_config mca_cfg;
+
+#endif /* __X86_MCE_INTERNAL_H__ */
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 40e28ed77fbf..486f640b02ef 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -28,6 +28,8 @@
#include <asm/msr.h>
#include <asm/trace/irq_vectors.h>
+#include "mce-internal.h"
+
#define NR_BLOCKS 5
#define THRESHOLD_MAX 0xFFF
#define INT_TYPE_APIC 0x00020000
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 86e8f0b2537b..c4fa4a85d4cb 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -122,9 +122,6 @@ static bool __init check_loader_disabled_bsp(void)
bool *res = &dis_ucode_ldr;
#endif
- if (!have_cpuid_p())
- return *res;
-
/*
* CPUID(1).ECX[31]: reserved for hypervisor use. This is still not
* completely accurate as xen pv guests don't see that CPUID bit set but
@@ -166,24 +163,36 @@ bool get_builtin_firmware(struct cpio_data *cd, const char *name)
void __init load_ucode_bsp(void)
{
unsigned int cpuid_1_eax;
+ bool intel = true;
- if (check_loader_disabled_bsp())
+ if (!have_cpuid_p())
return;
cpuid_1_eax = native_cpuid_eax(1);
switch (x86_cpuid_vendor()) {
case X86_VENDOR_INTEL:
- if (x86_family(cpuid_1_eax) >= 6)
- load_ucode_intel_bsp();
+ if (x86_family(cpuid_1_eax) < 6)
+ return;
break;
+
case X86_VENDOR_AMD:
- if (x86_family(cpuid_1_eax) >= 0x10)
- load_ucode_amd_bsp(cpuid_1_eax);
+ if (x86_family(cpuid_1_eax) < 0x10)
+ return;
+ intel = false;
break;
+
default:
- break;
+ return;
}
+
+ if (check_loader_disabled_bsp())
+ return;
+
+ if (intel)
+ load_ucode_intel_bsp();
+ else
+ load_ucode_amd_bsp(cpuid_1_eax);
}
static bool check_loader_disabled_ap(void)
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 8f7a9bbad514..7dbcb7adf797 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -34,6 +34,7 @@
#include <linux/mm.h>
#include <asm/microcode_intel.h>
+#include <asm/intel-family.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/setup.h>
@@ -918,6 +919,18 @@ static int get_ucode_fw(void *to, const void *from, size_t n)
return 0;
}
+static bool is_blacklisted(unsigned int cpu)
+{
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ if (c->x86 == 6 && c->x86_model == INTEL_FAM6_BROADWELL_X) {
+ pr_err_once("late loading on model 79 is disabled.\n");
+ return true;
+ }
+
+ return false;
+}
+
static enum ucode_state request_microcode_fw(int cpu, struct device *device,
bool refresh_fw)
{
@@ -926,6 +939,9 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device,
const struct firmware *firmware;
enum ucode_state ret;
+ if (is_blacklisted(cpu))
+ return UCODE_NFOUND;
+
sprintf(name, "intel-ucode/%02x-%02x-%02x",
c->x86, c->x86_model, c->x86_mask);
@@ -950,6 +966,9 @@ static int get_ucode_user(void *to, const void *from, size_t n)
static enum ucode_state
request_microcode_user(int cpu, const void __user *buf, size_t size)
{
+ if (is_blacklisted(cpu))
+ return UCODE_NFOUND;
+
return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
}
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index cf2ce063f65a..2902ca4d5993 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -30,10 +30,11 @@ static void __init i386_default_early_setup(void)
asmlinkage __visible void __init i386_start_kernel(void)
{
- cr4_init_shadow();
-
+ /* Make sure IDT is set up before any exception happens */
idt_setup_early_handler();
+ cr4_init_shadow();
+
sanitize_boot_params(&boot_params);
x86_early_init_platform_quirks();
diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h
index db2182d63ed0..3fc0f9a794cb 100644
--- a/arch/x86/kernel/kprobes/common.h
+++ b/arch/x86/kernel/kprobes/common.h
@@ -3,6 +3,15 @@
/* Kprobes and Optprobes common header */
+#include <asm/asm.h>
+
+#ifdef CONFIG_FRAME_POINTER
+# define SAVE_RBP_STRING " push %" _ASM_BP "\n" \
+ " mov %" _ASM_SP ", %" _ASM_BP "\n"
+#else
+# define SAVE_RBP_STRING " push %" _ASM_BP "\n"
+#endif
+
#ifdef CONFIG_X86_64
#define SAVE_REGS_STRING \
/* Skip cs, ip, orig_ax. */ \
@@ -17,7 +26,7 @@
" pushq %r10\n" \
" pushq %r11\n" \
" pushq %rbx\n" \
- " pushq %rbp\n" \
+ SAVE_RBP_STRING \
" pushq %r12\n" \
" pushq %r13\n" \
" pushq %r14\n" \
@@ -48,7 +57,7 @@
" pushl %es\n" \
" pushl %ds\n" \
" pushl %eax\n" \
- " pushl %ebp\n" \
+ SAVE_RBP_STRING \
" pushl %edi\n" \
" pushl %esi\n" \
" pushl %edx\n" \
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index f0153714ddac..0742491cbb73 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -1080,8 +1080,6 @@ int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
* raw stack chunk with redzones:
*/
__memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, MIN_STACK_SIZE(addr));
- regs->flags &= ~X86_EFLAGS_IF;
- trace_hardirqs_off();
regs->ip = (unsigned long)(jp->entry);
/*
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 54180fa6f66f..add33f600531 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -105,6 +105,10 @@ void __noreturn machine_real_restart(unsigned int type)
load_cr3(initial_page_table);
#else
write_cr3(real_mode_header->trampoline_pgd);
+
+ /* Exiting long mode will fail if CR4.PCIDE is set. */
+ if (static_cpu_has(X86_FEATURE_PCID))
+ cr4_clear_bits(X86_CR4_PCIDE);
#endif
/* Jump to the identity-mapped low memory code */
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index d145a0b1f529..3dc26f95d46e 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -44,7 +44,8 @@ static void unwind_dump(struct unwind_state *state)
state->stack_info.type, state->stack_info.next_sp,
state->stack_mask, state->graph_idx);
- for (sp = state->orig_sp; sp; sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
+ for (sp = PTR_ALIGN(state->orig_sp, sizeof(long)); sp;
+ sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
if (get_stack_info(sp, state->task, &stack_info, &visit_mask))
break;
@@ -174,6 +175,7 @@ static bool is_last_task_frame(struct unwind_state *state)
* This determines if the frame pointer actually contains an encoded pointer to
* pt_regs on the stack. See ENCODE_FRAME_POINTER.
*/
+#ifdef CONFIG_X86_64
static struct pt_regs *decode_frame_pointer(unsigned long *bp)
{
unsigned long regs = (unsigned long)bp;
@@ -183,6 +185,23 @@ static struct pt_regs *decode_frame_pointer(unsigned long *bp)
return (struct pt_regs *)(regs & ~0x1);
}
+#else
+static struct pt_regs *decode_frame_pointer(unsigned long *bp)
+{
+ unsigned long regs = (unsigned long)bp;
+
+ if (regs & 0x80000000)
+ return NULL;
+
+ return (struct pt_regs *)(regs | 0x80000000);
+}
+#endif
+
+#ifdef CONFIG_X86_32
+#define KERNEL_REGS_SIZE (sizeof(struct pt_regs) - 2*sizeof(long))
+#else
+#define KERNEL_REGS_SIZE (sizeof(struct pt_regs))
+#endif
static bool update_stack_state(struct unwind_state *state,
unsigned long *next_bp)
@@ -202,7 +221,7 @@ static bool update_stack_state(struct unwind_state *state,
regs = decode_frame_pointer(next_bp);
if (regs) {
frame = (unsigned long *)regs;
- len = regs_size(regs);
+ len = KERNEL_REGS_SIZE;
state->got_irq = true;
} else {
frame = next_bp;
@@ -226,6 +245,14 @@ static bool update_stack_state(struct unwind_state *state,
frame < prev_frame_end)
return false;
+ /*
+ * On 32-bit with user mode regs, make sure the last two regs are safe
+ * to access:
+ */
+ if (IS_ENABLED(CONFIG_X86_32) && regs && user_mode(regs) &&
+ !on_stack(info, frame, len + 2*sizeof(long)))
+ return false;
+
/* Move state to the next frame: */
if (regs) {
state->regs = regs;
@@ -328,6 +355,13 @@ bad_address:
state->regs->sp < (unsigned long)task_pt_regs(state->task))
goto the_end;
+ /*
+ * There are some known frame pointer issues on 32-bit. Disable
+ * unwinder warnings on 32-bit until it gets objtool support.
+ */
+ if (IS_ENABLED(CONFIG_X86_32))
+ goto the_end;
+
if (state->regs) {
printk_deferred_once(KERN_WARNING
"WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n",
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 570b70d3f604..b95007e7c1b3 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -86,8 +86,8 @@ static struct orc_entry *orc_find(unsigned long ip)
idx = (ip - LOOKUP_START_IP) / LOOKUP_BLOCK_SIZE;
if (unlikely((idx >= lookup_num_blocks-1))) {
- orc_warn("WARNING: bad lookup idx: idx=%u num=%u ip=%lx\n",
- idx, lookup_num_blocks, ip);
+ orc_warn("WARNING: bad lookup idx: idx=%u num=%u ip=%pB\n",
+ idx, lookup_num_blocks, (void *)ip);
return NULL;
}
@@ -96,8 +96,8 @@ static struct orc_entry *orc_find(unsigned long ip)
if (unlikely((__start_orc_unwind + start >= __stop_orc_unwind) ||
(__start_orc_unwind + stop > __stop_orc_unwind))) {
- orc_warn("WARNING: bad lookup value: idx=%u num=%u start=%u stop=%u ip=%lx\n",
- idx, lookup_num_blocks, start, stop, ip);
+ orc_warn("WARNING: bad lookup value: idx=%u num=%u start=%u stop=%u ip=%pB\n",
+ idx, lookup_num_blocks, start, stop, (void *)ip);
return NULL;
}
@@ -373,7 +373,7 @@ bool unwind_next_frame(struct unwind_state *state)
case ORC_REG_R10:
if (!state->regs || !state->full_regs) {
- orc_warn("missing regs for base reg R10 at ip %p\n",
+ orc_warn("missing regs for base reg R10 at ip %pB\n",
(void *)state->ip);
goto done;
}
@@ -382,7 +382,7 @@ bool unwind_next_frame(struct unwind_state *state)
case ORC_REG_R13:
if (!state->regs || !state->full_regs) {
- orc_warn("missing regs for base reg R13 at ip %p\n",
+ orc_warn("missing regs for base reg R13 at ip %pB\n",
(void *)state->ip);
goto done;
}
@@ -391,7 +391,7 @@ bool unwind_next_frame(struct unwind_state *state)
case ORC_REG_DI:
if (!state->regs || !state->full_regs) {
- orc_warn("missing regs for base reg DI at ip %p\n",
+ orc_warn("missing regs for base reg DI at ip %pB\n",
(void *)state->ip);
goto done;
}
@@ -400,7 +400,7 @@ bool unwind_next_frame(struct unwind_state *state)
case ORC_REG_DX:
if (!state->regs || !state->full_regs) {
- orc_warn("missing regs for base reg DX at ip %p\n",
+ orc_warn("missing regs for base reg DX at ip %pB\n",
(void *)state->ip);
goto done;
}
@@ -408,7 +408,7 @@ bool unwind_next_frame(struct unwind_state *state)
break;
default:
- orc_warn("unknown SP base reg %d for ip %p\n",
+ orc_warn("unknown SP base reg %d for ip %pB\n",
orc->sp_reg, (void *)state->ip);
goto done;
}
@@ -436,7 +436,7 @@ bool unwind_next_frame(struct unwind_state *state)
case ORC_TYPE_REGS:
if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) {
- orc_warn("can't dereference registers at %p for ip %p\n",
+ orc_warn("can't dereference registers at %p for ip %pB\n",
(void *)sp, (void *)orig_ip);
goto done;
}
@@ -448,7 +448,7 @@ bool unwind_next_frame(struct unwind_state *state)
case ORC_TYPE_REGS_IRET:
if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) {
- orc_warn("can't dereference iret registers at %p for ip %p\n",
+ orc_warn("can't dereference iret registers at %p for ip %pB\n",
(void *)sp, (void *)orig_ip);
goto done;
}
@@ -465,7 +465,8 @@ bool unwind_next_frame(struct unwind_state *state)
break;
default:
- orc_warn("unknown .orc_unwind entry type %d\n", orc->type);
+ orc_warn("unknown .orc_unwind entry type %d for ip %pB\n",
+ orc->type, (void *)orig_ip);
break;
}
@@ -487,7 +488,7 @@ bool unwind_next_frame(struct unwind_state *state)
break;
default:
- orc_warn("unknown BP base reg %d for ip %p\n",
+ orc_warn("unknown BP base reg %d for ip %pB\n",
orc->bp_reg, (void *)orig_ip);
goto done;
}
@@ -496,7 +497,7 @@ bool unwind_next_frame(struct unwind_state *state)
if (state->stack_info.type == prev_type &&
on_stack(&state->stack_info, (void *)state->sp, sizeof(long)) &&
state->sp <= prev_sp) {
- orc_warn("stack going in the wrong direction? ip=%p\n",
+ orc_warn("stack going in the wrong direction? ip=%pB\n",
(void *)orig_ip);
goto done;
}
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 106d4a029a8a..7a69cf053711 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3974,19 +3974,19 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu,
unsigned level, unsigned gpte)
{
/*
- * PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set
- * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
- * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
- */
- gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
-
- /*
* The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
* If it is clear, there are no large pages at this level, so clear
* PT_PAGE_SIZE_MASK in gpte if that is the case.
*/
gpte &= level - mmu->last_nonleaf_level;
+ /*
+ * PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set
+ * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
+ * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
+ */
+ gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
+
return gpte & PT_PAGE_SIZE_MASK;
}
@@ -4555,6 +4555,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
update_permission_bitmask(vcpu, context, true);
update_pkru_bitmask(vcpu, context, true);
+ update_last_nonleaf_level(vcpu, context);
reset_rsvds_bits_mask_ept(vcpu, context, execonly);
reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 86b68dc5a649..f18d1f8d332b 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -334,10 +334,11 @@ retry_walk:
--walker->level;
index = PT_INDEX(addr, walker->level);
-
table_gfn = gpte_to_gfn(pte);
offset = index * sizeof(pt_element_t);
pte_gpa = gfn_to_gpa(table_gfn) + offset;
+
+ BUG_ON(walker->level < 1);
walker->table_gfn[walker->level - 1] = table_gfn;
walker->pte_gpa[walker->level - 1] = pte_gpa;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a2b804e10c95..95a01609d7ee 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -11297,7 +11297,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
/* Same as above - no reason to call set_cr4_guest_host_mask(). */
vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
- kvm_set_cr4(vcpu, vmcs12->host_cr4);
+ vmx_set_cr4(vcpu, vmcs12->host_cr4);
nested_ept_uninit_mmu_context(vcpu);
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 72bf8c01c6e3..e1f095884386 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,5 +1,12 @@
-# Kernel does not boot with instrumentation of tlb.c.
-KCOV_INSTRUMENT_tlb.o := n
+# Kernel does not boot with instrumentation of tlb.c and mem_encrypt.c
+KCOV_INSTRUMENT_tlb.o := n
+KCOV_INSTRUMENT_mem_encrypt.o := n
+
+KASAN_SANITIZE_mem_encrypt.o := n
+
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_mem_encrypt.o = -pg
+endif
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
pat.o pgtable.o physaddr.o setup_nx.o tlb.o
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 49d9778376d7..0f3d0cea4d00 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -30,6 +30,7 @@
atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
+
static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
u16 *new_asid, bool *need_flush)
{
@@ -80,7 +81,7 @@ void leave_mm(int cpu)
return;
/* Warn if we're not lazy. */
- WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm)));
+ WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy));
switch_mm(NULL, &init_mm, NULL);
}
@@ -142,45 +143,24 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
__flush_tlb_all();
}
#endif
+ this_cpu_write(cpu_tlbstate.is_lazy, false);
if (real_prev == next) {
- VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
- next->context.ctx_id);
-
- if (cpumask_test_cpu(cpu, mm_cpumask(next))) {
- /*
- * There's nothing to do: we weren't lazy, and we
- * aren't changing our mm. We don't need to flush
- * anything, nor do we need to update CR3, CR4, or
- * LDTR.
- */
- return;
- }
-
- /* Resume remote flushes and then read tlb_gen. */
- cpumask_set_cpu(cpu, mm_cpumask(next));
- next_tlb_gen = atomic64_read(&next->context.tlb_gen);
-
- if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) <
- next_tlb_gen) {
- /*
- * Ideally, we'd have a flush_tlb() variant that
- * takes the known CR3 value as input. This would
- * be faster on Xen PV and on hypothetical CPUs
- * on which INVPCID is fast.
- */
- this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen,
- next_tlb_gen);
- write_cr3(build_cr3(next, prev_asid));
- trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
- TLB_FLUSH_ALL);
- }
+ VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
+ next->context.ctx_id);
/*
- * We just exited lazy mode, which means that CR4 and/or LDTR
- * may be stale. (Changes to the required CR4 and LDTR states
- * are not reflected in tlb_gen.)
+ * We don't currently support having a real mm loaded without
+ * our cpu set in mm_cpumask(). We have all the bookkeeping
+ * in place to figure out whether we would need to flush
+ * if our cpu were cleared in mm_cpumask(), but we don't
+ * currently use it.
*/
+ if (WARN_ON_ONCE(real_prev != &init_mm &&
+ !cpumask_test_cpu(cpu, mm_cpumask(next))))
+ cpumask_set_cpu(cpu, mm_cpumask(next));
+
+ return;
} else {
u16 new_asid;
bool need_flush;
@@ -199,10 +179,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
}
/* Stop remote flushes for the previous mm */
- if (cpumask_test_cpu(cpu, mm_cpumask(real_prev)))
- cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
-
- VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
+ VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
+ real_prev != &init_mm);
+ cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
/*
* Start remote flushes and then read tlb_gen.
@@ -233,6 +212,40 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
}
/*
+ * Please ignore the name of this function. It should be called
+ * switch_to_kernel_thread().
+ *
+ * enter_lazy_tlb() is a hint from the scheduler that we are entering a
+ * kernel thread or other context without an mm. Acceptable implementations
+ * include doing nothing whatsoever, switching to init_mm, or various clever
+ * lazy tricks to try to minimize TLB flushes.
+ *
+ * The scheduler reserves the right to call enter_lazy_tlb() several times
+ * in a row. It will notify us that we're going back to a real mm by
+ * calling switch_mm_irqs_off().
+ */
+void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+{
+ if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
+ return;
+
+ if (tlb_defer_switch_to_init_mm()) {
+ /*
+ * There's a significant optimization that may be possible
+ * here. We have accurate enough TLB flush tracking that we
+ * don't need to maintain coherence of TLB per se when we're
+ * lazy. We do, however, need to maintain coherence of
+ * paging-structure caches. We could, in principle, leave our
+ * old mm loaded and only switch to init_mm when
+ * tlb_remove_page() happens.
+ */
+ this_cpu_write(cpu_tlbstate.is_lazy, true);
+ } else {
+ switch_mm(NULL, &init_mm, NULL);
+ }
+}
+
+/*
* Call this when reinitializing a CPU. It fixes the following potential
* problems:
*
@@ -303,16 +316,20 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
/* This code cannot presently handle being reentered. */
VM_WARN_ON(!irqs_disabled());
+ if (unlikely(loaded_mm == &init_mm))
+ return;
+
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
loaded_mm->context.ctx_id);
- if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) {
+ if (this_cpu_read(cpu_tlbstate.is_lazy)) {
/*
- * We're in lazy mode -- don't flush. We can get here on
- * remote flushes due to races and on local flushes if a
- * kernel thread coincidentally flushes the mm it's lazily
- * still using.
+ * We're in lazy mode. We need to at least flush our
+ * paging-structure cache to avoid speculatively reading
+ * garbage into our TLB. Since switching to init_mm is barely
+ * slower than a minimal flush, just switch to init_mm.
*/
+ switch_mm_irqs_off(NULL, &init_mm, NULL);
return;
}
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 0e7ef69e8531..d669e9d89001 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -93,11 +93,11 @@ int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int),
int rc;
rc = cpuhp_setup_state_nocalls(CPUHP_XEN_PREPARE,
- "x86/xen/hvm_guest:prepare",
+ "x86/xen/guest:prepare",
cpu_up_prepare_cb, cpu_dead_cb);
if (rc >= 0) {
rc = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
- "x86/xen/hvm_guest:online",
+ "x86/xen/guest:online",
xen_cpu_up_online, NULL);
if (rc < 0)
cpuhp_remove_state_nocalls(CPUHP_XEN_PREPARE);