diff options
Diffstat (limited to 'arch/x86')
38 files changed, 352 insertions, 249 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5c6e7471b732..c580d8c33562 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -80,6 +80,7 @@ config X86 select HAVE_ALIGNED_STRUCT_PAGE if SLUB select HAVE_AOUT if X86_32 select HAVE_ARCH_AUDITSYSCALL + select HAVE_ARCH_HARDENED_USERCOPY select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP @@ -91,6 +92,7 @@ config X86 select HAVE_ARCH_SOFT_DIRTY if X86_64 select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select HAVE_ARCH_WITHIN_STACK_FRAMES select HAVE_EBPF_JIT if X86_64 select HAVE_CC_STACKPROTECTOR select HAVE_CMPXCHG_DOUBLE diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index fe91c25092da..77f28ce9c646 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -5,6 +5,8 @@ OBJECT_FILES_NON_STANDARD_entry_$(BITS).o := y OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y +CFLAGS_syscall_64.o += -Wno-override-init +CFLAGS_syscall_32.o += -Wno-override-init obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o obj-y += common.o diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index b846875aeea6..d172c619c449 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -288,11 +288,15 @@ return_from_SYSCALL_64: jne opportunistic_sysret_failed /* - * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET, - * restoring TF results in a trap from userspace immediately after - * SYSRET. This would cause an infinite loop whenever #DB happens - * with register state that satisfies the opportunistic SYSRET - * conditions. For example, single-stepping this user code: + * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot + * restore RF properly. If the slowpath sets it for whatever reason, we + * need to restore it correctly. + * + * SYSRET can restore TF, but unlike IRET, restoring TF results in a + * trap from userspace immediately after SYSRET. This would cause an + * infinite loop whenever #DB happens with register state that satisfies + * the opportunistic SYSRET conditions. For example, single-stepping + * this user code: * * movq $stuck_here, %rcx * pushfq @@ -601,9 +605,20 @@ apicinterrupt3 \num trace(\sym) smp_trace(\sym) .endm #endif +/* Make sure APIC interrupt handlers end up in the irqentry section: */ +#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) +# define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax" +# define POP_SECTION_IRQENTRY .popsection +#else +# define PUSH_SECTION_IRQENTRY +# define POP_SECTION_IRQENTRY +#endif + .macro apicinterrupt num sym do_sym +PUSH_SECTION_IRQENTRY apicinterrupt3 \num \sym \do_sym trace_apicinterrupt \num \sym +POP_SECTION_IRQENTRY .endm #ifdef CONFIG_SMP diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 97a69dbba649..9d35ec0cb8fc 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -100,6 +100,12 @@ static void snb_uncore_msr_init_box(struct intel_uncore_box *box) } } +static void snb_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, + SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL); +} + static void snb_uncore_msr_exit_box(struct intel_uncore_box *box) { if (box->pmu->pmu_idx == 0) @@ -127,6 +133,7 @@ static struct attribute_group snb_uncore_format_group = { static struct intel_uncore_ops snb_uncore_msr_ops = { .init_box = snb_uncore_msr_init_box, + .enable_box = snb_uncore_msr_enable_box, .exit_box = snb_uncore_msr_exit_box, .disable_event = snb_uncore_msr_disable_event, .enable_event = snb_uncore_msr_enable_event, @@ -192,6 +199,12 @@ static void skl_uncore_msr_init_box(struct intel_uncore_box *box) } } +static void skl_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + wrmsrl(SKL_UNC_PERF_GLOBAL_CTL, + SNB_UNC_GLOBAL_CTL_EN | SKL_UNC_GLOBAL_CTL_CORE_ALL); +} + static void skl_uncore_msr_exit_box(struct intel_uncore_box *box) { if (box->pmu->pmu_idx == 0) @@ -200,6 +213,7 @@ static void skl_uncore_msr_exit_box(struct intel_uncore_box *box) static struct intel_uncore_ops skl_uncore_msr_ops = { .init_box = skl_uncore_msr_init_box, + .enable_box = skl_uncore_msr_enable_box, .exit_box = skl_uncore_msr_exit_box, .disable_event = snb_uncore_msr_disable_event, .enable_event = snb_uncore_msr_enable_event, diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 824e54086e07..8aee83bcf71f 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -2626,7 +2626,7 @@ void hswep_uncore_cpu_init(void) static struct intel_uncore_type hswep_uncore_ha = { .name = "ha", - .num_counters = 5, + .num_counters = 4, .num_boxes = 2, .perf_ctr_bits = 48, SNBEP_UNCORE_PCI_COMMON_INIT(), @@ -2645,7 +2645,7 @@ static struct uncore_event_desc hswep_uncore_imc_events[] = { static struct intel_uncore_type hswep_uncore_imc = { .name = "imc", - .num_counters = 5, + .num_counters = 4, .num_boxes = 8, .perf_ctr_bits = 48, .fixed_ctr_bits = 48, @@ -2691,7 +2691,7 @@ static struct intel_uncore_type hswep_uncore_irp = { static struct intel_uncore_type hswep_uncore_qpi = { .name = "qpi", - .num_counters = 5, + .num_counters = 4, .num_boxes = 3, .perf_ctr_bits = 48, .perf_ctr = SNBEP_PCI_PMON_CTR0, @@ -2773,7 +2773,7 @@ static struct event_constraint hswep_uncore_r3qpi_constraints[] = { static struct intel_uncore_type hswep_uncore_r3qpi = { .name = "r3qpi", - .num_counters = 4, + .num_counters = 3, .num_boxes = 3, .perf_ctr_bits = 44, .constraints = hswep_uncore_r3qpi_constraints, @@ -2972,7 +2972,7 @@ static struct intel_uncore_type bdx_uncore_ha = { static struct intel_uncore_type bdx_uncore_imc = { .name = "imc", - .num_counters = 5, + .num_counters = 4, .num_boxes = 8, .perf_ctr_bits = 48, .fixed_ctr_bits = 48, diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index f5befd4945f2..124357773ffa 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -135,6 +135,7 @@ extern void init_apic_mappings(void); void register_lapic_address(unsigned long address); extern void setup_boot_APIC_clock(void); extern void setup_secondary_APIC_clock(void); +extern void lapic_update_tsc_freq(void); extern int APIC_init_uniprocessor(void); #ifdef CONFIG_X86_64 @@ -170,6 +171,7 @@ static inline void init_apic_mappings(void) { } static inline void disable_local_APIC(void) { } # define setup_boot_APIC_clock x86_init_noop # define setup_secondary_APIC_clock x86_init_noop +static inline void lapic_update_tsc_freq(void) { } #endif /* !CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_X2APIC diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 7178043b0e1d..59405a248fc2 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -22,10 +22,6 @@ typedef struct { #ifdef CONFIG_SMP unsigned int irq_resched_count; unsigned int irq_call_count; - /* - * irq_tlb_count is double-counted in irq_call_count, so it must be - * subtracted from irq_call_count when displaying irq_call_count - */ unsigned int irq_tlb_count; #endif #ifdef CONFIG_X86_THERMAL_VECTOR diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 223042086f4e..737da62bfeb0 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -5,10 +5,10 @@ struct x86_mapping_info { void *(*alloc_pgt_page)(void *); /* allocate buf for page table */ void *context; /* context for alloc_pgt_page */ unsigned long pmd_flag; /* page flag for PMD entry */ - bool kernel_mapping; /* kernel mapping or ident mapping */ + unsigned long offset; /* ident mapping offset */ }; int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, - unsigned long addr, unsigned long end); + unsigned long pstart, unsigned long pend); #endif /* _ASM_X86_INIT_H */ diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 7e8ec7ae10fa..1cc82ece9ac1 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -145,7 +145,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; } * * | ... | 11| 10| 9|8|7|6|5| 4| 3|2|1|0| <- bit number * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U|W|P| <- bit names - * | OFFSET (14->63) | TYPE (10-13) |0|X|X|X| X| X|X|X|0| <- swp entry + * | OFFSET (14->63) | TYPE (9-13) |0|X|X|X| X| X|X|X|0| <- swp entry * * G (8) is aliased and used as a PROT_NONE indicator for * !present ptes. We need to start storing swap entries above @@ -156,7 +156,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; } #define SWP_TYPE_FIRST_BIT (_PAGE_BIT_PROTNONE + 1) #define SWP_TYPE_BITS 5 /* Place the offset above the type: */ -#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS + 1) +#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS) #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index 9c6b890d5e7a..b2988c0ed829 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -58,7 +58,15 @@ extern unsigned char boot_gdt[]; extern unsigned char secondary_startup_64[]; #endif +static inline size_t real_mode_size_needed(void) +{ + if (real_mode_header) + return 0; /* already allocated. */ + + return ALIGN(real_mode_blob_end - real_mode_blob, PAGE_SIZE); +} + +void set_real_mode_mem(phys_addr_t mem, size_t size); void reserve_real_mode(void); -void setup_real_mode(void); #endif /* _ARCH_X86_REALMODE_H */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 84b59846154a..8b7c8d8e0852 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -176,6 +176,50 @@ static inline unsigned long current_stack_pointer(void) return sp; } +/* + * Walks up the stack frames to make sure that the specified object is + * entirely contained by a single stack frame. + * + * Returns: + * 1 if within a frame + * -1 if placed across a frame boundary (or outside stack) + * 0 unable to determine (no frame pointers, etc) + */ +static inline int arch_within_stack_frames(const void * const stack, + const void * const stackend, + const void *obj, unsigned long len) +{ +#if defined(CONFIG_FRAME_POINTER) + const void *frame = NULL; + const void *oldframe; + + oldframe = __builtin_frame_address(1); + if (oldframe) + frame = __builtin_frame_address(2); + /* + * low ----------------------------------------------> high + * [saved bp][saved ip][args][local vars][saved bp][saved ip] + * ^----------------^ + * allow copies only within here + */ + while (stack <= frame && frame < stackend) { + /* + * If obj + len extends past the last frame, this + * check won't pass and the next frame will be 0, + * causing us to bail out and correctly report + * the copy as invalid. + */ + if (obj + len <= frame) + return obj >= oldframe + 2 * sizeof(void *) ? 1 : -1; + oldframe = frame; + frame = *(const void * const *)frame; + } + return -1; +#else + return 0; +#endif +} + #else /* !__ASSEMBLY__ */ #ifdef CONFIG_X86_64 diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 4e5be94e079a..6fa85944af83 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -135,7 +135,14 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) static inline void __native_flush_tlb(void) { + /* + * If current->mm == NULL then we borrow a mm which may change during a + * task switch and therefore we must not be preempted while we write CR3 + * back: + */ + preempt_disable(); native_write_cr3(native_read_cr3()); + preempt_enable(); } static inline void __native_flush_tlb_global_irq_disabled(void) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index c03bfb68c503..a0ae610b9280 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -761,9 +761,10 @@ copy_from_user(void *to, const void __user *from, unsigned long n) * case, and do only runtime checking for non-constant sizes. */ - if (likely(sz < 0 || sz >= n)) + if (likely(sz < 0 || sz >= n)) { + check_object_size(to, n, false); n = _copy_from_user(to, from, n); - else if(__builtin_constant_p(n)) + } else if (__builtin_constant_p(n)) copy_from_user_overflow(); else __copy_from_user_overflow(sz, n); @@ -781,9 +782,10 @@ copy_to_user(void __user *to, const void *from, unsigned long n) might_fault(); /* See the comment in copy_from_user() above. */ - if (likely(sz < 0 || sz >= n)) + if (likely(sz < 0 || sz >= n)) { + check_object_size(from, n, true); n = _copy_to_user(to, from, n); - else if(__builtin_constant_p(n)) + } else if (__builtin_constant_p(n)) copy_to_user_overflow(); else __copy_to_user_overflow(sz, n); @@ -812,21 +814,21 @@ copy_to_user(void __user *to, const void *from, unsigned long n) #define user_access_begin() __uaccess_begin() #define user_access_end() __uaccess_end() -#define unsafe_put_user(x, ptr) \ -({ \ +#define unsafe_put_user(x, ptr, err_label) \ +do { \ int __pu_err; \ __put_user_size((x), (ptr), sizeof(*(ptr)), __pu_err, -EFAULT); \ - __builtin_expect(__pu_err, 0); \ -}) + if (unlikely(__pu_err)) goto err_label; \ +} while (0) -#define unsafe_get_user(x, ptr) \ -({ \ +#define unsafe_get_user(x, ptr, err_label) \ +do { \ int __gu_err; \ unsigned long __gu_val; \ __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err, -EFAULT); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ - __builtin_expect(__gu_err, 0); \ -}) + if (unlikely(__gu_err)) goto err_label; \ +} while (0) #endif /* _ASM_X86_UACCESS_H */ diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 4b32da24faaf..7d3bdd1ed697 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -37,6 +37,7 @@ unsigned long __must_check __copy_from_user_ll_nocache_nozero static __always_inline unsigned long __must_check __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) { + check_object_size(from, n, true); return __copy_to_user_ll(to, from, n); } @@ -95,6 +96,7 @@ static __always_inline unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { might_fault(); + check_object_size(to, n, false); if (__builtin_constant_p(n)) { unsigned long ret; diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 2eac2aa3e37f..673059a109fe 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -54,6 +54,7 @@ int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size) { int ret = 0; + check_object_size(dst, size, false); if (!__builtin_constant_p(size)) return copy_user_generic(dst, (__force void *)src, size); switch (size) { @@ -119,6 +120,7 @@ int __copy_to_user_nocheck(void __user *dst, const void *src, unsigned size) { int ret = 0; + check_object_size(src, size, true); if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst, src, size); switch (size) { diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index c852590254d5..e652a7cc6186 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -79,7 +79,7 @@ struct uv_gam_range_entry { u16 nasid; /* HNasid */ u16 sockid; /* Socket ID, high bits of APIC ID */ u16 pnode; /* Index to MMR and GRU spaces */ - u32 pxm; /* ACPI proximity domain number */ + u32 unused2; u32 limit; /* PA bits 56:26 (UV_GAM_RANGE_SHFT) */ }; @@ -88,7 +88,8 @@ struct uv_gam_range_entry { #define UV_SYSTAB_VERSION_UV4 0x400 /* UV4 BIOS base version */ #define UV_SYSTAB_VERSION_UV4_1 0x401 /* + gpa_shift */ #define UV_SYSTAB_VERSION_UV4_2 0x402 /* + TYPE_NVRAM/WINDOW/MBOX */ -#define UV_SYSTAB_VERSION_UV4_LATEST UV_SYSTAB_VERSION_UV4_2 +#define UV_SYSTAB_VERSION_UV4_3 0x403 /* - GAM Range PXM Value */ +#define UV_SYSTAB_VERSION_UV4_LATEST UV_SYSTAB_VERSION_UV4_3 #define UV_SYSTAB_TYPE_UNUSED 0 /* End of table (offset == 0) */ #define UV_SYSTAB_TYPE_GAM_PARAMS 1 /* GAM PARAM conversions */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 20abd912f0e4..cea4fc19e844 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -313,7 +313,7 @@ int lapic_get_maxlvt(void) /* Clock divisor */ #define APIC_DIVISOR 16 -#define TSC_DIVISOR 32 +#define TSC_DIVISOR 8 /* * This function sets up the local APIC timer, with a timeout of @@ -565,13 +565,37 @@ static void setup_APIC_timer(void) CLOCK_EVT_FEAT_DUMMY); levt->set_next_event = lapic_next_deadline; clockevents_config_and_register(levt, - (tsc_khz / TSC_DIVISOR) * 1000, + tsc_khz * (1000 / TSC_DIVISOR), 0xF, ~0UL); } else clockevents_register_device(levt); } /* + * Install the updated TSC frequency from recalibration at the TSC + * deadline clockevent devices. + */ +static void __lapic_update_tsc_freq(void *info) +{ + struct clock_event_device *levt = this_cpu_ptr(&lapic_events); + + if (!this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + return; + + clockevents_update_freq(levt, tsc_khz * (1000 / TSC_DIVISOR)); +} + +void lapic_update_tsc_freq(void) +{ + /* + * The clockevent device's ->mult and ->shift can both be + * changed. In order to avoid races, schedule the frequency + * update code on each CPU. + */ + on_each_cpu(__lapic_update_tsc_freq, NULL, 0); +} + +/* * In this functions we calibrate APIC bus clocks to the external timer. * * We want to do the calibration only once since we want to have local timer diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 6368fa69d2af..54f35d988025 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -155,7 +155,7 @@ static void init_x2apic_ldr(void) /* * At CPU state changes, update the x2apic cluster sibling info. */ -int x2apic_prepare_cpu(unsigned int cpu) +static int x2apic_prepare_cpu(unsigned int cpu) { if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL)) return -ENOMEM; @@ -168,7 +168,7 @@ int x2apic_prepare_cpu(unsigned int cpu) return 0; } -int x2apic_dead_cpu(unsigned int this_cpu) +static int x2apic_dead_cpu(unsigned int this_cpu) { int cpu; @@ -186,13 +186,18 @@ int x2apic_dead_cpu(unsigned int this_cpu) static int x2apic_cluster_probe(void) { int cpu = smp_processor_id(); + int ret; if (!x2apic_mode) return 0; + ret = cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "X2APIC_PREPARE", + x2apic_prepare_cpu, x2apic_dead_cpu); + if (ret < 0) { + pr_err("Failed to register X2APIC_PREPARE\n"); + return 0; + } cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, cpu)); - cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "X2APIC_PREPARE", - x2apic_prepare_cpu, x2apic_dead_cpu); return 1; } diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 09b59adaea3f..cb0673c1e940 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -223,6 +223,11 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) if (strncmp(oem_id, "SGI", 3) != 0) return 0; + if (numa_off) { + pr_err("UV: NUMA is off, disabling UV support\n"); + return 0; + } + /* Setup early hub type field in uv_hub_info for Node 0 */ uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0; @@ -325,7 +330,7 @@ static __init void build_uv_gr_table(void) struct uv_gam_range_entry *gre = uv_gre_table; struct uv_gam_range_s *grt; unsigned long last_limit = 0, ram_limit = 0; - int bytes, i, sid, lsid = -1; + int bytes, i, sid, lsid = -1, indx = 0, lindx = -1; if (!gre) return; @@ -356,11 +361,12 @@ static __init void build_uv_gr_table(void) } sid = gre->sockid - _min_socket; if (lsid < sid) { /* new range */ - grt = &_gr_table[sid]; - grt->base = lsid; + grt = &_gr_table[indx]; + grt->base = lindx; grt->nasid = gre->nasid; grt->limit = last_limit = gre->limit; lsid = sid; + lindx = indx++; continue; } if (lsid == sid && !ram_limit) { /* update range */ @@ -371,7 +377,7 @@ static __init void build_uv_gr_table(void) } if (!ram_limit) { /* non-contiguous ram range */ grt++; - grt->base = sid - 1; + grt->base = lindx; grt->nasid = gre->nasid; grt->limit = last_limit = gre->limit; continue; @@ -1155,19 +1161,18 @@ static void __init decode_gam_rng_tbl(unsigned long ptr) for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { if (!index) { pr_info("UV: GAM Range Table...\n"); - pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s %3s\n", + pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s\n", "Range", "", "Size", "Type", "NASID", - "SID", "PN", "PXM"); + "SID", "PN"); } pr_info( - "UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x %3d\n", + "UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x\n", index++, (unsigned long)lgre << UV_GAM_RANGE_SHFT, (unsigned long)gre->limit << UV_GAM_RANGE_SHFT, ((unsigned long)(gre->limit - lgre)) >> (30 - UV_GAM_RANGE_SHFT), /* 64M -> 1G */ - gre->type, gre->nasid, gre->sockid, - gre->pnode, gre->pxm); + gre->type, gre->nasid, gre->sockid, gre->pnode); lgre = gre->limit; if (sock_min > gre->sockid) @@ -1286,7 +1291,7 @@ static void __init build_socket_tables(void) _pnode_to_socket[i] = SOCK_EMPTY; /* fill in pnode/node/addr conversion list values */ - pr_info("UV: GAM Building socket/pnode/pxm conversion tables\n"); + pr_info("UV: GAM Building socket/pnode conversion tables\n"); for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { if (gre->type == UV_GAM_RANGE_TYPE_HOLE) continue; @@ -1294,20 +1299,18 @@ static void __init build_socket_tables(void) if (_socket_to_pnode[i] != SOCK_EMPTY) continue; /* duplicate */ _socket_to_pnode[i] = gre->pnode; - _socket_to_node[i] = gre->pxm; i = gre->pnode - minpnode; _pnode_to_socket[i] = gre->sockid; pr_info( - "UV: sid:%02x type:%d nasid:%04x pn:%02x pxm:%2d pn2s:%2x\n", + "UV: sid:%02x type:%d nasid:%04x pn:%02x pn2s:%2x\n", gre->sockid, gre->type, gre->nasid, _socket_to_pnode[gre->sockid - minsock], - _socket_to_node[gre->sockid - minsock], _pnode_to_socket[gre->pnode - minpnode]); } - /* check socket -> node values */ + /* Set socket -> node values */ lnid = -1; for_each_present_cpu(cpu) { int nid = cpu_to_node(cpu); @@ -1318,14 +1321,9 @@ static void __init build_socket_tables(void) lnid = nid; apicid = per_cpu(x86_cpu_to_apicid, cpu); sockid = apicid >> uv_cpuid.socketid_shift; - i = sockid - minsock; - - if (nid != _socket_to_node[i]) { - pr_warn( - "UV: %02x: type:%d socket:%02x PXM:%02x != node:%2d\n", - i, sockid, gre->type, _socket_to_node[i], nid); - _socket_to_node[i] = nid; - } + _socket_to_node[sockid - minsock] = nid; + pr_info("UV: sid:%02x: apicid:%04x node:%2d\n", + sockid, apicid, nid); } /* Setup physical blade to pnode translation from GAM Range Table */ diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 27a0228c9cae..b816971f5da4 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -355,6 +355,7 @@ void load_ucode_amd_ap(void) unsigned int cpu = smp_processor_id(); struct equiv_cpu_entry *eq; struct microcode_amd *mc; + u8 *cont = container; u32 rev, eax; u16 eq_id; @@ -371,8 +372,11 @@ void load_ucode_amd_ap(void) if (check_current_patch_level(&rev, false)) return; + /* Add CONFIG_RANDOMIZE_MEMORY offset. */ + cont += PAGE_OFFSET - __PAGE_OFFSET_BASE; + eax = cpuid_eax(0x00000001); - eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ); + eq = (struct equiv_cpu_entry *)(cont + CONTAINER_HDR_SZ); eq_id = find_equiv_id(eq, eax); if (!eq_id) @@ -434,6 +438,9 @@ int __init save_microcode_in_initrd_amd(void) else container = cont_va; + /* Add CONFIG_RANDOMIZE_MEMORY offset. */ + container += PAGE_OFFSET - __PAGE_OFFSET_BASE; + eax = cpuid_eax(0x00000001); eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 680049aa4593..01567aa87503 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -866,105 +866,17 @@ const void *get_xsave_field_ptr(int xsave_state) return get_xsave_addr(&fpu->state.xsave, xsave_state); } - -/* - * Set xfeatures (aka XSTATE_BV) bit for a feature that we want - * to take out of its "init state". This will ensure that an - * XRSTOR actually restores the state. - */ -static void fpu__xfeature_set_non_init(struct xregs_state *xsave, - int xstate_feature_mask) -{ - xsave->header.xfeatures |= xstate_feature_mask; -} - -/* - * This function is safe to call whether the FPU is in use or not. - * - * Note that this only works on the current task. - * - * Inputs: - * @xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP, - * XFEATURE_MASK_SSE, etc...) - * @xsave_state_ptr: a pointer to a copy of the state that you would - * like written in to the current task's FPU xsave state. This pointer - * must not be located in the current tasks's xsave area. - * Output: - * address of the state in the xsave area or NULL if the state - * is not present or is in its 'init state'. - */ -static void fpu__xfeature_set_state(int xstate_feature_mask, - void *xstate_feature_src, size_t len) -{ - struct xregs_state *xsave = ¤t->thread.fpu.state.xsave; - struct fpu *fpu = ¤t->thread.fpu; - void *dst; - - if (!boot_cpu_has(X86_FEATURE_XSAVE)) { - WARN_ONCE(1, "%s() attempted with no xsave support", __func__); - return; - } - - /* - * Tell the FPU code that we need the FPU state to be in - * 'fpu' (not in the registers), and that we need it to - * be stable while we write to it. - */ - fpu__current_fpstate_write_begin(); - - /* - * This method *WILL* *NOT* work for compact-format - * buffers. If the 'xstate_feature_mask' is unset in - * xcomp_bv then we may need to move other feature state - * "up" in the buffer. - */ - if (xsave->header.xcomp_bv & xstate_feature_mask) { - WARN_ON_ONCE(1); - goto out; - } - - /* find the location in the xsave buffer of the desired state */ - dst = __raw_xsave_addr(&fpu->state.xsave, xstate_feature_mask); - - /* - * Make sure that the pointer being passed in did not - * come from the xsave buffer itself. - */ - WARN_ONCE(xstate_feature_src == dst, "set from xsave buffer itself"); - - /* put the caller-provided data in the location */ - memcpy(dst, xstate_feature_src, len); - - /* - * Mark the xfeature so that the CPU knows there is state - * in the buffer now. - */ - fpu__xfeature_set_non_init(xsave, xstate_feature_mask); -out: - /* - * We are done writing to the 'fpu'. Reenable preeption - * and (possibly) move the fpstate back in to the fpregs. - */ - fpu__current_fpstate_write_end(); -} - #define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2) #define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1) /* - * This will go out and modify the XSAVE buffer so that PKRU is - * set to a particular state for access to 'pkey'. - * - * PKRU state does affect kernel access to user memory. We do - * not modfiy PKRU *itself* here, only the XSAVE state that will - * be restored in to PKRU when we return back to userspace. + * This will go out and modify PKRU register to set the access + * rights for @pkey to @init_val. */ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val) { - struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; - struct pkru_state *old_pkru_state; - struct pkru_state new_pkru_state; + u32 old_pkru; int pkey_shift = (pkey * PKRU_BITS_PER_PKEY); u32 new_pkru_bits = 0; @@ -974,6 +886,15 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, */ if (!boot_cpu_has(X86_FEATURE_OSPKE)) return -EINVAL; + /* + * For most XSAVE components, this would be an arduous task: + * brining fpstate up to date with fpregs, updating fpstate, + * then re-populating fpregs. But, for components that are + * never lazily managed, we can just access the fpregs + * directly. PKRU is never managed lazily, so we can just + * manipulate it directly. Make sure it stays that way. + */ + WARN_ON_ONCE(!use_eager_fpu()); /* Set the bits we need in PKRU: */ if (init_val & PKEY_DISABLE_ACCESS) @@ -984,37 +905,12 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, /* Shift the bits in to the correct place in PKRU for pkey: */ new_pkru_bits <<= pkey_shift; - /* Locate old copy of the state in the xsave buffer: */ - old_pkru_state = get_xsave_addr(xsave, XFEATURE_MASK_PKRU); - - /* - * When state is not in the buffer, it is in the init - * state, set it manually. Otherwise, copy out the old - * state. - */ - if (!old_pkru_state) - new_pkru_state.pkru = 0; - else - new_pkru_state.pkru = old_pkru_state->pkru; - - /* Mask off any old bits in place: */ - new_pkru_state.pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift); - - /* Set the newly-requested bits: */ - new_pkru_state.pkru |= new_pkru_bits; - - /* - * We could theoretically live without zeroing pkru.pad. - * The current XSAVE feature state definition says that - * only bytes 0->3 are used. But we do not want to - * chance leaking kernel stack out to userspace in case a - * memcpy() of the whole xsave buffer was done. - * - * They're in the same cacheline anyway. - */ - new_pkru_state.pad = 0; + /* Get old PKRU and mask off any old bits in place: */ + old_pkru = read_pkru(); + old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift); - fpu__xfeature_set_state(XFEATURE_MASK_PKRU, &new_pkru_state, sizeof(new_pkru_state)); + /* Write old part along with new part: */ + write_pkru(old_pkru | new_pkru_bits); return 0; } diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 2dda0bc4576e..f16c55bfc090 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -25,8 +25,6 @@ static void __init i386_default_early_setup(void) /* Initialize 32bit specific setup functions */ x86_init.resources.reserve_resources = i386_reserve_resources; x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; - - reserve_bios_regions(); } asmlinkage __visible void __init i386_start_kernel(void) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 99d48e7d2974..54a2372f5dbb 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -183,7 +183,6 @@ void __init x86_64_start_reservations(char *real_mode_data) copy_bootdata(__va(real_mode_data)); x86_early_init_platform_quirks(); - reserve_bios_regions(); switch (boot_params.hdr.hardware_subarch) { case X86_SUBARCH_INTEL_MID: diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ed16e58658a4..c6dfd801df97 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -1242,7 +1242,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) memset(&curr_time, 0, sizeof(struct rtc_time)); if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) - mc146818_set_time(&curr_time); + mc146818_get_time(&curr_time); if (hpet_rtc_flags & RTC_UIE && curr_time.tm_sec != hpet_prev_update_sec) { diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 61521dc19c10..9f669fdd2010 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -102,8 +102,7 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_puts(p, " Rescheduling interrupts\n"); seq_printf(p, "%*s: ", prec, "CAL"); for_each_online_cpu(j) - seq_printf(p, "%10u ", irq_stats(j)->irq_call_count - - irq_stats(j)->irq_tlb_count); + seq_printf(p, "%10u ", irq_stats(j)->irq_call_count); seq_puts(p, " Function call interrupts\n"); seq_printf(p, "%*s: ", prec, "TLB"); for_each_online_cpu(j) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 991b77986d57..0fa60f5f5a16 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -936,8 +936,6 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.arch_setup(); - kernel_randomize_memory(); - iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; setup_memory_map(); parse_setup_data(); @@ -1055,6 +1053,12 @@ void __init setup_arch(char **cmdline_p) max_possible_pfn = max_pfn; + /* + * Define random base addresses for memory sections after max_pfn is + * defined and before each memory section base is used. + */ + kernel_randomize_memory(); + #ifdef CONFIG_X86_32 /* max_low_pfn get updated here */ find_low_pfn_range(); @@ -1097,6 +1101,8 @@ void __init setup_arch(char **cmdline_p) efi_find_mirror(); } + reserve_bios_regions(); + /* * The EFI specification says that boot service code won't be called * after ExitBootServices(). This is, in fact, a lie. @@ -1125,7 +1131,15 @@ void __init setup_arch(char **cmdline_p) early_trap_pf_init(); - setup_real_mode(); + /* + * Update mmu_cr4_features (and, indirectly, trampoline_cr4_features) + * with the current CR4 value. This may not be necessary, but + * auditing all the early-boot CR4 manipulation would be needed to + * rule it out. + */ + if (boot_cpu_data.cpuid_level >= 0) + /* A CPU has %cr4 if and only if it has CPUID. */ + mmu_cr4_features = __read_cr4(); memblock_set_current_limit(get_max_mapped()); @@ -1174,13 +1188,6 @@ void __init setup_arch(char **cmdline_p) kasan_init(); - if (boot_cpu_data.cpuid_level >= 0) { - /* A CPU has %cr4 if and only if it has CPUID */ - mmu_cr4_features = __read_cr4(); - if (trampoline_cr4_features) - *trampoline_cr4_features = mmu_cr4_features; - } - #ifdef CONFIG_X86_32 /* sync back kernel address range */ clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2a6e84a30a54..4296beb8fdd3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -100,10 +100,11 @@ EXPORT_PER_CPU_SYMBOL(cpu_info); /* Logical package management. We might want to allocate that dynamically */ static int *physical_to_logical_pkg __read_mostly; static unsigned long *physical_package_map __read_mostly;; -static unsigned long *logical_package_map __read_mostly; static unsigned int max_physical_pkg_id __read_mostly; unsigned int __max_logical_packages __read_mostly; EXPORT_SYMBOL(__max_logical_packages); +static unsigned int logical_packages __read_mostly; +static bool logical_packages_frozen __read_mostly; /* Maximum number of SMT threads on any online core */ int __max_smt_threads __read_mostly; @@ -277,14 +278,14 @@ int topology_update_package_map(unsigned int apicid, unsigned int cpu) if (test_and_set_bit(pkg, physical_package_map)) goto found; - new = find_first_zero_bit(logical_package_map, __max_logical_packages); - if (new >= __max_logical_packages) { + if (logical_packages_frozen) { physical_to_logical_pkg[pkg] = -1; - pr_warn("APIC(%x) Package %u exceeds logical package map\n", + pr_warn("APIC(%x) Package %u exceeds logical package max\n", apicid, pkg); return -ENOSPC; } - set_bit(new, logical_package_map); + + new = logical_packages++; pr_info("APIC(%x) Converting physical %u to logical package %u\n", apicid, pkg, new); physical_to_logical_pkg[pkg] = new; @@ -341,6 +342,7 @@ static void __init smp_init_package_map(void) } __max_logical_packages = DIV_ROUND_UP(total_cpus, ncpus); + logical_packages = 0; /* * Possibly larger than what we need as the number of apic ids per @@ -352,10 +354,6 @@ static void __init smp_init_package_map(void) memset(physical_to_logical_pkg, 0xff, size); size = BITS_TO_LONGS(max_physical_pkg_id) * sizeof(unsigned long); physical_package_map = kzalloc(size, GFP_KERNEL); - size = BITS_TO_LONGS(__max_logical_packages) * sizeof(unsigned long); - logical_package_map = kzalloc(size, GFP_KERNEL); - - pr_info("Max logical packages: %u\n", __max_logical_packages); for_each_present_cpu(cpu) { unsigned int apicid = apic->cpu_present_to_apicid(cpu); @@ -369,6 +367,15 @@ static void __init smp_init_package_map(void) set_cpu_possible(cpu, false); set_cpu_present(cpu, false); } + + if (logical_packages > __max_logical_packages) { + pr_warn("Detected more packages (%u), then computed by BIOS data (%u).\n", + logical_packages, __max_logical_packages); + logical_packages_frozen = true; + __max_logical_packages = logical_packages; + } + + pr_info("Max logical packages: %u\n", __max_logical_packages); } void __init smp_store_boot_cpu_info(void) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 1ef87e887051..78b9cb5a26af 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -22,6 +22,7 @@ #include <asm/nmi.h> #include <asm/x86_init.h> #include <asm/geode.h> +#include <asm/apic.h> unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -1249,6 +1250,9 @@ static void tsc_refine_calibration_work(struct work_struct *work) (unsigned long)tsc_khz / 1000, (unsigned long)tsc_khz % 1000); + /* Inform the TSC deadline clockevent devices about the recalibration */ + lapic_update_tsc_freq(); + out: if (boot_cpu_has(X86_FEATURE_ART)) art_related_clocksource = &clocksource_tsc; diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 6c1ff31d99ff..495c776de4b4 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -357,20 +357,22 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) *cursor &= 0xfe; } /* - * Similar treatment for VEX3 prefix. - * TODO: add XOP/EVEX treatment when insn decoder supports them + * Similar treatment for VEX3/EVEX prefix. + * TODO: add XOP treatment when insn decoder supports them */ - if (insn->vex_prefix.nbytes == 3) { + if (insn->vex_prefix.nbytes >= 3) { /* * vex2: c5 rvvvvLpp (has no b bit) * vex3/xop: c4/8f rxbmmmmm wvvvvLpp * evex: 62 rxbR00mm wvvvv1pp zllBVaaa - * (evex will need setting of both b and x since - * in non-sib encoding evex.x is 4th bit of MODRM.rm) - * Setting VEX3.b (setting because it has inverted meaning): + * Setting VEX3.b (setting because it has inverted meaning). + * Setting EVEX.x since (in non-SIB encoding) EVEX.x + * is the 4th bit of MODRM.rm, and needs the same treatment. + * For VEX3-encoded insns, VEX3.x value has no effect in + * non-SIB encoding, the change is superfluous but harmless. */ cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1; - *cursor |= 0x20; + *cursor |= 0x60; } /* @@ -415,12 +417,10 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) reg = MODRM_REG(insn); /* Fetch modrm.reg */ reg2 = 0xff; /* Fetch vex.vvvv */ - if (insn->vex_prefix.nbytes == 2) - reg2 = insn->vex_prefix.bytes[1]; - else if (insn->vex_prefix.nbytes == 3) + if (insn->vex_prefix.nbytes) reg2 = insn->vex_prefix.bytes[2]; /* - * TODO: add XOP, EXEV vvvv reading. + * TODO: add XOP vvvv reading. * * vex.vvvv field is in bits 6-3, bits are inverted. * But in 32-bit mode, high-order bit may be ignored. diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S index 02de3d74d2c5..8a602a1e404a 100644 --- a/arch/x86/lib/hweight.S +++ b/arch/x86/lib/hweight.S @@ -35,6 +35,7 @@ ENDPROC(__sw_hweight32) ENTRY(__sw_hweight64) #ifdef CONFIG_X86_64 + pushq %rdi pushq %rdx movq %rdi, %rdx # w -> t @@ -60,6 +61,7 @@ ENTRY(__sw_hweight64) shrq $56, %rax # w = w_tmp >> 56 popq %rdx + popq %rdi ret #else /* CONFIG_X86_32 */ /* We're getting an u64 arg in (%eax,%edx): unsigned long hweight64(__u64 w) */ diff --git a/arch/x86/lib/kaslr.c b/arch/x86/lib/kaslr.c index f7dfeda83e5c..121f59c6ee54 100644 --- a/arch/x86/lib/kaslr.c +++ b/arch/x86/lib/kaslr.c @@ -19,7 +19,7 @@ #include <asm/cpufeature.h> #include <asm/setup.h> -#define debug_putstr(v) early_printk(v) +#define debug_putstr(v) early_printk("%s", v) #define has_cpuflag(f) boot_cpu_has(f) #define get_boot_seed() kaslr_offset() #endif diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index ec21796ac5fd..4473cb4f8b90 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -3,15 +3,17 @@ * included by both the compressed kernel and the regular kernel. */ -static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page, +static void ident_pmd_init(struct x86_mapping_info *info, pmd_t *pmd_page, unsigned long addr, unsigned long end) { addr &= PMD_MASK; for (; addr < end; addr += PMD_SIZE) { pmd_t *pmd = pmd_page + pmd_index(addr); - if (!pmd_present(*pmd)) - set_pmd(pmd, __pmd(addr | pmd_flag)); + if (pmd_present(*pmd)) + continue; + + set_pmd(pmd, __pmd((addr - info->offset) | info->pmd_flag)); } } @@ -30,13 +32,13 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, if (pud_present(*pud)) { pmd = pmd_offset(pud, 0); - ident_pmd_init(info->pmd_flag, pmd, addr, next); + ident_pmd_init(info, pmd, addr, next); continue; } pmd = (pmd_t *)info->alloc_pgt_page(info->context); if (!pmd) return -ENOMEM; - ident_pmd_init(info->pmd_flag, pmd, addr, next); + ident_pmd_init(info, pmd, addr, next); set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); } @@ -44,14 +46,15 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, } int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, - unsigned long addr, unsigned long end) + unsigned long pstart, unsigned long pend) { + unsigned long addr = pstart + info->offset; + unsigned long end = pend + info->offset; unsigned long next; int result; - int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0; for (; addr < end; addr = next) { - pgd_t *pgd = pgd_page + pgd_index(addr) + off; + pgd_t *pgd = pgd_page + pgd_index(addr); pud_t *pud; next = (addr & PGDIR_MASK) + PGDIR_SIZE; diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 620928903be3..d28a2d741f9e 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -122,8 +122,18 @@ __ref void *alloc_low_pages(unsigned int num) return __va(pfn << PAGE_SHIFT); } -/* need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS */ -#define INIT_PGT_BUF_SIZE (6 * PAGE_SIZE) +/* + * By default need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS. + * With KASLR memory randomization, depending on the machine e820 memory + * and the PUD alignment. We may need twice more pages when KASLR memory + * randomization is enabled. + */ +#ifndef CONFIG_RANDOMIZE_MEMORY +#define INIT_PGD_PAGE_COUNT 6 +#else +#define INIT_PGD_PAGE_COUNT 12 +#endif +#define INIT_PGT_BUF_SIZE (INIT_PGD_PAGE_COUNT * PAGE_SIZE) RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE); void __init early_alloc_pgt_buf(void) { diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 26dccd6c0df1..ec8654f117d8 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -97,7 +97,7 @@ void __init kernel_randomize_memory(void) * add padding if needed (especially for memory hotplug support). */ BUG_ON(kaslr_regions[0].base != &page_offset_base); - memory_tb = ((max_pfn << PAGE_SHIFT) >> TB_SHIFT) + + memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) + CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING; /* Adapt phyiscal memory region size based on available memory */ diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 4480c06cade7..89d1146f5a6f 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -254,6 +254,7 @@ void __init efi_free_boot_services(void) for_each_efi_memory_desc(md) { unsigned long long start = md->phys_addr; unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; + size_t rm_size; if (md->type != EFI_BOOT_SERVICES_CODE && md->type != EFI_BOOT_SERVICES_DATA) @@ -263,6 +264,26 @@ void __init efi_free_boot_services(void) if (md->attribute & EFI_MEMORY_RUNTIME) continue; + /* + * Nasty quirk: if all sub-1MB memory is used for boot + * services, we can get here without having allocated the + * real mode trampoline. It's too late to hand boot services + * memory back to the memblock allocator, so instead + * try to manually allocate the trampoline if needed. + * + * I've seen this on a Dell XPS 13 9350 with firmware + * 1.4.4 with SGX enabled booting Linux via Fedora 24's + * grub2-efi on a hard disk. (And no, I don't know why + * this happened, but Linux should still try to boot rather + * panicing early.) + */ + rm_size = real_mode_size_needed(); + if (rm_size && (start + rm_size) < (1<<20) && size >= rm_size) { + set_real_mode_mem(start, rm_size); + start += rm_size; + size -= rm_size; + } + free_bootmem_late(start, size); } diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index 66b2166ea4a1..23f2f3e41c7f 100644 --- a/arch/x86/platform/uv/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c @@ -187,7 +187,8 @@ EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target); void uv_bios_init(void) { uv_systab = NULL; - if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || !efi.uv_systab) { + if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || + !efi.uv_systab || efi_runtime_disabled()) { pr_crit("UV: UVsystab: missing\n"); return; } @@ -199,12 +200,14 @@ void uv_bios_init(void) return; } + /* Starting with UV4 the UV systab size is variable */ if (uv_systab->revision >= UV_SYSTAB_VERSION_UV4) { + int size = uv_systab->size; + iounmap(uv_systab); - uv_systab = ioremap(efi.uv_systab, uv_systab->size); + uv_systab = ioremap(efi.uv_systab, size); if (!uv_systab) { - pr_err("UV: UVsystab: ioremap(%d) failed!\n", - uv_systab->size); + pr_err("UV: UVsystab: ioremap(%d) failed!\n", size); return; } } diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index f0b5f2d402af..9634557a5444 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -87,7 +87,7 @@ static int set_up_temporary_mappings(void) struct x86_mapping_info info = { .alloc_pgt_page = alloc_pgt_page, .pmd_flag = __PAGE_KERNEL_LARGE_EXEC, - .kernel_mapping = true, + .offset = __PAGE_OFFSET, }; unsigned long mstart, mend; pgd_t *pgd; @@ -113,7 +113,7 @@ static int set_up_temporary_mappings(void) return result; } - temp_level4_pgt = (unsigned long)pgd - __PAGE_OFFSET; + temp_level4_pgt = __pa(pgd); return 0; } diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 705e3fffb4a1..5db706f14111 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -1,9 +1,11 @@ #include <linux/io.h> +#include <linux/slab.h> #include <linux/memblock.h> #include <asm/cacheflush.h> #include <asm/pgtable.h> #include <asm/realmode.h> +#include <asm/tlbflush.h> struct real_mode_header *real_mode_header; u32 *trampoline_cr4_features; @@ -11,25 +13,37 @@ u32 *trampoline_cr4_features; /* Hold the pgd entry used on booting additional CPUs */ pgd_t trampoline_pgd_entry; +void __init set_real_mode_mem(phys_addr_t mem, size_t size) +{ + void *base = __va(mem); + + real_mode_header = (struct real_mode_header *) base; + printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", + base, (unsigned long long)mem, size); +} + void __init reserve_real_mode(void) { phys_addr_t mem; - unsigned char *base; - size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); + size_t size = real_mode_size_needed(); + + if (!size) + return; + + WARN_ON(slab_is_available()); /* Has to be under 1M so we can execute real-mode AP code. */ mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); - if (!mem) - panic("Cannot allocate trampoline\n"); + if (!mem) { + pr_info("No sub-1M memory is available for the trampoline\n"); + return; + } - base = __va(mem); memblock_reserve(mem, size); - real_mode_header = (struct real_mode_header *) base; - printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", - base, (unsigned long long)mem, size); + set_real_mode_mem(mem, size); } -void __init setup_real_mode(void) +static void __init setup_real_mode(void) { u16 real_mode_seg; const u32 *rel; @@ -84,7 +98,7 @@ void __init setup_real_mode(void) trampoline_header->start = (u64) secondary_startup_64; trampoline_cr4_features = &trampoline_header->cr4; - *trampoline_cr4_features = __read_cr4(); + *trampoline_cr4_features = mmu_cr4_features; trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); trampoline_pgd[0] = trampoline_pgd_entry.pgd; @@ -100,7 +114,7 @@ void __init setup_real_mode(void) * need to mark it executable at do_pre_smp_initcalls() at least, * thus run it as a early_initcall(). */ -static int __init set_real_mode_permissions(void) +static void __init set_real_mode_permissions(void) { unsigned char *base = (unsigned char *) real_mode_header; size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); @@ -119,7 +133,16 @@ static int __init set_real_mode_permissions(void) set_memory_nx((unsigned long) base, size >> PAGE_SHIFT); set_memory_ro((unsigned long) base, ro_size >> PAGE_SHIFT); set_memory_x((unsigned long) text_start, text_size >> PAGE_SHIFT); +} + +static int __init init_real_mode(void) +{ + if (!real_mode_header) + panic("Real mode trampoline was not allocated"); + + setup_real_mode(); + set_real_mode_permissions(); return 0; } -early_initcall(set_real_mode_permissions); +early_initcall(init_real_mode); |