diff options
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- | arch/x86/kernel/asm-offsets.c | 6 | ||||
-rw-r--r-- | arch/x86/kernel/asm-offsets_32.c | 9 | ||||
-rw-r--r-- | arch/x86/kernel/asm-offsets_64.c | 4 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/common.c | 170 | ||||
-rw-r--r-- | arch/x86/kernel/doublefault.c | 36 | ||||
-rw-r--r-- | arch/x86/kernel/dumpstack.c | 74 | ||||
-rw-r--r-- | arch/x86/kernel/dumpstack_32.c | 6 | ||||
-rw-r--r-- | arch/x86/kernel/dumpstack_64.c | 6 | ||||
-rw-r--r-- | arch/x86/kernel/ioport.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/irq.c | 12 | ||||
-rw-r--r-- | arch/x86/kernel/irq_64.c | 4 | ||||
-rw-r--r-- | arch/x86/kernel/paravirt_patch_64.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/process.c | 19 | ||||
-rw-r--r-- | arch/x86/kernel/process_32.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/process_64.c | 14 | ||||
-rw-r--r-- | arch/x86/kernel/traps.c | 69 | ||||
-rw-r--r-- | arch/x86/kernel/unwind_orc.c | 88 | ||||
-rw-r--r-- | arch/x86/kernel/vmlinux.lds.S | 9 |
18 files changed, 336 insertions, 196 deletions
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 8ea78275480d..cd360a5e0dca 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -93,4 +93,10 @@ void common(void) { BLANK(); DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); + + /* Layout info for cpu_entry_area */ + OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); + OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); + OFFSET(CPU_ENTRY_AREA_SYSENTER_stack, cpu_entry_area, SYSENTER_stack_page); + DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack)); } diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index dedf428b20b6..7d20d9c0b3d6 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -47,13 +47,8 @@ void foo(void) BLANK(); /* Offset from the sysenter stack to tss.sp0 */ - DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - - offsetofend(struct tss_struct, SYSENTER_stack)); - - /* Offset from cpu_tss to SYSENTER_stack */ - OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack); - /* Size of SYSENTER_stack */ - DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); + DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) - + offsetofend(struct cpu_entry_area, SYSENTER_stack_page.stack)); #ifdef CONFIG_CC_STACKPROTECTOR BLANK(); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 630212fa9b9d..bf51e51d808d 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -23,6 +23,9 @@ int main(void) #ifdef CONFIG_PARAVIRT OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); +#ifdef CONFIG_DEBUG_ENTRY + OFFSET(PV_IRQ_save_fl, pv_irq_ops, save_fl); +#endif BLANK(); #endif @@ -63,6 +66,7 @@ int main(void) OFFSET(TSS_ist, tss_struct, x86_tss.ist); OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); + OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); BLANK(); #ifdef CONFIG_CC_STACKPROTECTOR diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index fa998ca8aa5a..7416da3ec4df 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -476,8 +476,8 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c) return NULL; /* Not found */ } -__u32 cpu_caps_cleared[NCAPINTS]; -__u32 cpu_caps_set[NCAPINTS]; +__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; +__u32 cpu_caps_set[NCAPINTS + NBUGINTS]; void load_percpu_segment(int cpu) { @@ -490,27 +490,116 @@ void load_percpu_segment(int cpu) load_stack_canary_segment(); } -/* Setup the fixmap mapping only once per-processor */ -static inline void setup_fixmap_gdt(int cpu) +#ifdef CONFIG_X86_32 +/* The 32-bit entry code needs to find cpu_entry_area. */ +DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); +#endif + +#ifdef CONFIG_X86_64 +/* + * Special IST stacks which the CPU switches to when it calls + * an IST-marked descriptor entry. Up to 7 stacks (hardware + * limit), all of them are 4K, except the debug stack which + * is 8K. + */ +static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, + [DEBUG_STACK - 1] = DEBUG_STKSZ +}; + +static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks + [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); +#endif + +static DEFINE_PER_CPU_PAGE_ALIGNED(struct SYSENTER_stack_page, + SYSENTER_stack_storage); + +static void __init +set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot) +{ + for ( ; pages; pages--, idx--, ptr += PAGE_SIZE) + __set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot); +} + +/* Setup the fixmap mappings only once per-processor */ +static void __init setup_cpu_entry_area(int cpu) { #ifdef CONFIG_X86_64 - /* On 64-bit systems, we use a read-only fixmap GDT. */ - pgprot_t prot = PAGE_KERNEL_RO; + extern char _entry_trampoline[]; + + /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */ + pgprot_t gdt_prot = PAGE_KERNEL_RO; + pgprot_t tss_prot = PAGE_KERNEL_RO; #else /* * On native 32-bit systems, the GDT cannot be read-only because * our double fault handler uses a task gate, and entering through - * a task gate needs to change an available TSS to busy. If the GDT - * is read-only, that will triple fault. + * a task gate needs to change an available TSS to busy. If the + * GDT is read-only, that will triple fault. The TSS cannot be + * read-only because the CPU writes to it on task switches. * - * On Xen PV, the GDT must be read-only because the hypervisor requires - * it. + * On Xen PV, the GDT must be read-only because the hypervisor + * requires it. */ - pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ? + pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ? PAGE_KERNEL_RO : PAGE_KERNEL; + pgprot_t tss_prot = PAGE_KERNEL; #endif - __set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot); + __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); + set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, SYSENTER_stack_page), + per_cpu_ptr(&SYSENTER_stack_storage, cpu), 1, + PAGE_KERNEL); + + /* + * The Intel SDM says (Volume 3, 7.2.1): + * + * Avoid placing a page boundary in the part of the TSS that the + * processor reads during a task switch (the first 104 bytes). The + * processor may not correctly perform address translations if a + * boundary occurs in this area. During a task switch, the processor + * reads and writes into the first 104 bytes of each TSS (using + * contiguous physical addresses beginning with the physical address + * of the first byte of the TSS). So, after TSS access begins, if + * part of the 104 bytes is not physically contiguous, the processor + * will access incorrect information without generating a page-fault + * exception. + * + * There are also a lot of errata involving the TSS spanning a page + * boundary. Assert that we're not doing that. + */ + BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ + offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); + BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); + set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss), + &per_cpu(cpu_tss_rw, cpu), + sizeof(struct tss_struct) / PAGE_SIZE, + tss_prot); + +#ifdef CONFIG_X86_32 + per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); +#endif + +#ifdef CONFIG_X86_64 + BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); + BUILD_BUG_ON(sizeof(exception_stacks) != + sizeof(((struct cpu_entry_area *)0)->exception_stacks)); + set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks), + &per_cpu(exception_stacks, cpu), + sizeof(exception_stacks) / PAGE_SIZE, + PAGE_KERNEL); + + __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline), + __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); +#endif +} + +void __init setup_cpu_entry_areas(void) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) + setup_cpu_entry_area(cpu); } /* Load the original GDT from the per-cpu structure */ @@ -747,7 +836,7 @@ static void apply_forced_caps(struct cpuinfo_x86 *c) { int i; - for (i = 0; i < NCAPINTS; i++) { + for (i = 0; i < NCAPINTS + NBUGINTS; i++) { c->x86_capability[i] &= ~cpu_caps_cleared[i]; c->x86_capability[i] |= cpu_caps_set[i]; } @@ -1250,7 +1339,7 @@ void enable_sep_cpu(void) return; cpu = get_cpu(); - tss = &per_cpu(cpu_tss, cpu); + tss = &per_cpu(cpu_tss_rw, cpu); /* * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field -- @@ -1259,11 +1348,7 @@ void enable_sep_cpu(void) tss->x86_tss.ss1 = __KERNEL_CS; wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); - - wrmsr(MSR_IA32_SYSENTER_ESP, - (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack), - 0); - + wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1), 0); wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); put_cpu(); @@ -1357,25 +1442,19 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; EXPORT_PER_CPU_SYMBOL(__preempt_count); -/* - * Special IST stacks which the CPU switches to when it calls - * an IST-marked descriptor entry. Up to 7 stacks (hardware - * limit), all of them are 4K, except the debug stack which - * is 8K. - */ -static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, - [DEBUG_STACK - 1] = DEBUG_STKSZ -}; - -static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks - [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); - /* May not be marked __init: used by software suspend */ void syscall_init(void) { + extern char _entry_trampoline[]; + extern char entry_SYSCALL_64_trampoline[]; + + int cpu = smp_processor_id(); + unsigned long SYSCALL64_entry_trampoline = + (unsigned long)get_cpu_entry_area(cpu)->entry_trampoline + + (entry_SYSCALL_64_trampoline - _entry_trampoline); + wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); - wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); + wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline); #ifdef CONFIG_IA32_EMULATION wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); @@ -1386,7 +1465,7 @@ void syscall_init(void) * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). */ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1)); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); #else wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); @@ -1530,7 +1609,7 @@ void cpu_init(void) if (cpu) load_ucode_ap(); - t = &per_cpu(cpu_tss, cpu); + t = &per_cpu(cpu_tss_rw, cpu); oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA @@ -1569,7 +1648,7 @@ void cpu_init(void) * set up and load the per-CPU TSS */ if (!oist->ist[0]) { - char *estacks = per_cpu(exception_stacks, cpu); + char *estacks = get_cpu_entry_area(cpu)->exception_stacks; for (v = 0; v < N_EXCEPTION_STACKS; v++) { estacks += exception_stack_sizes[v]; @@ -1580,7 +1659,7 @@ void cpu_init(void) } } - t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); + t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; /* * <= is required because the CPU will access up to @@ -1596,11 +1675,12 @@ void cpu_init(void) enter_lazy_tlb(&init_mm, me); /* - * Initialize the TSS. Don't bother initializing sp0, as the initial - * task never enters user mode. + * Initialize the TSS. sp0 points to the entry trampoline stack + * regardless of what task is running. */ - set_tss_desc(cpu, t); + set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); load_TR_desc(); + load_sp0((unsigned long)(cpu_SYSENTER_stack(cpu) + 1)); load_mm_ldt(&init_mm); @@ -1612,7 +1692,6 @@ void cpu_init(void) if (is_uv_system()) uv_cpu_init(); - setup_fixmap_gdt(cpu); load_fixmap_gdt(cpu); } @@ -1622,7 +1701,7 @@ void cpu_init(void) { int cpu = smp_processor_id(); struct task_struct *curr = current; - struct tss_struct *t = &per_cpu(cpu_tss, cpu); + struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu); wait_for_master_cpu(cpu); @@ -1657,12 +1736,12 @@ void cpu_init(void) * Initialize the TSS. Don't bother initializing sp0, as the initial * task never enters user mode. */ - set_tss_desc(cpu, t); + set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); load_TR_desc(); load_mm_ldt(&init_mm); - t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); + t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; #ifdef CONFIG_DOUBLEFAULT /* Set up doublefault TSS pointer in the GDT */ @@ -1674,7 +1753,6 @@ void cpu_init(void) fpu__init_cpu(); - setup_fixmap_gdt(cpu); load_fixmap_gdt(cpu); } #endif diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c index 0e662c55ae90..0b8cedb20d6d 100644 --- a/arch/x86/kernel/doublefault.c +++ b/arch/x86/kernel/doublefault.c @@ -50,25 +50,23 @@ static void doublefault_fn(void) cpu_relax(); } -struct tss_struct doublefault_tss __cacheline_aligned = { - .x86_tss = { - .sp0 = STACK_START, - .ss0 = __KERNEL_DS, - .ldt = 0, - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, - - .ip = (unsigned long) doublefault_fn, - /* 0x2 bit is always set */ - .flags = X86_EFLAGS_SF | 0x2, - .sp = STACK_START, - .es = __USER_DS, - .cs = __KERNEL_CS, - .ss = __KERNEL_DS, - .ds = __USER_DS, - .fs = __KERNEL_PERCPU, - - .__cr3 = __pa_nodebug(swapper_pg_dir), - } +struct x86_hw_tss doublefault_tss __cacheline_aligned = { + .sp0 = STACK_START, + .ss0 = __KERNEL_DS, + .ldt = 0, + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, + + .ip = (unsigned long) doublefault_fn, + /* 0x2 bit is always set */ + .flags = X86_EFLAGS_SF | 0x2, + .sp = STACK_START, + .es = __USER_DS, + .cs = __KERNEL_CS, + .ss = __KERNEL_DS, + .ds = __USER_DS, + .fs = __KERNEL_PERCPU, + + .__cr3 = __pa_nodebug(swapper_pg_dir), }; /* dummy for do_double_fault() call */ diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index f13b4c00a5de..bbd6d986e2d0 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -43,6 +43,24 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, return true; } +bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) +{ + struct SYSENTER_stack *ss = cpu_SYSENTER_stack(smp_processor_id()); + + void *begin = ss; + void *end = ss + 1; + + if ((void *)stack < begin || (void *)stack >= end) + return false; + + info->type = STACK_TYPE_SYSENTER; + info->begin = begin; + info->end = end; + info->next_sp = NULL; + + return true; +} + static void printk_stack_address(unsigned long address, int reliable, char *log_lvl) { @@ -50,6 +68,28 @@ static void printk_stack_address(unsigned long address, int reliable, printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address); } +void show_iret_regs(struct pt_regs *regs) +{ + printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip); + printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss, + regs->sp, regs->flags); +} + +static void show_regs_safe(struct stack_info *info, struct pt_regs *regs) +{ + if (on_stack(info, regs, sizeof(*regs))) + __show_regs(regs, 0); + else if (on_stack(info, (void *)regs + IRET_FRAME_OFFSET, + IRET_FRAME_SIZE)) { + /* + * When an interrupt or exception occurs in entry code, the + * full pt_regs might not have been saved yet. In that case + * just print the iret frame. + */ + show_iret_regs(regs); + } +} + void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, char *log_lvl) { @@ -71,31 +111,35 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * - task stack * - interrupt stack * - HW exception stacks (double fault, nmi, debug, mce) + * - SYSENTER stack * - * x86-32 can have up to three stacks: + * x86-32 can have up to four stacks: * - task stack * - softirq stack * - hardirq stack + * - SYSENTER stack */ for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { const char *stack_name; - /* - * If we overflowed the task stack into a guard page, jump back - * to the bottom of the usable stack. - */ - if (task_stack_page(task) - (void *)stack < PAGE_SIZE) - stack = task_stack_page(task); - - if (get_stack_info(stack, task, &stack_info, &visit_mask)) - break; + if (get_stack_info(stack, task, &stack_info, &visit_mask)) { + /* + * We weren't on a valid stack. It's possible that + * we overflowed a valid stack into a guard page. + * See if the next page up is valid so that we can + * generate some kind of backtrace if this happens. + */ + stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack); + if (get_stack_info(stack, task, &stack_info, &visit_mask)) + break; + } stack_name = stack_type_name(stack_info.type); if (stack_name) printk("%s <%s>\n", log_lvl, stack_name); - if (regs && on_stack(&stack_info, regs, sizeof(*regs))) - __show_regs(regs, 0); + if (regs) + show_regs_safe(&stack_info, regs); /* * Scan the stack, printing any text addresses we find. At the @@ -119,7 +163,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, /* * Don't print regs->ip again if it was already printed - * by __show_regs() below. + * by show_regs_safe() below. */ if (regs && stack == ®s->ip) goto next; @@ -155,8 +199,8 @@ next: /* if the frame has entry regs, print them */ regs = unwind_get_entry_regs(&state); - if (regs && on_stack(&stack_info, regs, sizeof(*regs))) - __show_regs(regs, 0); + if (regs) + show_regs_safe(&stack_info, regs); } if (stack_name) diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index daefae83a3aa..5ff13a6b3680 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -26,6 +26,9 @@ const char *stack_type_name(enum stack_type type) if (type == STACK_TYPE_SOFTIRQ) return "SOFTIRQ"; + if (type == STACK_TYPE_SYSENTER) + return "SYSENTER"; + return NULL; } @@ -93,6 +96,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, if (task != current) goto unknown; + if (in_sysenter_stack(stack, info)) + goto recursion_check; + if (in_hardirq_stack(stack, info)) goto recursion_check; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 88ce2ffdb110..abc828f8c297 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -37,6 +37,9 @@ const char *stack_type_name(enum stack_type type) if (type == STACK_TYPE_IRQ) return "IRQ"; + if (type == STACK_TYPE_SYSENTER) + return "SYSENTER"; + if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST) return exception_stack_names[type - STACK_TYPE_EXCEPTION]; @@ -115,6 +118,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, if (in_irq_stack(stack, info)) goto recursion_check; + if (in_sysenter_stack(stack, info)) + goto recursion_check; + goto unknown; recursion_check: diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 3feb648781c4..2f723301eb58 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -67,7 +67,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) * because the ->io_bitmap_max value must match the bitmap * contents: */ - tss = &per_cpu(cpu_tss, get_cpu()); + tss = &per_cpu(cpu_tss_rw, get_cpu()); if (turn_on) bitmap_clear(t->io_bitmap_ptr, from, num); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 49cfd9fe7589..68e1867cca80 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -219,18 +219,6 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) /* high bit used in ret_from_ code */ unsigned vector = ~regs->orig_ax; - /* - * NB: Unlike exception entries, IRQ entries do not reliably - * handle context tracking in the low-level entry code. This is - * because syscall entries execute briefly with IRQs on before - * updating context tracking state, so we can take an IRQ from - * kernel mode with CONTEXT_USER. The low-level entry code only - * updates the context if we came from user mode, so we won't - * switch to CONTEXT_KERNEL. We'll fix that once the syscall - * code is cleaned up enough that we can cleanly defer enabling - * IRQs. - */ - entering_irq(); /* entering_irq() tells RCU that we're not quiescent. Check it. */ diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 020efbf5786b..d86e344f5b3d 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -57,10 +57,10 @@ static inline void stack_overflow_check(struct pt_regs *regs) if (regs->sp >= estack_top && regs->sp <= estack_bottom) return; - WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n", + WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n", current->comm, curbase, regs->sp, irq_stack_top, irq_stack_bottom, - estack_top, estack_bottom); + estack_top, estack_bottom, (void *)regs->ip); if (sysctl_panic_on_stackoverflow) panic("low stack detected by irq handler - check messages\n"); diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index ac0be8283325..9edadabf04f6 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -10,7 +10,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax"); DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); -DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)"); DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq"); @@ -60,7 +59,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_mmu_ops, read_cr2); PATCH_SITE(pv_mmu_ops, read_cr3); PATCH_SITE(pv_mmu_ops, write_cr3); - PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); #if defined(CONFIG_PARAVIRT_SPINLOCKS) case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index bb988a24db92..aed9d94bd46f 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -47,7 +47,7 @@ * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { +__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss_rw) = { .x86_tss = { /* * .sp0 is only used when entering ring 0 from a lower @@ -56,6 +56,16 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { * Poison it. */ .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, + +#ifdef CONFIG_X86_64 + /* + * .sp1 is cpu_current_top_of_stack. The init task never + * runs user code, but cpu_current_top_of_stack should still + * be well defined before the first context switch. + */ + .sp1 = TOP_OF_INIT_STACK, +#endif + #ifdef CONFIG_X86_32 .ss0 = __KERNEL_DS, .ss1 = __KERNEL_CS, @@ -71,11 +81,8 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { */ .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, #endif -#ifdef CONFIG_X86_32 - .SYSENTER_stack_canary = STACK_END_MAGIC, -#endif }; -EXPORT_PER_CPU_SYMBOL(cpu_tss); +EXPORT_PER_CPU_SYMBOL(cpu_tss_rw); DEFINE_PER_CPU(bool, __tss_limit_invalid); EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid); @@ -104,7 +111,7 @@ void exit_thread(struct task_struct *tsk) struct fpu *fpu = &t->fpu; if (bp) { - struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu()); + struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu()); t->io_bitmap_ptr = NULL; clear_thread_flag(TIF_IO_BITMAP); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 45bf0c5f93e1..5224c6099184 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -234,7 +234,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct fpu *prev_fpu = &prev->fpu; struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); - struct tss_struct *tss = &per_cpu(cpu_tss, cpu); + struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu); /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index eeeb34f85c25..c75466232016 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -69,9 +69,8 @@ void __show_regs(struct pt_regs *regs, int all) unsigned int fsindex, gsindex; unsigned int ds, cs, es; - printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs, (void *)regs->ip); - printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss, - regs->sp, regs->flags); + show_iret_regs(regs); + if (regs->orig_ax != -1) pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax); else @@ -88,6 +87,9 @@ void __show_regs(struct pt_regs *regs, int all) printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n", regs->r13, regs->r14, regs->r15); + if (!all) + return; + asm("movl %%ds,%0" : "=r" (ds)); asm("movl %%cs,%0" : "=r" (cs)); asm("movl %%es,%0" : "=r" (es)); @@ -98,9 +100,6 @@ void __show_regs(struct pt_regs *regs, int all) rdmsrl(MSR_GS_BASE, gs); rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); - if (!all) - return; - cr0 = read_cr0(); cr2 = read_cr2(); cr3 = __read_cr3(); @@ -400,7 +399,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct fpu *prev_fpu = &prev->fpu; struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); - struct tss_struct *tss = &per_cpu(cpu_tss, cpu); + struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu); WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && this_cpu_read(irq_count) != -1); @@ -462,6 +461,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * Switch the PDA and FPU contexts. */ this_cpu_write(current_task, next_p); + this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); /* Reload sp0. */ update_sp0(next_p); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 989514c94a55..e98f8b66a460 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -348,9 +348,15 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) /* * If IRET takes a non-IST fault on the espfix64 stack, then we - * end up promoting it to a doublefault. In that case, modify - * the stack to make it look like we just entered the #GP - * handler from user space, similar to bad_iret. + * end up promoting it to a doublefault. In that case, take + * advantage of the fact that we're not using the normal (TSS.sp0) + * stack right now. We can write a fake #GP(0) frame at TSS.sp0 + * and then modify our own IRET frame so that, when we return, + * we land directly at the #GP(0) vector with the stack already + * set up according to its expectations. + * + * The net result is that our #GP handler will think that we + * entered from usermode with the bad user context. * * No need for ist_enter here because we don't use RCU. */ @@ -358,13 +364,26 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) regs->cs == __KERNEL_CS && regs->ip == (unsigned long)native_irq_return_iret) { - struct pt_regs *normal_regs = task_pt_regs(current); + struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; - /* Fake a #GP(0) from userspace. */ - memmove(&normal_regs->ip, (void *)regs->sp, 5*8); - normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */ + /* + * regs->sp points to the failing IRET frame on the + * ESPFIX64 stack. Copy it to the entry stack. This fills + * in gpregs->ss through gpregs->ip. + * + */ + memmove(&gpregs->ip, (void *)regs->sp, 5*8); + gpregs->orig_ax = 0; /* Missing (lost) #GP error code */ + + /* + * Adjust our frame so that we return straight to the #GP + * vector with the expected RSP value. This is safe because + * we won't enable interupts or schedule before we invoke + * general_protection, so nothing will clobber the stack + * frame we just set up. + */ regs->ip = (unsigned long)general_protection; - regs->sp = (unsigned long)&normal_regs->orig_ax; + regs->sp = (unsigned long)&gpregs->orig_ax; return; } @@ -389,7 +408,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) * * Processors update CR2 whenever a page fault is detected. If a * second page fault occurs while an earlier page fault is being - * deliv- ered, the faulting linear address of the second fault will + * delivered, the faulting linear address of the second fault will * overwrite the contents of CR2 (replacing the previous * address). These updates to CR2 occur even if the page fault * results in a double fault or occurs during the delivery of a @@ -605,14 +624,15 @@ NOKPROBE_SYMBOL(do_int3); #ifdef CONFIG_X86_64 /* - * Help handler running on IST stack to switch off the IST stack if the - * interrupted code was in user mode. The actual stack switch is done in - * entry_64.S + * Help handler running on a per-cpu (IST or entry trampoline) stack + * to switch to the normal thread stack if the interrupted code was in + * user mode. The actual stack switch is done in entry_64.S */ asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs) { - struct pt_regs *regs = task_pt_regs(current); - *regs = *eregs; + struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1; + if (regs != eregs) + *regs = *eregs; return regs; } NOKPROBE_SYMBOL(sync_regs); @@ -628,13 +648,13 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) /* * This is called from entry_64.S early in handling a fault * caused by a bad iret to user mode. To handle the fault - * correctly, we want move our stack frame to task_pt_regs - * and we want to pretend that the exception came from the - * iret target. + * correctly, we want to move our stack frame to where it would + * be had we entered directly on the entry stack (rather than + * just below the IRET frame) and we want to pretend that the + * exception came from the IRET target. */ struct bad_iret_stack *new_stack = - container_of(task_pt_regs(current), - struct bad_iret_stack, regs); + (struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; /* Copy the IRET target to the new stack. */ memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8); @@ -795,14 +815,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) debug_stack_usage_dec(); exit: -#if defined(CONFIG_X86_32) - /* - * This is the most likely code path that involves non-trivial use - * of the SYSENTER stack. Check that we haven't overrun it. - */ - WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC, - "Overran or corrupted SYSENTER stack\n"); -#endif ist_exit(regs); } NOKPROBE_SYMBOL(do_debug); @@ -929,6 +941,9 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) void __init trap_init(void) { + /* Init cpu_entry_area before IST entries are set up */ + setup_cpu_entry_areas(); + idt_setup_traps(); /* diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index a3f973b2c97a..be86a865087a 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -253,22 +253,15 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) return NULL; } -static bool stack_access_ok(struct unwind_state *state, unsigned long addr, +static bool stack_access_ok(struct unwind_state *state, unsigned long _addr, size_t len) { struct stack_info *info = &state->stack_info; + void *addr = (void *)_addr; - /* - * If the address isn't on the current stack, switch to the next one. - * - * We may have to traverse multiple stacks to deal with the possibility - * that info->next_sp could point to an empty stack and the address - * could be on a subsequent stack. - */ - while (!on_stack(info, (void *)addr, len)) - if (get_stack_info(info->next_sp, state->task, info, - &state->stack_mask)) - return false; + if (!on_stack(info, addr, len) && + (get_stack_info(addr, state->task, info, &state->stack_mask))) + return false; return true; } @@ -283,42 +276,32 @@ static bool deref_stack_reg(struct unwind_state *state, unsigned long addr, return true; } -#define REGS_SIZE (sizeof(struct pt_regs)) -#define SP_OFFSET (offsetof(struct pt_regs, sp)) -#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip)) -#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip)) - static bool deref_stack_regs(struct unwind_state *state, unsigned long addr, - unsigned long *ip, unsigned long *sp, bool full) + unsigned long *ip, unsigned long *sp) { - size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE; - size_t sp_offset = full ? SP_OFFSET : IRET_SP_OFFSET; - struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE); - - if (IS_ENABLED(CONFIG_X86_64)) { - if (!stack_access_ok(state, addr, regs_size)) - return false; + struct pt_regs *regs = (struct pt_regs *)addr; - *ip = regs->ip; - *sp = regs->sp; + /* x86-32 support will be more complicated due to the ®s->sp hack */ + BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_32)); - return true; - } - - if (!stack_access_ok(state, addr, sp_offset)) + if (!stack_access_ok(state, addr, sizeof(struct pt_regs))) return false; *ip = regs->ip; + *sp = regs->sp; + return true; +} - if (user_mode(regs)) { - if (!stack_access_ok(state, addr + sp_offset, - REGS_SIZE - SP_OFFSET)) - return false; +static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr, + unsigned long *ip, unsigned long *sp) +{ + struct pt_regs *regs = (void *)addr - IRET_FRAME_OFFSET; - *sp = regs->sp; - } else - *sp = (unsigned long)®s->sp; + if (!stack_access_ok(state, addr, IRET_FRAME_SIZE)) + return false; + *ip = regs->ip; + *sp = regs->sp; return true; } @@ -327,7 +310,6 @@ bool unwind_next_frame(struct unwind_state *state) unsigned long ip_p, sp, orig_ip, prev_sp = state->sp; enum stack_type prev_type = state->stack_info.type; struct orc_entry *orc; - struct pt_regs *ptregs; bool indirect = false; if (unwind_done(state)) @@ -435,7 +417,7 @@ bool unwind_next_frame(struct unwind_state *state) break; case ORC_TYPE_REGS: - if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) { + if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) { orc_warn("can't dereference registers at %p for ip %pB\n", (void *)sp, (void *)orig_ip); goto done; @@ -447,20 +429,14 @@ bool unwind_next_frame(struct unwind_state *state) break; case ORC_TYPE_REGS_IRET: - if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) { + if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) { orc_warn("can't dereference iret registers at %p for ip %pB\n", (void *)sp, (void *)orig_ip); goto done; } - ptregs = container_of((void *)sp, struct pt_regs, ip); - if ((unsigned long)ptregs >= prev_sp && - on_stack(&state->stack_info, ptregs, REGS_SIZE)) { - state->regs = ptregs; - state->full_regs = false; - } else - state->regs = NULL; - + state->regs = (void *)sp - IRET_FRAME_OFFSET; + state->full_regs = false; state->signal = true; break; @@ -553,8 +529,18 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, } if (get_stack_info((unsigned long *)state->sp, state->task, - &state->stack_info, &state->stack_mask)) - return; + &state->stack_info, &state->stack_mask)) { + /* + * We weren't on a valid stack. It's possible that + * we overflowed a valid stack into a guard page. + * See if the next page up is valid so that we can + * generate some kind of backtrace if this happens. + */ + void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp); + if (get_stack_info(next_page, state->task, &state->stack_info, + &state->stack_mask)) + return; + } /* * The caller can provide the address of the first frame directly diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index a4009fb9be87..d2a8b5a24a44 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -107,6 +107,15 @@ SECTIONS SOFTIRQENTRY_TEXT *(.fixup) *(.gnu.warning) + +#ifdef CONFIG_X86_64 + . = ALIGN(PAGE_SIZE); + _entry_trampoline = .; + *(.entry_trampoline) + . = ALIGN(PAGE_SIZE); + ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big"); +#endif + /* End of text section */ _etext = .; } :text = 0x9090 |