diff options
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- | arch/x86/kernel/Makefile | 3 | ||||
-rw-r--r-- | arch/x86/kernel/apic/x2apic_uv_x.c | 7 | ||||
-rw-r--r-- | arch/x86/kernel/apm_32.c | 16 | ||||
-rw-r--r-- | arch/x86/kernel/asm-offsets_32.c | 8 | ||||
-rw-r--r-- | arch/x86/kernel/asm-offsets_64.c | 19 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/common.c | 24 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce.c | 18 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce_amd.c | 18 | ||||
-rw-r--r-- | arch/x86/kernel/e820.c | 4 | ||||
-rw-r--r-- | arch/x86/kernel/entry_32.S | 47 | ||||
-rw-r--r-- | arch/x86/kernel/entry_64.S | 232 | ||||
-rw-r--r-- | arch/x86/kernel/head_64.S | 4 | ||||
-rw-r--r-- | arch/x86/kernel/nmi.c | 102 | ||||
-rw-r--r-- | arch/x86/kernel/ptrace.c | 25 | ||||
-rw-r--r-- | arch/x86/kernel/syscall_32.c | 25 | ||||
-rw-r--r-- | arch/x86/kernel/syscall_64.c | 20 | ||||
-rw-r--r-- | arch/x86/kernel/syscall_table_32.S | 350 | ||||
-rw-r--r-- | arch/x86/kernel/traps.c | 20 | ||||
-rw-r--r-- | arch/x86/kernel/tsc.c | 14 | ||||
-rw-r--r-- | arch/x86/kernel/vm86_32.c | 6 |
20 files changed, 478 insertions, 484 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 02b2f05b371e..5369059c07a9 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -25,7 +25,8 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-y += probe_roms.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o -obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o +obj-y += syscall_$(BITS).o +obj-$(CONFIG_X86_64) += vsyscall_64.o obj-$(CONFIG_X86_64) += vsyscall_emu_64.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o topology.o kdebugfs.o diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 9d59bbacd4e3..79b05b88aa19 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -769,7 +769,12 @@ void __init uv_system_init(void) for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) uv_possible_blades += hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8)); - printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades()); + + /* uv_num_possible_blades() is really the hub count */ + printk(KERN_INFO "UV: Found %d blades, %d hubs\n", + is_uv1_hub() ? uv_num_possible_blades() : + (uv_num_possible_blades() + 1) / 2, + uv_num_possible_blades()); bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); uv_blade_info = kzalloc(bytes, GFP_KERNEL); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index a46bd383953c..f76623cbe263 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -383,21 +383,21 @@ static int ignore_sys_suspend; static int ignore_normal_resume; static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL; -static int debug __read_mostly; -static int smp __read_mostly; +static bool debug __read_mostly; +static bool smp __read_mostly; static int apm_disabled = -1; #ifdef CONFIG_SMP -static int power_off; +static bool power_off; #else -static int power_off = 1; +static bool power_off = 1; #endif -static int realmode_power_off; +static bool realmode_power_off; #ifdef CONFIG_APM_ALLOW_INTS -static int allow_ints = 1; +static bool allow_ints = 1; #else -static int allow_ints; +static bool allow_ints; #endif -static int broken_psr; +static bool broken_psr; static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 395a10e68067..85d98ab15cdc 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -3,6 +3,11 @@ #include <linux/lguest.h> #include "../../../drivers/lguest/lg.h" +#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, +static char syscalls[] = { +#include <asm/syscalls_32.h> +}; + /* workaround for a warning with -Wmissing-prototypes */ void foo(void); @@ -76,4 +81,7 @@ void foo(void) OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); #endif + BLANK(); + DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); + DEFINE(NR_syscalls, sizeof(syscalls)); } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index e72a1194af22..834e897b1e25 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -1,11 +1,12 @@ #include <asm/ia32.h> -#define __NO_STUBS 1 -#undef __SYSCALL -#undef _ASM_X86_UNISTD_64_H -#define __SYSCALL(nr, sym) [nr] = 1, -static char syscalls[] = { -#include <asm/unistd.h> +#define __SYSCALL_64(nr, sym, compat) [nr] = 1, +static char syscalls_64[] = { +#include <asm/syscalls_64.h> +}; +#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, +static char syscalls_ia32[] = { +#include <asm/syscalls_32.h> }; int main(void) @@ -72,7 +73,11 @@ int main(void) OFFSET(TSS_ist, tss_struct, x86_tss.ist); BLANK(); - DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); + DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); + DEFINE(NR_syscalls, sizeof(syscalls_64)); + + DEFINE(__NR_ia32_syscall_max, sizeof(syscalls_ia32) - 1); + DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32)); return 0; } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 850f2963a420..d43cad74f166 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1021,6 +1021,8 @@ __setup("clearcpuid=", setup_disablecpuid); #ifdef CONFIG_X86_64 struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; +struct desc_ptr nmi_idt_descr = { NR_VECTORS * 16 - 1, + (unsigned long) nmi_idt_table }; DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE); @@ -1085,6 +1087,26 @@ unsigned long kernel_eflags; */ DEFINE_PER_CPU(struct orig_ist, orig_ist); +static DEFINE_PER_CPU(unsigned long, debug_stack_addr); +DEFINE_PER_CPU(int, debug_stack_usage); + +int is_debug_stack(unsigned long addr) +{ + return __get_cpu_var(debug_stack_usage) || + (addr <= __get_cpu_var(debug_stack_addr) && + addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ)); +} + +void debug_stack_set_zero(void) +{ + load_idt((const struct desc_ptr *)&nmi_idt_descr); +} + +void debug_stack_reset(void) +{ + load_idt((const struct desc_ptr *)&idt_descr); +} + #else /* CONFIG_X86_64 */ DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; @@ -1212,6 +1234,8 @@ void __cpuinit cpu_init(void) estacks += exception_stack_sizes[v]; oist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; + if (v == DEBUG_STACK-1) + per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks; } } diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index f22a9f7f6390..5a11ae2e9e91 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1859,7 +1859,7 @@ static struct bus_type mce_subsys = { .dev_name = "machinecheck", }; -DEFINE_PER_CPU(struct device, mce_device); +struct device *mce_device[CONFIG_NR_CPUS]; __cpuinitdata void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); @@ -2001,19 +2001,27 @@ static struct device_attribute *mce_device_attrs[] = { static cpumask_var_t mce_device_initialized; +static void mce_device_release(struct device *dev) +{ + kfree(dev); +} + /* Per cpu device init. All of the cpus still share the same ctrl bank: */ static __cpuinit int mce_device_create(unsigned int cpu) { - struct device *dev = &per_cpu(mce_device, cpu); + struct device *dev; int err; int i, j; if (!mce_available(&boot_cpu_data)) return -EIO; - memset(&dev->kobj, 0, sizeof(struct kobject)); + dev = kzalloc(sizeof *dev, GFP_KERNEL); + if (!dev) + return -ENOMEM; dev->id = cpu; dev->bus = &mce_subsys; + dev->release = &mce_device_release; err = device_register(dev); if (err) @@ -2030,6 +2038,7 @@ static __cpuinit int mce_device_create(unsigned int cpu) goto error2; } cpumask_set_cpu(cpu, mce_device_initialized); + mce_device[cpu] = dev; return 0; error2: @@ -2046,7 +2055,7 @@ error: static __cpuinit void mce_device_remove(unsigned int cpu) { - struct device *dev = &per_cpu(mce_device, cpu); + struct device *dev = mce_device[cpu]; int i; if (!cpumask_test_cpu(cpu, mce_device_initialized)) @@ -2060,6 +2069,7 @@ static __cpuinit void mce_device_remove(unsigned int cpu) device_unregister(dev); cpumask_clear_cpu(cpu, mce_device_initialized); + mce_device[cpu] = NULL; } /* Make sure there are no machine checks on offlined CPUs. */ diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index ba0b94a7e204..786e76a86322 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -523,6 +523,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) { int i, err = 0; struct threshold_bank *b = NULL; + struct device *dev = mce_device[cpu]; char name[32]; sprintf(name, "threshold_bank%i", bank); @@ -543,8 +544,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (!b) goto out; - err = sysfs_create_link(&per_cpu(mce_device, cpu).kobj, - b->kobj, name); + err = sysfs_create_link(&dev->kobj, b->kobj, name); if (err) goto out; @@ -565,7 +565,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; } - b->kobj = kobject_create_and_add(name, &per_cpu(mce_device, cpu).kobj); + b->kobj = kobject_create_and_add(name, &dev->kobj); if (!b->kobj) goto out_free; @@ -585,8 +585,9 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (i == cpu) continue; - err = sysfs_create_link(&per_cpu(mce_device, i).kobj, - b->kobj, name); + dev = mce_device[i]; + if (dev) + err = sysfs_create_link(&dev->kobj,b->kobj, name); if (err) goto out; @@ -649,6 +650,7 @@ static void deallocate_threshold_block(unsigned int cpu, static void threshold_remove_bank(unsigned int cpu, int bank) { struct threshold_bank *b; + struct device *dev; char name[32]; int i = 0; @@ -663,7 +665,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) #ifdef CONFIG_SMP /* sibling symlink */ if (shared_bank[bank] && b->blocks->cpu != cpu) { - sysfs_remove_link(&per_cpu(mce_device, cpu).kobj, name); + sysfs_remove_link(&mce_device[cpu]->kobj, name); per_cpu(threshold_banks, cpu)[bank] = NULL; return; @@ -675,7 +677,9 @@ static void threshold_remove_bank(unsigned int cpu, int bank) if (i == cpu) continue; - sysfs_remove_link(&per_cpu(mce_device, i).kobj, name); + dev = mce_device[i]; + if (dev) + sysfs_remove_link(&dev->kobj, name); per_cpu(threshold_banks, i)[bank] = NULL; } diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 174d938d576b..62d61e9976eb 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -703,7 +703,7 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn) } #endif -#ifdef CONFIG_HIBERNATION +#ifdef CONFIG_ACPI /** * Mark ACPI NVS memory region, so that we can save/restore it during * hibernation and the subsequent resume. @@ -716,7 +716,7 @@ static int __init e820_mark_nvs_memory(void) struct e820entry *ei = &e820.map[i]; if (ei->type == E820_NVS) - suspend_nvs_register(ei->addr, ei->size); + acpi_nvs_register(ei->addr, ei->size); } return 0; diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 22d0e21b4dd7..79d97e68f042 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -42,6 +42,7 @@ */ #include <linux/linkage.h> +#include <linux/err.h> #include <asm/thread_info.h> #include <asm/irqflags.h> #include <asm/errno.h> @@ -81,8 +82,6 @@ * enough to patch inline, increasing performance. */ -#define nr_syscalls ((syscall_table_size)/4) - #ifdef CONFIG_PREEMPT #define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else @@ -423,7 +422,7 @@ sysenter_past_esp: testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) jnz sysenter_audit sysenter_do_call: - cmpl $(nr_syscalls), %eax + cmpl $(NR_syscalls), %eax jae syscall_badsys call *sys_call_table(,%eax,4) movl %eax,PT_EAX(%esp) @@ -455,7 +454,7 @@ sysenter_audit: movl %ebx,%ecx /* 3rd arg: 1st syscall arg */ movl %eax,%edx /* 2nd arg: syscall number */ movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ - call audit_syscall_entry + call __audit_syscall_entry pushl_cfi %ebx movl PT_EAX(%esp),%eax /* reload syscall number */ jmp sysenter_do_call @@ -466,11 +465,10 @@ sysexit_audit: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_ANY) movl %eax,%edx /* second arg, syscall return value */ - cmpl $0,%eax /* is it < 0? */ - setl %al /* 1 if so, 0 if not */ + cmpl $-MAX_ERRNO,%eax /* is it an error ? */ + setbe %al /* 1 if so, 0 if not */ movzbl %al,%eax /* zero-extend that */ - inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ - call audit_syscall_exit + call __audit_syscall_exit DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx @@ -504,7 +502,7 @@ ENTRY(system_call) # system call tracing in operation / emulation testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) jnz syscall_trace_entry - cmpl $(nr_syscalls), %eax + cmpl $(NR_syscalls), %eax jae syscall_badsys syscall_call: call *sys_call_table(,%eax,4) @@ -654,7 +652,7 @@ syscall_trace_entry: movl %esp, %eax call syscall_trace_enter /* What it returned is what we'll actually use. */ - cmpl $(nr_syscalls), %eax + cmpl $(NR_syscalls), %eax jnae syscall_call jmp syscall_exit END(syscall_trace_entry) @@ -694,29 +692,28 @@ END(syscall_badsys) * System calls that need a pt_regs pointer. */ #define PTREGSCALL0(name) \ - ALIGN; \ -ptregs_##name: \ +ENTRY(ptregs_##name) ; \ leal 4(%esp),%eax; \ - jmp sys_##name; + jmp sys_##name; \ +ENDPROC(ptregs_##name) #define PTREGSCALL1(name) \ - ALIGN; \ -ptregs_##name: \ +ENTRY(ptregs_##name) ; \ leal 4(%esp),%edx; \ movl (PT_EBX+4)(%esp),%eax; \ - jmp sys_##name; + jmp sys_##name; \ +ENDPROC(ptregs_##name) #define PTREGSCALL2(name) \ - ALIGN; \ -ptregs_##name: \ +ENTRY(ptregs_##name) ; \ leal 4(%esp),%ecx; \ movl (PT_ECX+4)(%esp),%edx; \ movl (PT_EBX+4)(%esp),%eax; \ - jmp sys_##name; + jmp sys_##name; \ +ENDPROC(ptregs_##name) #define PTREGSCALL3(name) \ - ALIGN; \ -ptregs_##name: \ +ENTRY(ptregs_##name) ; \ CFI_STARTPROC; \ leal 4(%esp),%eax; \ pushl_cfi %eax; \ @@ -741,8 +738,7 @@ PTREGSCALL2(vm86) PTREGSCALL1(vm86old) /* Clone is an oddball. The 4th arg is in %edi */ - ALIGN; -ptregs_clone: +ENTRY(ptregs_clone) CFI_STARTPROC leal 4(%esp),%eax pushl_cfi %eax @@ -1213,11 +1209,6 @@ return_to_handler: jmp *%ecx #endif -.section .rodata,"a" -#include "syscall_table_32.S" - -syscall_table_size=(.-sys_call_table) - /* * Some functions should be protected against kprobes */ diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index a20e1cb9dc87..3fe8239fd8fb 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -55,6 +55,7 @@ #include <asm/paravirt.h> #include <asm/ftrace.h> #include <asm/percpu.h> +#include <linux/err.h> /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ #include <linux/elf-em.h> @@ -548,7 +549,7 @@ badsys: #ifdef CONFIG_AUDITSYSCALL /* * Fast path for syscall audit without full syscall trace. - * We just call audit_syscall_entry() directly, and then + * We just call __audit_syscall_entry() directly, and then * jump back to the normal fast path. */ auditsys: @@ -558,22 +559,21 @@ auditsys: movq %rdi,%rdx /* 3rd arg: 1st syscall arg */ movq %rax,%rsi /* 2nd arg: syscall number */ movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */ - call audit_syscall_entry + call __audit_syscall_entry LOAD_ARGS 0 /* reload call-clobbered registers */ jmp system_call_fastpath /* - * Return fast path for syscall audit. Call audit_syscall_exit() + * Return fast path for syscall audit. Call __audit_syscall_exit() * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT * masked off. */ sysret_audit: movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */ - cmpq $0,%rsi /* is it < 0? */ - setl %al /* 1 if so, 0 if not */ + cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? */ + setbe %al /* 1 if so, 0 if not */ movzbl %al,%edi /* zero-extend that into %edi */ - inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ - call audit_syscall_exit + call __audit_syscall_exit movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi jmp sysret_check #endif /* CONFIG_AUDITSYSCALL */ @@ -1480,62 +1480,214 @@ ENTRY(error_exit) CFI_ENDPROC END(error_exit) +/* + * Test if a given stack is an NMI stack or not. + */ + .macro test_in_nmi reg stack nmi_ret normal_ret + cmpq %\reg, \stack + ja \normal_ret + subq $EXCEPTION_STKSZ, %\reg + cmpq %\reg, \stack + jb \normal_ret + jmp \nmi_ret + .endm /* runs on exception stack */ ENTRY(nmi) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq_cfi $-1 + /* + * We allow breakpoints in NMIs. If a breakpoint occurs, then + * the iretq it performs will take us out of NMI context. + * This means that we can have nested NMIs where the next + * NMI is using the top of the stack of the previous NMI. We + * can't let it execute because the nested NMI will corrupt the + * stack of the previous NMI. NMI handlers are not re-entrant + * anyway. + * + * To handle this case we do the following: + * Check the a special location on the stack that contains + * a variable that is set when NMIs are executing. + * The interrupted task's stack is also checked to see if it + * is an NMI stack. + * If the variable is not set and the stack is not the NMI + * stack then: + * o Set the special variable on the stack + * o Copy the interrupt frame into a "saved" location on the stack + * o Copy the interrupt frame into a "copy" location on the stack + * o Continue processing the NMI + * If the variable is set or the previous stack is the NMI stack: + * o Modify the "copy" location to jump to the repeate_nmi + * o return back to the first NMI + * + * Now on exit of the first NMI, we first clear the stack variable + * The NMI stack will tell any nested NMIs at that point that it is + * nested. Then we pop the stack normally with iret, and if there was + * a nested NMI that updated the copy interrupt stack frame, a + * jump will be made to the repeat_nmi code that will handle the second + * NMI. + */ + + /* Use %rdx as out temp variable throughout */ + pushq_cfi %rdx + + /* + * Check the special variable on the stack to see if NMIs are + * executing. + */ + cmp $1, -8(%rsp) + je nested_nmi + + /* + * Now test if the previous stack was an NMI stack. + * We need the double check. We check the NMI stack to satisfy the + * race when the first NMI clears the variable before returning. + * We check the variable because the first NMI could be in a + * breakpoint routine using a breakpoint stack. + */ + lea 6*8(%rsp), %rdx + test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi + +nested_nmi: + /* + * Do nothing if we interrupted the fixup in repeat_nmi. + * It's about to repeat the NMI handler, so we are fine + * with ignoring this one. + */ + movq $repeat_nmi, %rdx + cmpq 8(%rsp), %rdx + ja 1f + movq $end_repeat_nmi, %rdx + cmpq 8(%rsp), %rdx + ja nested_nmi_out + +1: + /* Set up the interrupted NMIs stack to jump to repeat_nmi */ + leaq -6*8(%rsp), %rdx + movq %rdx, %rsp + CFI_ADJUST_CFA_OFFSET 6*8 + pushq_cfi $__KERNEL_DS + pushq_cfi %rdx + pushfq_cfi + pushq_cfi $__KERNEL_CS + pushq_cfi $repeat_nmi + + /* Put stack back */ + addq $(11*8), %rsp + CFI_ADJUST_CFA_OFFSET -11*8 + +nested_nmi_out: + popq_cfi %rdx + + /* No need to check faults here */ + INTERRUPT_RETURN + +first_nmi: + /* + * Because nested NMIs will use the pushed location that we + * stored in rdx, we must keep that space available. + * Here's what our stack frame will look like: + * +-------------------------+ + * | original SS | + * | original Return RSP | + * | original RFLAGS | + * | original CS | + * | original RIP | + * +-------------------------+ + * | temp storage for rdx | + * +-------------------------+ + * | NMI executing variable | + * +-------------------------+ + * | Saved SS | + * | Saved Return RSP | + * | Saved RFLAGS | + * | Saved CS | + * | Saved RIP | + * +-------------------------+ + * | copied SS | + * | copied Return RSP | + * | copied RFLAGS | + * | copied CS | + * | copied RIP | + * +-------------------------+ + * | pt_regs | + * +-------------------------+ + * + * The saved RIP is used to fix up the copied RIP that a nested + * NMI may zero out. The original stack frame and the temp storage + * is also used by nested NMIs and can not be trusted on exit. + */ + /* Set the NMI executing variable on the stack. */ + pushq_cfi $1 + + /* Copy the stack frame to the Saved frame */ + .rept 5 + pushq_cfi 6*8(%rsp) + .endr + + /* Make another copy, this one may be modified by nested NMIs */ + .rept 5 + pushq_cfi 4*8(%rsp) + .endr + + /* Do not pop rdx, nested NMIs will corrupt it */ + movq 11*8(%rsp), %rdx + + /* + * Everything below this point can be preempted by a nested + * NMI if the first NMI took an exception. Repeated NMIs + * caused by an exception and nested NMI will start here, and + * can still be preempted by another NMI. + */ +restart_nmi: + pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 + /* + * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit + * as we should not be calling schedule in NMI context. + * Even with normal interrupts enabled. An NMI should not be + * setting NEED_RESCHED or anything that normal interrupts and + * exceptions might do. + */ call save_paranoid DEFAULT_FRAME 0 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ movq %rsp,%rdi movq $-1,%rsi call do_nmi -#ifdef CONFIG_TRACE_IRQFLAGS - /* paranoidexit; without TRACE_IRQS_OFF */ - /* ebx: no swapgs flag */ - DISABLE_INTERRUPTS(CLBR_NONE) testl %ebx,%ebx /* swapgs needed? */ jnz nmi_restore - testl $3,CS(%rsp) - jnz nmi_userspace nmi_swapgs: SWAPGS_UNSAFE_STACK nmi_restore: RESTORE_ALL 8 + /* Clear the NMI executing stack variable */ + movq $0, 10*8(%rsp) jmp irq_return -nmi_userspace: - GET_THREAD_INFO(%rcx) - movl TI_flags(%rcx),%ebx - andl $_TIF_WORK_MASK,%ebx - jz nmi_swapgs - movq %rsp,%rdi /* &pt_regs */ - call sync_regs - movq %rax,%rsp /* switch stack for scheduling */ - testl $_TIF_NEED_RESCHED,%ebx - jnz nmi_schedule - movl %ebx,%edx /* arg3: thread flags */ - ENABLE_INTERRUPTS(CLBR_NONE) - xorl %esi,%esi /* arg2: oldset */ - movq %rsp,%rdi /* arg1: &pt_regs */ - call do_notify_resume - DISABLE_INTERRUPTS(CLBR_NONE) - jmp nmi_userspace -nmi_schedule: - ENABLE_INTERRUPTS(CLBR_ANY) - call schedule - DISABLE_INTERRUPTS(CLBR_ANY) - jmp nmi_userspace - CFI_ENDPROC -#else - jmp paranoid_exit CFI_ENDPROC -#endif END(nmi) + /* + * If an NMI hit an iret because of an exception or breakpoint, + * it can lose its NMI context, and a nested NMI may come in. + * In that case, the nested NMI will change the preempted NMI's + * stack to jump to here when it does the final iret. + */ +repeat_nmi: + INTR_FRAME + /* Update the stack variable to say we are still in NMI */ + movq $1, 5*8(%rsp) + + /* copy the saved stack back to copy stack */ + .rept 5 + pushq_cfi 4*8(%rsp) + .endr + + jmp restart_nmi + CFI_ENDPROC +end_repeat_nmi: + ENTRY(ignore_sysret) CFI_STARTPROC mov $-ENOSYS,%eax diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index e11e39478a49..40f4eb3766d1 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -417,6 +417,10 @@ ENTRY(phys_base) ENTRY(idt_table) .skip IDT_ENTRIES * 16 + .align L1_CACHE_BYTES +ENTRY(nmi_idt_table) + .skip IDT_ENTRIES * 16 + __PAGE_ALIGNED_BSS .align PAGE_SIZE ENTRY(empty_zero_page) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index e88f37b58ddd..47acaf319165 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -405,9 +405,108 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) unknown_nmi_error(reason, regs); } +/* + * NMIs can hit breakpoints which will cause it to lose its + * NMI context with the CPU when the breakpoint does an iret. + */ +#ifdef CONFIG_X86_32 +/* + * For i386, NMIs use the same stack as the kernel, and we can + * add a workaround to the iret problem in C. Simply have 3 states + * the NMI can be in. + * + * 1) not running + * 2) executing + * 3) latched + * + * When no NMI is in progress, it is in the "not running" state. + * When an NMI comes in, it goes into the "executing" state. + * Normally, if another NMI is triggered, it does not interrupt + * the running NMI and the HW will simply latch it so that when + * the first NMI finishes, it will restart the second NMI. + * (Note, the latch is binary, thus multiple NMIs triggering, + * when one is running, are ignored. Only one NMI is restarted.) + * + * If an NMI hits a breakpoint that executes an iret, another + * NMI can preempt it. We do not want to allow this new NMI + * to run, but we want to execute it when the first one finishes. + * We set the state to "latched", and the first NMI will perform + * an cmpxchg on the state, and if it doesn't successfully + * reset the state to "not running" it will restart the next + * NMI. + */ +enum nmi_states { + NMI_NOT_RUNNING, + NMI_EXECUTING, + NMI_LATCHED, +}; +static DEFINE_PER_CPU(enum nmi_states, nmi_state); + +#define nmi_nesting_preprocess(regs) \ + do { \ + if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) { \ + __get_cpu_var(nmi_state) = NMI_LATCHED; \ + return; \ + } \ + nmi_restart: \ + __get_cpu_var(nmi_state) = NMI_EXECUTING; \ + } while (0) + +#define nmi_nesting_postprocess() \ + do { \ + if (cmpxchg(&__get_cpu_var(nmi_state), \ + NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING) \ + goto nmi_restart; \ + } while (0) +#else /* x86_64 */ +/* + * In x86_64 things are a bit more difficult. This has the same problem + * where an NMI hitting a breakpoint that calls iret will remove the + * NMI context, allowing a nested NMI to enter. What makes this more + * difficult is that both NMIs and breakpoints have their own stack. + * When a new NMI or breakpoint is executed, the stack is set to a fixed + * point. If an NMI is nested, it will have its stack set at that same + * fixed address that the first NMI had, and will start corrupting the + * stack. This is handled in entry_64.S, but the same problem exists with + * the breakpoint stack. + * + * If a breakpoint is being processed, and the debug stack is being used, + * if an NMI comes in and also hits a breakpoint, the stack pointer + * will be set to the same fixed address as the breakpoint that was + * interrupted, causing that stack to be corrupted. To handle this case, + * check if the stack that was interrupted is the debug stack, and if + * so, change the IDT so that new breakpoints will use the current stack + * and not switch to the fixed address. On return of the NMI, switch back + * to the original IDT. + */ +static DEFINE_PER_CPU(int, update_debug_stack); + +static inline void nmi_nesting_preprocess(struct pt_regs *regs) +{ + /* + * If we interrupted a breakpoint, it is possible that + * the nmi handler will have breakpoints too. We need to + * change the IDT such that breakpoints that happen here + * continue to use the NMI stack. + */ + if (unlikely(is_debug_stack(regs->sp))) { + debug_stack_set_zero(); + __get_cpu_var(update_debug_stack) = 1; + } +} + +static inline void nmi_nesting_postprocess(void) +{ + if (unlikely(__get_cpu_var(update_debug_stack))) + debug_stack_reset(); +} +#endif + dotraplinkage notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) { + nmi_nesting_preprocess(regs); + nmi_enter(); inc_irq_stat(__nmi_count); @@ -416,6 +515,9 @@ do_nmi(struct pt_regs *regs, long error_code) default_do_nmi(regs); nmi_exit(); + + /* On i386, may loop back to preprocess */ + nmi_nesting_postprocess(); } void stop_nmi(void) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 89a04c7b5bb6..50267386b766 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1392,20 +1392,18 @@ long syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->orig_ax); - if (unlikely(current->audit_context)) { - if (IS_IA32) - audit_syscall_entry(AUDIT_ARCH_I386, - regs->orig_ax, - regs->bx, regs->cx, - regs->dx, regs->si); + if (IS_IA32) + audit_syscall_entry(AUDIT_ARCH_I386, + regs->orig_ax, + regs->bx, regs->cx, + regs->dx, regs->si); #ifdef CONFIG_X86_64 - else - audit_syscall_entry(AUDIT_ARCH_X86_64, - regs->orig_ax, - regs->di, regs->si, - regs->dx, regs->r10); + else + audit_syscall_entry(AUDIT_ARCH_X86_64, + regs->orig_ax, + regs->di, regs->si, + regs->dx, regs->r10); #endif - } return ret ?: regs->orig_ax; } @@ -1414,8 +1412,7 @@ void syscall_trace_leave(struct pt_regs *regs) { bool step; - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); + audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->ax); diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c new file mode 100644 index 000000000000..147fcd4941c4 --- /dev/null +++ b/arch/x86/kernel/syscall_32.c @@ -0,0 +1,25 @@ +/* System call table for i386. */ + +#include <linux/linkage.h> +#include <linux/sys.h> +#include <linux/cache.h> +#include <asm/asm-offsets.h> + +#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ; +#include <asm/syscalls_32.h> +#undef __SYSCALL_I386 + +#define __SYSCALL_I386(nr, sym, compat) [nr] = sym, + +typedef asmlinkage void (*sys_call_ptr_t)(void); + +extern asmlinkage void sys_ni_syscall(void); + +const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { + /* + * Smells like a compiler bug -- it doesn't work + * when the & below is removed. + */ + [0 ... __NR_syscall_max] = &sys_ni_syscall, +#include <asm/syscalls_32.h> +}; diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c index de87d6008295..7ac7943be02c 100644 --- a/arch/x86/kernel/syscall_64.c +++ b/arch/x86/kernel/syscall_64.c @@ -5,15 +5,11 @@ #include <linux/cache.h> #include <asm/asm-offsets.h> -#define __NO_STUBS +#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; +#include <asm/syscalls_64.h> +#undef __SYSCALL_64 -#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; -#undef _ASM_X86_UNISTD_64_H -#include <asm/unistd_64.h> - -#undef __SYSCALL -#define __SYSCALL(nr, sym) [nr] = sym, -#undef _ASM_X86_UNISTD_64_H +#define __SYSCALL_64(nr, sym, compat) [nr] = sym, typedef void (*sys_call_ptr_t)(void); @@ -21,9 +17,9 @@ extern void sys_ni_syscall(void); const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { /* - *Smells like a like a compiler bug -- it doesn't work - *when the & below is removed. - */ + * Smells like a compiler bug -- it doesn't work + * when the & below is removed. + */ [0 ... __NR_syscall_max] = &sys_ni_syscall, -#include <asm/unistd_64.h> +#include <asm/syscalls_64.h> }; diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S deleted file mode 100644 index 9a0e31293920..000000000000 --- a/arch/x86/kernel/syscall_table_32.S +++ /dev/null @@ -1,350 +0,0 @@ -ENTRY(sys_call_table) - .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */ - .long sys_exit - .long ptregs_fork - .long sys_read - .long sys_write - .long sys_open /* 5 */ - .long sys_close - .long sys_waitpid - .long sys_creat - .long sys_link - .long sys_unlink /* 10 */ - .long ptregs_execve - .long sys_chdir - .long sys_time - .long sys_mknod - .long sys_chmod /* 15 */ - .long sys_lchown16 - .long sys_ni_syscall /* old break syscall holder */ - .long sys_stat - .long sys_lseek - .long sys_getpid /* 20 */ - .long sys_mount - .long sys_oldumount - .long sys_setuid16 - .long sys_getuid16 - .long sys_stime /* 25 */ - .long sys_ptrace - .long sys_alarm - .long sys_fstat - .long sys_pause - .long sys_utime /* 30 */ - .long sys_ni_syscall /* old stty syscall holder */ - .long sys_ni_syscall /* old gtty syscall holder */ - .long sys_access - .long sys_nice - .long sys_ni_syscall /* 35 - old ftime syscall holder */ - .long sys_sync - .long sys_kill - .long sys_rename - .long sys_mkdir - .long sys_rmdir /* 40 */ - .long sys_dup - .long sys_pipe - .long sys_times - .long sys_ni_syscall /* old prof syscall holder */ - .long sys_brk /* 45 */ - .long sys_setgid16 - .long sys_getgid16 - .long sys_signal - .long sys_geteuid16 - .long sys_getegid16 /* 50 */ - .long sys_acct - .long sys_umount /* recycled never used phys() */ - .long sys_ni_syscall /* old lock syscall holder */ - .long sys_ioctl - .long sys_fcntl /* 55 */ - .long sys_ni_syscall /* old mpx syscall holder */ - .long sys_setpgid - .long sys_ni_syscall /* old ulimit syscall holder */ - .long sys_olduname - .long sys_umask /* 60 */ - .long sys_chroot - .long sys_ustat - .long sys_dup2 - .long sys_getppid - .long sys_getpgrp /* 65 */ - .long sys_setsid - .long sys_sigaction - .long sys_sgetmask - .long sys_ssetmask - .long sys_setreuid16 /* 70 */ - .long sys_setregid16 - .long sys_sigsuspend - .long sys_sigpending - .long sys_sethostname - .long sys_setrlimit /* 75 */ - .long sys_old_getrlimit - .long sys_getrusage - .long sys_gettimeofday - .long sys_settimeofday - .long sys_getgroups16 /* 80 */ - .long sys_setgroups16 - .long sys_old_select - .long sys_symlink - .long sys_lstat - .long sys_readlink /* 85 */ - .long sys_uselib - .long sys_swapon - .long sys_reboot - .long sys_old_readdir - .long sys_old_mmap /* 90 */ - .long sys_munmap - .long sys_truncate - .long sys_ftruncate - .long sys_fchmod - .long sys_fchown16 /* 95 */ - .long sys_getpriority - .long sys_setpriority - .long sys_ni_syscall /* old profil syscall holder */ - .long sys_statfs - .long sys_fstatfs /* 100 */ - .long sys_ioperm - .long sys_socketcall - .long sys_syslog - .long sys_setitimer - .long sys_getitimer /* 105 */ - .long sys_newstat - .long sys_newlstat - .long sys_newfstat - .long sys_uname - .long ptregs_iopl /* 110 */ - .long sys_vhangup - .long sys_ni_syscall /* old "idle" system call */ - .long ptregs_vm86old - .long sys_wait4 - .long sys_swapoff /* 115 */ - .long sys_sysinfo - .long sys_ipc - .long sys_fsync - .long ptregs_sigreturn - .long ptregs_clone /* 120 */ - .long sys_setdomainname - .long sys_newuname - .long sys_modify_ldt - .long sys_adjtimex - .long sys_mprotect /* 125 */ - .long sys_sigprocmask - .long sys_ni_syscall /* old "create_module" */ - .long sys_init_module - .long sys_delete_module - .long sys_ni_syscall /* 130: old "get_kernel_syms" */ - .long sys_quotactl - .long sys_getpgid - .long sys_fchdir - .long sys_bdflush - .long sys_sysfs /* 135 */ - .long sys_personality - .long sys_ni_syscall /* reserved for afs_syscall */ - .long sys_setfsuid16 - .long sys_setfsgid16 - .long sys_llseek /* 140 */ - .long sys_getdents - .long sys_select - .long sys_flock - .long sys_msync - .long sys_readv /* 145 */ - .long sys_writev - .long sys_getsid - .long sys_fdatasync - .long sys_sysctl - .long sys_mlock /* 150 */ - .long sys_munlock - .long sys_mlockall - .long sys_munlockall - .long sys_sched_setparam - .long sys_sched_getparam /* 155 */ - .long sys_sched_setscheduler - .long sys_sched_getscheduler - .long sys_sched_yield - .long sys_sched_get_priority_max - .long sys_sched_get_priority_min /* 160 */ - .long sys_sched_rr_get_interval - .long sys_nanosleep - .long sys_mremap - .long sys_setresuid16 - .long sys_getresuid16 /* 165 */ - .long ptregs_vm86 - .long sys_ni_syscall /* Old sys_query_module */ - .long sys_poll - .long sys_ni_syscall /* Old nfsservctl */ - .long sys_setresgid16 /* 170 */ - .long sys_getresgid16 - .long sys_prctl - .long ptregs_rt_sigreturn - .long sys_rt_sigaction - .long sys_rt_sigprocmask /* 175 */ - .long sys_rt_sigpending - .long sys_rt_sigtimedwait - .long sys_rt_sigqueueinfo - .long sys_rt_sigsuspend - .long sys_pread64 /* 180 */ - .long sys_pwrite64 - .long sys_chown16 - .long sys_getcwd - .long sys_capget - .long sys_capset /* 185 */ - .long ptregs_sigaltstack - .long sys_sendfile - .long sys_ni_syscall /* reserved for streams1 */ - .long sys_ni_syscall /* reserved for streams2 */ - .long ptregs_vfork /* 190 */ - .long sys_getrlimit - .long sys_mmap_pgoff - .long sys_truncate64 - .long sys_ftruncate64 - .long sys_stat64 /* 195 */ - .long sys_lstat64 - .long sys_fstat64 - .long sys_lchown - .long sys_getuid - .long sys_getgid /* 200 */ - .long sys_geteuid - .long sys_getegid - .long sys_setreuid - .long sys_setregid - .long sys_getgroups /* 205 */ - .long sys_setgroups - .long sys_fchown - .long sys_setresuid - .long sys_getresuid - .long sys_setresgid /* 210 */ - .long sys_getresgid - .long sys_chown - .long sys_setuid - .long sys_setgid - .long sys_setfsuid /* 215 */ - .long sys_setfsgid - .long sys_pivot_root - .long sys_mincore - .long sys_madvise - .long sys_getdents64 /* 220 */ - .long sys_fcntl64 - .long sys_ni_syscall /* reserved for TUX */ - .long sys_ni_syscall - .long sys_gettid - .long sys_readahead /* 225 */ - .long sys_setxattr - .long sys_lsetxattr - .long sys_fsetxattr - .long sys_getxattr - .long sys_lgetxattr /* 230 */ - .long sys_fgetxattr - .long sys_listxattr - .long sys_llistxattr - .long sys_flistxattr - .long sys_removexattr /* 235 */ - .long sys_lremovexattr - .long sys_fremovexattr - .long sys_tkill - .long sys_sendfile64 - .long sys_futex /* 240 */ - .long sys_sched_setaffinity - .long sys_sched_getaffinity - .long sys_set_thread_area - .long sys_get_thread_area - .long sys_io_setup /* 245 */ - .long sys_io_destroy - .long sys_io_getevents - .long sys_io_submit - .long sys_io_cancel - .long sys_fadvise64 /* 250 */ - .long sys_ni_syscall - .long sys_exit_group - .long sys_lookup_dcookie - .long sys_epoll_create - .long sys_epoll_ctl /* 255 */ - .long sys_epoll_wait - .long sys_remap_file_pages - .long sys_set_tid_address - .long sys_timer_create - .long sys_timer_settime /* 260 */ - .long sys_timer_gettime - .long sys_timer_getoverrun - .long sys_timer_delete - .long sys_clock_settime - .long sys_clock_gettime /* 265 */ - .long sys_clock_getres - .long sys_clock_nanosleep - .long sys_statfs64 - .long sys_fstatfs64 - .long sys_tgkill /* 270 */ - .long sys_utimes - .long sys_fadvise64_64 - .long sys_ni_syscall /* sys_vserver */ - .long sys_mbind - .long sys_get_mempolicy - .long sys_set_mempolicy - .long sys_mq_open - .long sys_mq_unlink - .long sys_mq_timedsend - .long sys_mq_timedreceive /* 280 */ - .long sys_mq_notify - .long sys_mq_getsetattr - .long sys_kexec_load - .long sys_waitid - .long sys_ni_syscall /* 285 */ /* available */ - .long sys_add_key - .long sys_request_key - .long sys_keyctl - .long sys_ioprio_set - .long sys_ioprio_get /* 290 */ - .long sys_inotify_init - .long sys_inotify_add_watch - .long sys_inotify_rm_watch - .long sys_migrate_pages - .long sys_openat /* 295 */ - .long sys_mkdirat - .long sys_mknodat - .long sys_fchownat - .long sys_futimesat - .long sys_fstatat64 /* 300 */ - .long sys_unlinkat - .long sys_renameat - .long sys_linkat - .long sys_symlinkat - .long sys_readlinkat /* 305 */ - .long sys_fchmodat - .long sys_faccessat - .long sys_pselect6 - .long sys_ppoll - .long sys_unshare /* 310 */ - .long sys_set_robust_list - .long sys_get_robust_list - .long sys_splice - .long sys_sync_file_range - .long sys_tee /* 315 */ - .long sys_vmsplice - .long sys_move_pages - .long sys_getcpu - .long sys_epoll_pwait - .long sys_utimensat /* 320 */ - .long sys_signalfd - .long sys_timerfd_create - .long sys_eventfd - .long sys_fallocate - .long sys_timerfd_settime /* 325 */ - .long sys_timerfd_gettime - .long sys_signalfd4 - .long sys_eventfd2 - .long sys_epoll_create1 - .long sys_dup3 /* 330 */ - .long sys_pipe2 - .long sys_inotify_init1 - .long sys_preadv - .long sys_pwritev - .long sys_rt_tgsigqueueinfo /* 335 */ - .long sys_perf_event_open - .long sys_recvmmsg - .long sys_fanotify_init - .long sys_fanotify_mark - .long sys_prlimit64 /* 340 */ - .long sys_name_to_handle_at - .long sys_open_by_handle_at - .long sys_clock_adjtime - .long sys_syncfs - .long sys_sendmmsg /* 345 */ - .long sys_setns - .long sys_process_vm_readv - .long sys_process_vm_writev diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index fa1191fb679d..482ec3af2067 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -311,9 +311,15 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) == NOTIFY_STOP) return; + /* + * Let others (NMI) know that the debug stack is in use + * as we may switch to the interrupt stack. + */ + debug_stack_usage_inc(); preempt_conditional_sti(regs); do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); preempt_conditional_cli(regs); + debug_stack_usage_dec(); } #ifdef CONFIG_X86_64 @@ -406,6 +412,12 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) SIGTRAP) == NOTIFY_STOP) return; + /* + * Let others (NMI) know that the debug stack is in use + * as we may switch to the interrupt stack. + */ + debug_stack_usage_inc(); + /* It's safe to allow irq's after DR6 has been saved */ preempt_conditional_sti(regs); @@ -413,6 +425,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); preempt_conditional_cli(regs); + debug_stack_usage_dec(); return; } @@ -432,6 +445,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) send_sigtrap(tsk, regs, error_code, si_code); preempt_conditional_cli(regs); + debug_stack_usage_dec(); return; } @@ -718,4 +732,10 @@ void __init trap_init(void) cpu_init(); x86_init.irqs.trap_init(); + +#ifdef CONFIG_X86_64 + memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16); + set_nmi_gate(1, &debug); + set_nmi_gate(3, &int3); +#endif } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index c0dd5b603749..a62c201c97ec 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -290,14 +290,15 @@ static inline int pit_verify_msb(unsigned char val) static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap) { int count; - u64 tsc = 0; + u64 tsc = 0, prev_tsc = 0; for (count = 0; count < 50000; count++) { if (!pit_verify_msb(val)) break; + prev_tsc = tsc; tsc = get_cycles(); } - *deltap = get_cycles() - tsc; + *deltap = get_cycles() - prev_tsc; *tscp = tsc; /* @@ -311,9 +312,9 @@ static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *de * How many MSB values do we want to see? We aim for * a maximum error rate of 500ppm (in practice the * real error is much smaller), but refuse to spend - * more than 25ms on it. + * more than 50ms on it. */ -#define MAX_QUICK_PIT_MS 25 +#define MAX_QUICK_PIT_MS 50 #define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256) static unsigned long quick_pit_calibrate(void) @@ -383,15 +384,12 @@ success: * * As a result, we can depend on there not being * any odd delays anywhere, and the TSC reads are - * reliable (within the error). We also adjust the - * delta to the middle of the error bars, just - * because it looks nicer. + * reliable (within the error). * * kHz = ticks / time-in-seconds / 1000; * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000) */ - delta += (long)(d2 - d1)/2; delta *= PIT_TICK_RATE; do_div(delta, i*256*1000); printk("Fast TSC calibration using PIT\n"); diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 863f8753ab0a..b466cab5ba15 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -335,9 +335,11 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk if (info->flags & VM86_SCREEN_BITMAP) mark_screen_rdonly(tsk->mm); - /*call audit_syscall_exit since we do not exit via the normal paths */ + /*call __audit_syscall_exit since we do not exit via the normal paths */ +#ifdef CONFIG_AUDITSYSCALL if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(0), 0); + __audit_syscall_exit(1, 0); +#endif __asm__ __volatile__( "movl %0,%%esp\n\t" |