diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-04-13 13:16:36 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-04-13 13:16:36 -0700 |
commit | 60f898eeaaa1c5d0162a4240bacf33a6c87ecef6 (patch) | |
tree | 23eeac4b1e9a616779d22c104dbc8bd45dfeefd1 /arch/x86/include | |
parent | 977e1ba50893c15121557b39de586901fe3f75cf (diff) | |
parent | 3b75232d55680ca166dffa274d0587d5faf0a016 (diff) |
Merge branch 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 asm changes from Ingo Molnar:
"There were lots of changes in this development cycle:
- over 100 separate cleanups, restructuring changes, speedups and
fixes in the x86 system call, irq, trap and other entry code, part
of a heroic effort to deobfuscate a decade old spaghetti asm code
and its C code dependencies (Denys Vlasenko, Andy Lutomirski)
- alternatives code fixes and enhancements (Borislav Petkov)
- simplifications and cleanups to the compat code (Brian Gerst)
- signal handling fixes and new x86 testcases (Andy Lutomirski)
- various other fixes and cleanups
By their nature many of these changes are risky - we tried to test
them well on many different x86 systems (there are no known
regressions), and they are split up finely to help bisection - but
there's still a fair bit of residual risk left so caveat emptor"
* 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (148 commits)
perf/x86/64: Report regs_user->ax too in get_regs_user()
perf/x86/64: Simplify regs_user->abi setting code in get_regs_user()
perf/x86/64: Do report user_regs->cx while we are in syscall, in get_regs_user()
perf/x86/64: Do not guess user_regs->cs, ss, sp in get_regs_user()
x86/asm/entry/32: Tidy up JNZ instructions after TESTs
x86/asm/entry/64: Reduce padding in execve stubs
x86/asm/entry/64: Remove GET_THREAD_INFO() in ret_from_fork
x86/asm/entry/64: Simplify jumps in ret_from_fork
x86/asm/entry/64: Remove a redundant jump
x86/asm/entry/64: Optimize [v]fork/clone stubs
x86/asm/entry: Zero EXTRA_REGS for stub32_execve() too
x86/asm/entry/64: Move stub_x32_execvecloser() to stub_execveat()
x86/asm/entry/64: Use common code for rt_sigreturn() epilogue
x86/asm/entry/64: Add forgotten CFI annotation
x86/asm/entry/irq: Simplify interrupt dispatch table (IDT) layout
x86/asm/entry/64: Move opportunistic sysret code to syscall code path
x86, selftests: Add sigreturn selftest
x86/alternatives: Guard NOPs optimization
x86/asm/entry: Clear EXTRA_REGS for all executable formats
x86/signal: Remove pax argument from restore_sigcontext
...
Diffstat (limited to 'arch/x86/include')
28 files changed, 697 insertions, 490 deletions
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index 372231c22a47..bdf02eeee765 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -18,12 +18,63 @@ .endm #endif -.macro altinstruction_entry orig alt feature orig_len alt_len +.macro altinstruction_entry orig alt feature orig_len alt_len pad_len .long \orig - . .long \alt - . .word \feature .byte \orig_len .byte \alt_len + .byte \pad_len +.endm + +.macro ALTERNATIVE oldinstr, newinstr, feature +140: + \oldinstr +141: + .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90 +142: + + .pushsection .altinstructions,"a" + altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b + .popsection + + .pushsection .altinstr_replacement,"ax" +143: + \newinstr +144: + .popsection +.endm + +#define old_len 141b-140b +#define new_len1 144f-143f +#define new_len2 145f-144f + +/* + * max without conditionals. Idea adapted from: + * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax + */ +#define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) + +.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2 +140: + \oldinstr +141: + .skip -((alt_max_short(new_len1, new_len2) - (old_len)) > 0) * \ + (alt_max_short(new_len1, new_len2) - (old_len)),0x90 +142: + + .pushsection .altinstructions,"a" + altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b + altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b + .popsection + + .pushsection .altinstr_replacement,"ax" +143: + \newinstr1 +144: + \newinstr2 +145: + .popsection .endm #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 473bdbee378a..ba32af062f61 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -48,8 +48,9 @@ struct alt_instr { s32 repl_offset; /* offset to replacement instruction */ u16 cpuid; /* cpuid bit set for replacement */ u8 instrlen; /* length of original instruction */ - u8 replacementlen; /* length of new instruction, <= instrlen */ -}; + u8 replacementlen; /* length of new instruction */ + u8 padlen; /* length of build-time padding */ +} __packed; extern void alternative_instructions(void); extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); @@ -76,50 +77,69 @@ static inline int alternatives_text_reserved(void *start, void *end) } #endif /* CONFIG_SMP */ -#define OLDINSTR(oldinstr) "661:\n\t" oldinstr "\n662:\n" +#define b_replacement(num) "664"#num +#define e_replacement(num) "665"#num -#define b_replacement(number) "663"#number -#define e_replacement(number) "664"#number +#define alt_end_marker "663" +#define alt_slen "662b-661b" +#define alt_pad_len alt_end_marker"b-662b" +#define alt_total_slen alt_end_marker"b-661b" +#define alt_rlen(num) e_replacement(num)"f-"b_replacement(num)"f" -#define alt_slen "662b-661b" -#define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f" +#define __OLDINSTR(oldinstr, num) \ + "661:\n\t" oldinstr "\n662:\n" \ + ".skip -(((" alt_rlen(num) ")-(" alt_slen ")) > 0) * " \ + "((" alt_rlen(num) ")-(" alt_slen ")),0x90\n" -#define ALTINSTR_ENTRY(feature, number) \ +#define OLDINSTR(oldinstr, num) \ + __OLDINSTR(oldinstr, num) \ + alt_end_marker ":\n" + +/* + * max without conditionals. Idea adapted from: + * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax + * + * The additional "-" is needed because gas works with s32s. + */ +#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") - (" b ")))))" + +/* + * Pad the second replacement alternative with additional NOPs if it is + * additionally longer than the first replacement alternative. + */ +#define OLDINSTR_2(oldinstr, num1, num2) \ + "661:\n\t" oldinstr "\n662:\n" \ + ".skip -((" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")) > 0) * " \ + "(" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")), 0x90\n" \ + alt_end_marker ":\n" + +#define ALTINSTR_ENTRY(feature, num) \ " .long 661b - .\n" /* label */ \ - " .long " b_replacement(number)"f - .\n" /* new instruction */ \ + " .long " b_replacement(num)"f - .\n" /* new instruction */ \ " .word " __stringify(feature) "\n" /* feature bit */ \ - " .byte " alt_slen "\n" /* source len */ \ - " .byte " alt_rlen(number) "\n" /* replacement len */ - -#define DISCARD_ENTRY(number) /* rlen <= slen */ \ - " .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n" + " .byte " alt_total_slen "\n" /* source len */ \ + " .byte " alt_rlen(num) "\n" /* replacement len */ \ + " .byte " alt_pad_len "\n" /* pad len */ -#define ALTINSTR_REPLACEMENT(newinstr, feature, number) /* replacement */ \ - b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) ":\n\t" +#define ALTINSTR_REPLACEMENT(newinstr, feature, num) /* replacement */ \ + b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n\t" /* alternative assembly primitive: */ #define ALTERNATIVE(oldinstr, newinstr, feature) \ - OLDINSTR(oldinstr) \ + OLDINSTR(oldinstr, 1) \ ".pushsection .altinstructions,\"a\"\n" \ ALTINSTR_ENTRY(feature, 1) \ ".popsection\n" \ - ".pushsection .discard,\"aw\",@progbits\n" \ - DISCARD_ENTRY(1) \ - ".popsection\n" \ ".pushsection .altinstr_replacement, \"ax\"\n" \ ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ ".popsection" #define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ - OLDINSTR(oldinstr) \ + OLDINSTR_2(oldinstr, 1, 2) \ ".pushsection .altinstructions,\"a\"\n" \ ALTINSTR_ENTRY(feature1, 1) \ ALTINSTR_ENTRY(feature2, 2) \ ".popsection\n" \ - ".pushsection .discard,\"aw\",@progbits\n" \ - DISCARD_ENTRY(1) \ - DISCARD_ENTRY(2) \ - ".popsection\n" \ ".pushsection .altinstr_replacement, \"ax\"\n" \ ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ @@ -146,6 +166,9 @@ static inline int alternatives_text_reserved(void *start, void *end) #define alternative(oldinstr, newinstr, feature) \ asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory") +#define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \ + asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory") + /* * Alternative inline assembly with input. * diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 08f217354442..976b86a325e5 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -91,7 +91,7 @@ static inline void native_apic_mem_write(u32 reg, u32 v) { volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg); - alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP, + alternative_io("movl %0, %P1", "xchgl %0, %P1", X86_BUG_11AP, ASM_OUTPUT2("=r" (v), "=m" (*addr)), ASM_OUTPUT2("0" (v), "m" (*addr))); } diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 2ab1eb33106e..959e45b81fe2 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -95,13 +95,11 @@ do { \ * Stop RDTSC speculation. This is needed when you need to use RDTSC * (or get_cycles or vread that possibly accesses the TSC) in a defined * code region. - * - * (Could use an alternative three way for this if there was one.) */ static __always_inline void rdtsc_barrier(void) { - alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); - alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); + alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, + "lfence", X86_FEATURE_LFENCE_RDTSC); } #endif /* _ASM_X86_BARRIER_H */ diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h index 1f1297b46f83..1c8b50edb2db 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/include/asm/calling.h @@ -55,143 +55,157 @@ For 32-bit we have the following conventions - kernel is built with * for assembly code: */ -#define R15 0 -#define R14 8 -#define R13 16 -#define R12 24 -#define RBP 32 -#define RBX 40 - -/* arguments: interrupts/non tracing syscalls only save up to here: */ -#define R11 48 -#define R10 56 -#define R9 64 -#define R8 72 -#define RAX 80 -#define RCX 88 -#define RDX 96 -#define RSI 104 -#define RDI 112 -#define ORIG_RAX 120 /* + error_code */ -/* end of arguments */ - -/* cpu exception frame or undefined in case of fast syscall: */ -#define RIP 128 -#define CS 136 -#define EFLAGS 144 -#define RSP 152 -#define SS 160 - -#define ARGOFFSET R11 - - .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0 - subq $9*8+\addskip, %rsp - CFI_ADJUST_CFA_OFFSET 9*8+\addskip - movq_cfi rdi, 8*8 - movq_cfi rsi, 7*8 - movq_cfi rdx, 6*8 - - .if \save_rcx - movq_cfi rcx, 5*8 - .endif +/* The layout forms the "struct pt_regs" on the stack: */ +/* + * C ABI says these regs are callee-preserved. They aren't saved on kernel entry + * unless syscall needs a complete, fully filled "struct pt_regs". + */ +#define R15 0*8 +#define R14 1*8 +#define R13 2*8 +#define R12 3*8 +#define RBP 4*8 +#define RBX 5*8 +/* These regs are callee-clobbered. Always saved on kernel entry. */ +#define R11 6*8 +#define R10 7*8 +#define R9 8*8 +#define R8 9*8 +#define RAX 10*8 +#define RCX 11*8 +#define RDX 12*8 +#define RSI 13*8 +#define RDI 14*8 +/* + * On syscall entry, this is syscall#. On CPU exception, this is error code. + * On hw interrupt, it's IRQ number: + */ +#define ORIG_RAX 15*8 +/* Return frame for iretq */ +#define RIP 16*8 +#define CS 17*8 +#define EFLAGS 18*8 +#define RSP 19*8 +#define SS 20*8 + +#define SIZEOF_PTREGS 21*8 + + .macro ALLOC_PT_GPREGS_ON_STACK addskip=0 + subq $15*8+\addskip, %rsp + CFI_ADJUST_CFA_OFFSET 15*8+\addskip + .endm - .if \rax_enosys - movq $-ENOSYS, 4*8(%rsp) - .else - movq_cfi rax, 4*8 + .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1 + .if \r11 + movq_cfi r11, 6*8+\offset .endif - - .if \save_r891011 - movq_cfi r8, 3*8 - movq_cfi r9, 2*8 - movq_cfi r10, 1*8 - movq_cfi r11, 0*8 + .if \r8910 + movq_cfi r10, 7*8+\offset + movq_cfi r9, 8*8+\offset + movq_cfi r8, 9*8+\offset + .endif + .if \rax + movq_cfi rax, 10*8+\offset + .endif + .if \rcx + movq_cfi rcx, 11*8+\offset .endif + movq_cfi rdx, 12*8+\offset + movq_cfi rsi, 13*8+\offset + movq_cfi rdi, 14*8+\offset + .endm + .macro SAVE_C_REGS offset=0 + SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1 + .endm + .macro SAVE_C_REGS_EXCEPT_RAX_RCX offset=0 + SAVE_C_REGS_HELPER \offset, 0, 0, 1, 1 + .endm + .macro SAVE_C_REGS_EXCEPT_R891011 + SAVE_C_REGS_HELPER 0, 1, 1, 0, 0 + .endm + .macro SAVE_C_REGS_EXCEPT_RCX_R891011 + SAVE_C_REGS_HELPER 0, 1, 0, 0, 0 + .endm + .macro SAVE_C_REGS_EXCEPT_RAX_RCX_R11 + SAVE_C_REGS_HELPER 0, 0, 0, 1, 0 + .endm + + .macro SAVE_EXTRA_REGS offset=0 + movq_cfi r15, 0*8+\offset + movq_cfi r14, 1*8+\offset + movq_cfi r13, 2*8+\offset + movq_cfi r12, 3*8+\offset + movq_cfi rbp, 4*8+\offset + movq_cfi rbx, 5*8+\offset + .endm + .macro SAVE_EXTRA_REGS_RBP offset=0 + movq_cfi rbp, 4*8+\offset + .endm + .macro RESTORE_EXTRA_REGS offset=0 + movq_cfi_restore 0*8+\offset, r15 + movq_cfi_restore 1*8+\offset, r14 + movq_cfi_restore 2*8+\offset, r13 + movq_cfi_restore 3*8+\offset, r12 + movq_cfi_restore 4*8+\offset, rbp + movq_cfi_restore 5*8+\offset, rbx .endm -#define ARG_SKIP (9*8) + .macro ZERO_EXTRA_REGS + xorl %r15d, %r15d + xorl %r14d, %r14d + xorl %r13d, %r13d + xorl %r12d, %r12d + xorl %ebp, %ebp + xorl %ebx, %ebx + .endm - .macro RESTORE_ARGS rstor_rax=1, addskip=0, rstor_rcx=1, rstor_r11=1, \ - rstor_r8910=1, rstor_rdx=1 + .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 .if \rstor_r11 - movq_cfi_restore 0*8, r11 + movq_cfi_restore 6*8, r11 .endif - .if \rstor_r8910 - movq_cfi_restore 1*8, r10 - movq_cfi_restore 2*8, r9 - movq_cfi_restore 3*8, r8 + movq_cfi_restore 7*8, r10 + movq_cfi_restore 8*8, r9 + movq_cfi_restore 9*8, r8 .endif - .if \rstor_rax - movq_cfi_restore 4*8, rax + movq_cfi_restore 10*8, rax .endif - .if \rstor_rcx - movq_cfi_restore 5*8, rcx + movq_cfi_restore 11*8, rcx .endif - .if \rstor_rdx - movq_cfi_restore 6*8, rdx - .endif - - movq_cfi_restore 7*8, rsi - movq_cfi_restore 8*8, rdi - - .if ARG_SKIP+\addskip > 0 - addq $ARG_SKIP+\addskip, %rsp - CFI_ADJUST_CFA_OFFSET -(ARG_SKIP+\addskip) + movq_cfi_restore 12*8, rdx .endif + movq_cfi_restore 13*8, rsi + movq_cfi_restore 14*8, rdi .endm - - .macro LOAD_ARGS offset, skiprax=0 - movq \offset(%rsp), %r11 - movq \offset+8(%rsp), %r10 - movq \offset+16(%rsp), %r9 - movq \offset+24(%rsp), %r8 - movq \offset+40(%rsp), %rcx - movq \offset+48(%rsp), %rdx - movq \offset+56(%rsp), %rsi - movq \offset+64(%rsp), %rdi - .if \skiprax - .else - movq \offset+72(%rsp), %rax - .endif + .macro RESTORE_C_REGS + RESTORE_C_REGS_HELPER 1,1,1,1,1 .endm - -#define REST_SKIP (6*8) - - .macro SAVE_REST - subq $REST_SKIP, %rsp - CFI_ADJUST_CFA_OFFSET REST_SKIP - movq_cfi rbx, 5*8 - movq_cfi rbp, 4*8 - movq_cfi r12, 3*8 - movq_cfi r13, 2*8 - movq_cfi r14, 1*8 - movq_cfi r15, 0*8 + .macro RESTORE_C_REGS_EXCEPT_RAX + RESTORE_C_REGS_HELPER 0,1,1,1,1 .endm - - .macro RESTORE_REST - movq_cfi_restore 0*8, r15 - movq_cfi_restore 1*8, r14 - movq_cfi_restore 2*8, r13 - movq_cfi_restore 3*8, r12 - movq_cfi_restore 4*8, rbp - movq_cfi_restore 5*8, rbx - addq $REST_SKIP, %rsp - CFI_ADJUST_CFA_OFFSET -(REST_SKIP) + .macro RESTORE_C_REGS_EXCEPT_RCX + RESTORE_C_REGS_HELPER 1,0,1,1,1 .endm - - .macro SAVE_ALL - SAVE_ARGS - SAVE_REST + .macro RESTORE_C_REGS_EXCEPT_R11 + RESTORE_C_REGS_HELPER 1,1,0,1,1 + .endm + .macro RESTORE_C_REGS_EXCEPT_RCX_R11 + RESTORE_C_REGS_HELPER 1,0,0,1,1 + .endm + .macro RESTORE_RSI_RDI + RESTORE_C_REGS_HELPER 0,0,0,0,0 + .endm + .macro RESTORE_RSI_RDI_RDX + RESTORE_C_REGS_HELPER 0,0,0,0,1 .endm - .macro RESTORE_ALL addskip=0 - RESTORE_REST - RESTORE_ARGS 1, \addskip + .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0 + addq $15*8+\addskip, %rsp + CFI_ADJUST_CFA_OFFSET -(15*8+\addskip) .endm .macro icebp @@ -210,37 +224,23 @@ For 32-bit we have the following conventions - kernel is built with */ .macro SAVE_ALL - pushl_cfi %eax - CFI_REL_OFFSET eax, 0 - pushl_cfi %ebp - CFI_REL_OFFSET ebp, 0 - pushl_cfi %edi - CFI_REL_OFFSET edi, 0 - pushl_cfi %esi - CFI_REL_OFFSET esi, 0 - pushl_cfi %edx - CFI_REL_OFFSET edx, 0 - pushl_cfi %ecx - CFI_REL_OFFSET ecx, 0 - pushl_cfi %ebx - CFI_REL_OFFSET ebx, 0 + pushl_cfi_reg eax + pushl_cfi_reg ebp + pushl_cfi_reg edi + pushl_cfi_reg esi + pushl_cfi_reg edx + pushl_cfi_reg ecx + pushl_cfi_reg ebx .endm .macro RESTORE_ALL - popl_cfi %ebx - CFI_RESTORE ebx - popl_cfi %ecx - CFI_RESTORE ecx - popl_cfi %edx - CFI_RESTORE edx - popl_cfi %esi - CFI_RESTORE esi - popl_cfi %edi - CFI_RESTORE edi - popl_cfi %ebp - CFI_RESTORE ebp - popl_cfi %eax - CFI_RESTORE eax + popl_cfi_reg ebx + popl_cfi_reg ecx + popl_cfi_reg edx + popl_cfi_reg esi + popl_cfi_reg edi + popl_cfi_reg ebp + popl_cfi_reg eax .endm #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 59c6c401f79f..acdee09228b3 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -301,7 +301,7 @@ static inline void __user *arch_compat_alloc_user_space(long len) sp = task_pt_regs(current)->sp; } else { /* -128 for the x32 ABI redzone */ - sp = this_cpu_read(old_rsp) - 128; + sp = task_pt_regs(current)->sp - 128; } return (void __user *)round_down(sp - len, 16); diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 90a54851aedc..854c04b3c9c2 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -231,7 +231,9 @@ #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ #define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ #define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ +#define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */ #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ +#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ #define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ #define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ #define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ @@ -418,6 +420,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) " .word %P0\n" /* 1: do replace */ " .byte 2b - 1b\n" /* source len */ " .byte 0\n" /* replacement len */ + " .byte 0\n" /* pad len */ ".previous\n" /* skipping size check since replacement size = 0 */ : : "i" (X86_FEATURE_ALWAYS) : : t_warn); @@ -432,6 +435,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) " .word %P0\n" /* feature bit */ " .byte 2b - 1b\n" /* source len */ " .byte 0\n" /* replacement len */ + " .byte 0\n" /* pad len */ ".previous\n" /* skipping size check since replacement size = 0 */ : : "i" (bit) : : t_no); @@ -457,6 +461,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) " .word %P1\n" /* feature bit */ " .byte 2b - 1b\n" /* source len */ " .byte 4f - 3f\n" /* replacement len */ + " .byte 0\n" /* pad len */ ".previous\n" ".section .discard,\"aw\",@progbits\n" " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ @@ -483,31 +488,30 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) static __always_inline __pure bool _static_cpu_has_safe(u16 bit) { #ifdef CC_HAVE_ASM_GOTO -/* - * We need to spell the jumps to the compiler because, depending on the offset, - * the replacement jump can be bigger than the original jump, and this we cannot - * have. Thus, we force the jump to the widest, 4-byte, signed relative - * offset even though the last would often fit in less bytes. - */ - asm_volatile_goto("1: .byte 0xe9\n .long %l[t_dynamic] - 2f\n" + asm_volatile_goto("1: jmp %l[t_dynamic]\n" "2:\n" + ".skip -(((5f-4f) - (2b-1b)) > 0) * " + "((5f-4f) - (2b-1b)),0x90\n" + "3:\n" ".section .altinstructions,\"a\"\n" " .long 1b - .\n" /* src offset */ - " .long 3f - .\n" /* repl offset */ + " .long 4f - .\n" /* repl offset */ " .word %P1\n" /* always replace */ - " .byte 2b - 1b\n" /* src len */ - " .byte 4f - 3f\n" /* repl len */ + " .byte 3b - 1b\n" /* src len */ + " .byte 5f - 4f\n" /* repl len */ + " .byte 3b - 2b\n" /* pad len */ ".previous\n" ".section .altinstr_replacement,\"ax\"\n" - "3: .byte 0xe9\n .long %l[t_no] - 2b\n" - "4:\n" + "4: jmp %l[t_no]\n" + "5:\n" ".previous\n" ".section .altinstructions,\"a\"\n" " .long 1b - .\n" /* src offset */ " .long 0\n" /* no replacement */ " .word %P0\n" /* feature bit */ - " .byte 2b - 1b\n" /* src len */ + " .byte 3b - 1b\n" /* src len */ " .byte 0\n" /* repl len */ + " .byte 0\n" /* pad len */ ".previous\n" : : "i" (bit), "i" (X86_FEATURE_ALWAYS) : : t_dynamic, t_no); @@ -527,6 +531,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) " .word %P2\n" /* always replace */ " .byte 2b - 1b\n" /* source len */ " .byte 4f - 3f\n" /* replacement len */ + " .byte 0\n" /* pad len */ ".previous\n" ".section .discard,\"aw\",@progbits\n" " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ @@ -541,6 +546,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) " .word %P1\n" /* feature bit */ " .byte 4b - 3b\n" /* src len */ " .byte 6f - 5f\n" /* repl len */ + " .byte 0\n" /* pad len */ ".previous\n" ".section .discard,\"aw\",@progbits\n" " .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index a94b82e8f156..a0bf89fd2647 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -376,11 +376,16 @@ static inline void _set_gate(int gate, unsigned type, void *addr, * Pentium F0 0F bugfix can have resulted in the mapped * IDT being write-protected. */ -#define set_intr_gate(n, addr) \ +#define set_intr_gate_notrace(n, addr) \ do { \ BUG_ON((unsigned)n > 0xFF); \ _set_gate(n, GATE_INTERRUPT, (void *)addr, 0, 0, \ __KERNEL_CS); \ + } while (0) + +#define set_intr_gate(n, addr) \ + do { \ + set_intr_gate_notrace(n, addr); \ _trace_set_gate(n, GATE_INTERRUPT, (void *)trace_##addr,\ 0, 0, __KERNEL_CS); \ } while (0) diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h index f6f15986df6c..de1cdaf4d743 100644 --- a/arch/x86/include/asm/dwarf2.h +++ b/arch/x86/include/asm/dwarf2.h @@ -86,11 +86,23 @@ CFI_ADJUST_CFA_OFFSET 8 .endm + .macro pushq_cfi_reg reg + pushq %\reg + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET \reg, 0 + .endm + .macro popq_cfi reg popq \reg CFI_ADJUST_CFA_OFFSET -8 .endm + .macro popq_cfi_reg reg + popq %\reg + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE \reg + .endm + .macro pushfq_cfi pushfq CFI_ADJUST_CFA_OFFSET 8 @@ -116,11 +128,23 @@ CFI_ADJUST_CFA_OFFSET 4 .endm + .macro pushl_cfi_reg reg + pushl %\reg + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET \reg, 0 + .endm + .macro popl_cfi reg popl \reg CFI_ADJUST_CFA_OFFSET -4 .endm + .macro popl_cfi_reg reg + popl %\reg + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE \reg + .endm + .macro pushfl_cfi pushfl CFI_ADJUST_CFA_OFFSET 4 diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index ca3347a9dab5..3563107b5060 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -171,10 +171,11 @@ do { \ static inline void elf_common_init(struct thread_struct *t, struct pt_regs *regs, const u16 ds) { - regs->ax = regs->bx = regs->cx = regs->dx = 0; - regs->si = regs->di = regs->bp = 0; + /* Commented-out registers are cleared in stub_execve */ + /*regs->ax = regs->bx =*/ regs->cx = regs->dx = 0; + regs->si = regs->di /*= regs->bp*/ = 0; regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0; - regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; + /*regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;*/ t->fs = t->gs = 0; t->fsindex = t->gsindex = 0; t->ds = t->es = ds; diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 9662290e0b20..e9571ddabc4f 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -181,10 +181,9 @@ extern __visible void smp_call_function_single_interrupt(struct pt_regs *); extern __visible void smp_invalidate_interrupt(struct pt_regs *); #endif -extern void (*__initconst interrupt[FIRST_SYSTEM_VECTOR - - FIRST_EXTERNAL_VECTOR])(void); +extern char irq_entries_start[]; #ifdef CONFIG_TRACING -#define trace_interrupt interrupt +#define trace_irq_entries_start irq_entries_start #endif #define VECTOR_UNDEFINED (-1) diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index 47f29b1d1846..e7814b74caf8 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -69,7 +69,7 @@ struct insn { const insn_byte_t *next_byte; }; -#define MAX_INSN_SIZE 16 +#define MAX_INSN_SIZE 15 #define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6) #define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3) diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 0a8b519226b8..b77f5edb03b0 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -136,10 +136,6 @@ static inline notrace unsigned long arch_local_irq_save(void) #define USERGS_SYSRET32 \ swapgs; \ sysretl -#define ENABLE_INTERRUPTS_SYSEXIT32 \ - swapgs; \ - sti; \ - sysexit #else #define INTERRUPT_RETURN iret @@ -163,22 +159,27 @@ static inline int arch_irqs_disabled(void) return arch_irqs_disabled_flags(flags); } +#endif /* !__ASSEMBLY__ */ +#ifdef __ASSEMBLY__ +#ifdef CONFIG_TRACE_IRQFLAGS +# define TRACE_IRQS_ON call trace_hardirqs_on_thunk; +# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk; #else - -#ifdef CONFIG_X86_64 -#define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk -#define ARCH_LOCKDEP_SYS_EXIT_IRQ \ +# define TRACE_IRQS_ON +# define TRACE_IRQS_OFF +#endif +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# ifdef CONFIG_X86_64 +# define LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk +# define LOCKDEP_SYS_EXIT_IRQ \ TRACE_IRQS_ON; \ sti; \ - SAVE_REST; \ - LOCKDEP_SYS_EXIT; \ - RESTORE_REST; \ + call lockdep_sys_exit_thunk; \ cli; \ TRACE_IRQS_OFF; - -#else -#define ARCH_LOCKDEP_SYS_EXIT \ +# else +# define LOCKDEP_SYS_EXIT \ pushl %eax; \ pushl %ecx; \ pushl %edx; \ @@ -186,24 +187,12 @@ static inline int arch_irqs_disabled(void) popl %edx; \ popl %ecx; \ popl %eax; - -#define ARCH_LOCKDEP_SYS_EXIT_IRQ -#endif - -#ifdef CONFIG_TRACE_IRQFLAGS -# define TRACE_IRQS_ON call trace_hardirqs_on_thunk; -# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk; +# define LOCKDEP_SYS_EXIT_IRQ +# endif #else -# define TRACE_IRQS_ON -# define TRACE_IRQS_OFF -#endif -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define LOCKDEP_SYS_EXIT ARCH_LOCKDEP_SYS_EXIT -# define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ -# else # define LOCKDEP_SYS_EXIT # define LOCKDEP_SYS_EXIT_IRQ -# endif - +#endif #endif /* __ASSEMBLY__ */ + #endif diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 965c47d254aa..5f6051d5d139 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -976,11 +976,6 @@ extern void default_banner(void); PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \ CLBR_NONE, \ jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64)) - -#define ENABLE_INTERRUPTS_SYSEXIT32 \ - PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \ - CLBR_NONE, \ - jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit)) #endif /* CONFIG_X86_32 */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index ec1c93588cef..d2203b5d9538 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -210,8 +210,23 @@ struct x86_hw_tss { unsigned long sp0; unsigned short ss0, __ss0h; unsigned long sp1; - /* ss1 caches MSR_IA32_SYSENTER_CS: */ - unsigned short ss1, __ss1h; + + /* + * We don't use ring 1, so ss1 is a convenient scratch space in + * the same cacheline as sp0. We use ss1 to cache the value in + * MSR_IA32_SYSENTER_CS. When we context switch + * MSR_IA32_SYSENTER_CS, we first check if the new value being + * written matches ss1, and, if it's not, then we wrmsr the new + * value and update ss1. + * + * The only reason we context switch MSR_IA32_SYSENTER_CS is + * that we set it to zero in vm86 tasks to avoid corrupting the + * stack if we were to go through the sysenter path from vm86 + * mode. + */ + unsigned short ss1; /* MSR_IA32_SYSENTER_CS */ + + unsigned short __ss1h; unsigned long sp2; unsigned short ss2, __ss2h; unsigned long __cr3; @@ -276,13 +291,17 @@ struct tss_struct { unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; /* - * .. and then another 0x100 bytes for the emergency kernel stack: + * Space for the temporary SYSENTER stack: */ - unsigned long stack[64]; + unsigned long SYSENTER_stack[64]; } ____cacheline_aligned; -DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss); +DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); + +#ifdef CONFIG_X86_32 +DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); +#endif /* * Save the original ist values for checking stack pointers during debugging @@ -474,7 +493,6 @@ struct thread_struct { #ifdef CONFIG_X86_32 unsigned long sysenter_cs; #else - unsigned long usersp; /* Copy from PDA */ unsigned short es; unsigned short ds; unsigned short fsindex; @@ -564,6 +582,16 @@ static inline void native_swapgs(void) #endif } +static inline unsigned long current_top_of_stack(void) +{ +#ifdef CONFIG_X86_64 + return this_cpu_read_stable(cpu_tss.x86_tss.sp0); +#else + /* sp0 on x86_32 is special in and around vm86 mode. */ + return this_cpu_read_stable(cpu_current_top_of_stack); +#endif +} + #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #else @@ -761,10 +789,10 @@ extern char ignore_fpu_irq; #define ARCH_HAS_SPINLOCK_PREFETCH #ifdef CONFIG_X86_32 -# define BASE_PREFETCH ASM_NOP4 +# define BASE_PREFETCH "" # define ARCH_HAS_PREFETCH #else -# define BASE_PREFETCH "prefetcht0 (%1)" +# define BASE_PREFETCH "prefetcht0 %P1" #endif /* @@ -775,10 +803,9 @@ extern char ignore_fpu_irq; */ static inline void prefetch(const void *x) { - alternative_input(BASE_PREFETCH, - "prefetchnta (%1)", + alternative_input(BASE_PREFETCH, "prefetchnta %P1", X86_FEATURE_XMM, - "r" (x)); + "m" (*(const char *)x)); } /* @@ -788,10 +815,9 @@ static inline void prefetch(const void *x) */ static inline void prefetchw(const void *x) { - alternative_input(BASE_PREFETCH, - "prefetchw (%1)", - X86_FEATURE_3DNOW, - "r" (x)); + alternative_input(BASE_PREFETCH, "prefetchw %P1", + X86_FEATURE_3DNOWPREFETCH, + "m" (*(const char *)x)); } static inline void spin_lock_prefetch(const void *x) @@ -799,6 +825,9 @@ static inline void spin_lock_prefetch(const void *x) prefetchw(x); } +#define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \ + TOP_OF_KERNEL_STACK_PADDING) + #ifdef CONFIG_X86_32 /* * User space process size: 3GB (default). @@ -809,39 +838,16 @@ static inline void spin_lock_prefetch(const void *x) #define STACK_TOP_MAX STACK_TOP #define INIT_THREAD { \ - .sp0 = sizeof(init_stack) + (long)&init_stack, \ + .sp0 = TOP_OF_INIT_STACK, \ .vm86_info = NULL, \ .sysenter_cs = __KERNEL_CS, \ .io_bitmap_ptr = NULL, \ } -/* - * Note that the .io_bitmap member must be extra-big. This is because - * the CPU will access an additional byte beyond the end of the IO - * permission bitmap. The extra byte must be all 1 bits, and must - * be within the limit. - */ -#define INIT_TSS { \ - .x86_tss = { \ - .sp0 = sizeof(init_stack) + (long)&init_stack, \ - .ss0 = __KERNEL_DS, \ - .ss1 = __KERNEL_CS, \ - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \ - }, \ - .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \ -} - extern unsigned long thread_saved_pc(struct task_struct *tsk); -#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long)) -#define KSTK_TOP(info) \ -({ \ - unsigned long *__ptr = (unsigned long *)(info); \ - (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \ -}) - /* - * The below -8 is to reserve 8 bytes on top of the ring0 stack. + * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack. * This is necessary to guarantee that the entire "struct pt_regs" * is accessible even if the CPU haven't stored the SS/ESP registers * on the stack (interrupt gate does not save these registers @@ -850,11 +856,11 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); * "struct pt_regs" is possible, but they may contain the * completely wrong values. */ -#define task_pt_regs(task) \ -({ \ - struct pt_regs *__regs__; \ - __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \ - __regs__ - 1; \ +#define task_pt_regs(task) \ +({ \ + unsigned long __ptr = (unsigned long)task_stack_page(task); \ + __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \ + ((struct pt_regs *)__ptr) - 1; \ }) #define KSTK_ESP(task) (task_pt_regs(task)->sp) @@ -886,11 +892,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); #define STACK_TOP_MAX TASK_SIZE_MAX #define INIT_THREAD { \ - .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ -} - -#define INIT_TSS { \ - .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ + .sp0 = TOP_OF_INIT_STACK \ } /* @@ -902,11 +904,6 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); #define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) extern unsigned long KSTK_ESP(struct task_struct *task); -/* - * User space RSP while inside the SYSCALL fast path - */ -DECLARE_PER_CPU(unsigned long, old_rsp); - #endif /* CONFIG_X86_64 */ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 86fc2bb82287..19507ffa5d28 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -31,13 +31,17 @@ struct pt_regs { #else /* __i386__ */ struct pt_regs { +/* + * C ABI says these regs are callee-preserved. They aren't saved on kernel entry + * unless syscall needs a complete, fully filled "struct pt_regs". + */ unsigned long r15; unsigned long r14; unsigned long r13; unsigned long r12; unsigned long bp; unsigned long bx; -/* arguments: non interrupts/non tracing syscalls only save up to here*/ +/* These regs are callee-clobbered. Always saved on kernel entry. */ unsigned long r11; unsigned long r10; unsigned long r9; @@ -47,9 +51,12 @@ struct pt_regs { unsigned long dx; unsigned long si; unsigned long di; +/* + * On syscall entry, this is syscall#. On CPU exception, this is error code. + * On hw interrupt, it's IRQ number: + */ unsigned long orig_ax; -/* end of arguments */ -/* cpu exception frame or undefined */ +/* Return frame for iretq */ unsigned long ip; unsigned long cs; unsigned long flags; @@ -89,11 +96,13 @@ static inline unsigned long regs_return_value(struct pt_regs *regs) } /* - * user_mode_vm(regs) determines whether a register set came from user mode. - * This is true if V8086 mode was enabled OR if the register set was from - * protected mode with RPL-3 CS value. This tricky test checks that with - * one comparison. Many places in the kernel can bypass this full check - * if they have already ruled out V8086 mode, so user_mode(regs) can be used. + * user_mode(regs) determines whether a register set came from user + * mode. On x86_32, this is true if V8086 mode was enabled OR if the + * register set was from protected mode with RPL-3 CS value. This + * tricky test checks that with one comparison. + * + * On x86_64, vm86 mode is mercifully nonexistent, and we don't need + * the extra check. */ static inline int user_mode(struct pt_regs *regs) { @@ -104,16 +113,6 @@ static inline int user_mode(struct pt_regs *regs) #endif } -static inline int user_mode_vm(struct pt_regs *regs) -{ -#ifdef CONFIG_X86_32 - return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >= - USER_RPL; -#else - return user_mode(regs); -#endif -} - static inline int v8086_mode(struct pt_regs *regs) { #ifdef CONFIG_X86_32 @@ -138,12 +137,8 @@ static inline bool user_64bit_mode(struct pt_regs *regs) #endif } -#define current_user_stack_pointer() this_cpu_read(old_rsp) -/* ia32 vs. x32 difference */ -#define compat_user_stack_pointer() \ - (test_thread_flag(TIF_IA32) \ - ? current_pt_regs()->sp \ - : this_cpu_read(old_rsp)) +#define current_user_stack_pointer() current_pt_regs()->sp +#define compat_user_stack_pointer() current_pt_regs()->sp #endif #ifdef CONFIG_X86_32 @@ -248,7 +243,7 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, */ #define arch_ptrace_stop_needed(code, info) \ ({ \ - set_thread_flag(TIF_NOTIFY_RESUME); \ + force_iret(); \ false; \ }) diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index db257a58571f..5a9856eb12ba 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -3,8 +3,10 @@ #include <linux/const.h> -/* Constructor for a conventional segment GDT (or LDT) entry */ -/* This is a macro so it can be used in initializers */ +/* + * Constructor for a conventional segment GDT (or LDT) entry. + * This is a macro so it can be used in initializers. + */ #define GDT_ENTRY(flags, base, limit) \ ((((base) & _AC(0xff000000,ULL)) << (56-24)) | \ (((flags) & _AC(0x0000f0ff,ULL)) << 40) | \ @@ -12,198 +14,228 @@ (((base) & _AC(0x00ffffff,ULL)) << 16) | \ (((limit) & _AC(0x0000ffff,ULL)))) -/* Simple and small GDT entries for booting only */ +/* Simple and small GDT entries for booting only: */ #define GDT_ENTRY_BOOT_CS 2 -#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8) +#define GDT_ENTRY_BOOT_DS 3 +#define GDT_ENTRY_BOOT_TSS 4 +#define __BOOT_CS (GDT_ENTRY_BOOT_CS*8) +#define __BOOT_DS (GDT_ENTRY_BOOT_DS*8) +#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS*8) + +/* + * Bottom two bits of selector give the ring + * privilege level + */ +#define SEGMENT_RPL_MASK 0x3 -#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1) -#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8) +/* User mode is privilege level 3: */ +#define USER_RPL 0x3 -#define GDT_ENTRY_BOOT_TSS (GDT_ENTRY_BOOT_CS + 2) -#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS * 8) +/* Bit 2 is Table Indicator (TI): selects between LDT or GDT */ +#define SEGMENT_TI_MASK 0x4 +/* LDT segment has TI set ... */ +#define SEGMENT_LDT 0x4 +/* ... GDT has it cleared */ +#define SEGMENT_GDT 0x0 -#define SEGMENT_RPL_MASK 0x3 /* - * Bottom two bits of selector give the ring - * privilege level - */ -#define SEGMENT_TI_MASK 0x4 /* Bit 2 is table indicator (LDT/GDT) */ -#define USER_RPL 0x3 /* User mode is privilege level 3 */ -#define SEGMENT_LDT 0x4 /* LDT segment has TI set... */ -#define SEGMENT_GDT 0x0 /* ... GDT has it cleared */ +#define GDT_ENTRY_INVALID_SEG 0 #ifdef CONFIG_X86_32 /* * The layout of the per-CPU GDT under Linux: * - * 0 - null + * 0 - null <=== cacheline #1 * 1 - reserved * 2 - reserved * 3 - reserved * - * 4 - unused <==== new cacheline + * 4 - unused <=== cacheline #2 * 5 - unused * * ------- start of TLS (Thread-Local Storage) segments: * * 6 - TLS segment #1 [ glibc's TLS segment ] * 7 - TLS segment #2 [ Wine's %fs Win32 segment ] - * 8 - TLS segment #3 + * 8 - TLS segment #3 <=== cacheline #3 * 9 - reserved * 10 - reserved * 11 - reserved * * ------- start of kernel segments: * - * 12 - kernel code segment <==== new cacheline + * 12 - kernel code segment <=== cacheline #4 * 13 - kernel data segment * 14 - default user CS * 15 - default user DS - * 16 - TSS + * 16 - TSS <=== cacheline #5 * 17 - LDT * 18 - PNPBIOS support (16->32 gate) * 19 - PNPBIOS support - * 20 - PNPBIOS support + * 20 - PNPBIOS support <=== cacheline #6 * 21 - PNPBIOS support * 22 - PNPBIOS support * 23 - APM BIOS support - * 24 - APM BIOS support + * 24 - APM BIOS support <=== cacheline #7 * 25 - APM BIOS support * * 26 - ESPFIX small SS * 27 - per-cpu [ offset to per-cpu data area ] - * 28 - stack_canary-20 [ for stack protector ] + * 28 - stack_canary-20 [ for stack protector ] <=== cacheline #8 * 29 - unused * 30 - unused * 31 - TSS for double fault handler */ -#define GDT_ENTRY_TLS_MIN 6 -#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1) +#define GDT_ENTRY_TLS_MIN 6 +#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1) +#define GDT_ENTRY_KERNEL_CS 12 +#define GDT_ENTRY_KERNEL_DS 13 #define GDT_ENTRY_DEFAULT_USER_CS 14 - #define GDT_ENTRY_DEFAULT_USER_DS 15 +#define GDT_ENTRY_TSS 16 +#define GDT_ENTRY_LDT 17 +#define GDT_ENTRY_PNPBIOS_CS32 18 +#define GDT_ENTRY_PNPBIOS_CS16 19 +#define GDT_ENTRY_PNPBIOS_DS 20 +#define GDT_ENTRY_PNPBIOS_TS1 21 +#define GDT_ENTRY_PNPBIOS_TS2 22 +#define GDT_ENTRY_APMBIOS_BASE 23 + +#define GDT_ENTRY_ESPFIX_SS 26 +#define GDT_ENTRY_PERCPU 27 +#define GDT_ENTRY_STACK_CANARY 28 + +#define GDT_ENTRY_DOUBLEFAULT_TSS 31 -#define GDT_ENTRY_KERNEL_BASE (12) +/* + * Number of entries in the GDT table: + */ +#define GDT_ENTRIES 32 -#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE+0) +/* + * Segment selector values corresponding to the above entries: + */ -#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE+1) +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8) +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3) +#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8) -#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE+4) -#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE+5) +/* segment for calling fn: */ +#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32*8) +/* code segment for BIOS: */ +#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16*8) -#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE+6) -#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE+11) +/* "Is this PNP code selector (PNP_CS32 or PNP_CS16)?" */ +#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == PNP_CS32) -#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE+14) -#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8) +/* data segment for BIOS: */ +#define PNP_DS (GDT_ENTRY_PNPBIOS_DS*8) +/* transfer data segment: */ +#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1*8) +/* another data segment: */ +#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2*8) -#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE+15) #ifdef CONFIG_SMP -#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8) +# define __KERNEL_PERCPU (GDT_ENTRY_PERCPU*8) #else -#define __KERNEL_PERCPU 0 +# define __KERNEL_PERCPU 0 #endif -#define GDT_ENTRY_STACK_CANARY (GDT_ENTRY_KERNEL_BASE+16) #ifdef CONFIG_CC_STACKPROTECTOR -#define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY*8) +# define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY*8) #else -#define __KERNEL_STACK_CANARY 0 +# define __KERNEL_STACK_CANARY 0 #endif -#define GDT_ENTRY_DOUBLEFAULT_TSS 31 - -/* - * The GDT has 32 entries - */ -#define GDT_ENTRIES 32 +#else /* 64-bit: */ -/* The PnP BIOS entries in the GDT */ -#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0) -#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1) -#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2) -#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3) -#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4) - -/* The PnP BIOS selectors */ -#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */ -#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */ -#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */ -#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */ -#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */ +#include <asm/cache.h> +#define GDT_ENTRY_KERNEL32_CS 1 +#define GDT_ENTRY_KERNEL_CS 2 +#define GDT_ENTRY_KERNEL_DS 3 /* - * Matching rules for certain types of segments. + * We cannot use the same code segment descriptor for user and kernel mode, + * not even in long flat mode, because of different DPL. + * + * GDT layout to get 64-bit SYSCALL/SYSRET support right. SYSRET hardcodes + * selectors: + * + * if returning to 32-bit userspace: cs = STAR.SYSRET_CS, + * if returning to 64-bit userspace: cs = STAR.SYSRET_CS+16, + * + * ss = STAR.SYSRET_CS+8 (in either case) + * + * thus USER_DS should be between 32-bit and 64-bit code selectors: */ +#define GDT_ENTRY_DEFAULT_USER32_CS 4 +#define GDT_ENTRY_DEFAULT_USER_DS 5 +#define GDT_ENTRY_DEFAULT_USER_CS 6 -/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */ -#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8) - +/* Needs two entries */ +#define GDT_ENTRY_TSS 8 +/* Needs two entries */ +#define GDT_ENTRY_LDT 10 -#else -#include <asm/cache.h> - -#define GDT_ENTRY_KERNEL32_CS 1 -#define GDT_ENTRY_KERNEL_CS 2 -#define GDT_ENTRY_KERNEL_DS 3 +#define GDT_ENTRY_TLS_MIN 12 +#define GDT_ENTRY_TLS_MAX 14 -#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS * 8) +/* Abused to load per CPU data from limit */ +#define GDT_ENTRY_PER_CPU 15 /* - * we cannot use the same code segment descriptor for user and kernel - * -- not even in the long flat mode, because of different DPL /kkeil - * The segment offset needs to contain a RPL. Grr. -AK - * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets) + * Number of entries in the GDT table: */ -#define GDT_ENTRY_DEFAULT_USER32_CS 4 -#define GDT_ENTRY_DEFAULT_USER_DS 5 -#define GDT_ENTRY_DEFAULT_USER_CS 6 -#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8+3) -#define __USER32_DS __USER_DS - -#define GDT_ENTRY_TSS 8 /* needs two entries */ -#define GDT_ENTRY_LDT 10 /* needs two entries */ -#define GDT_ENTRY_TLS_MIN 12 -#define GDT_ENTRY_TLS_MAX 14 - -#define GDT_ENTRY_PER_CPU 15 /* Abused to load per CPU data from limit */ -#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU * 8 + 3) +#define GDT_ENTRIES 16 -/* TLS indexes for 64bit - hardcoded in arch_prctl */ -#define FS_TLS 0 -#define GS_TLS 1 - -#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3) -#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3) - -#define GDT_ENTRIES 16 +/* + * Segment selector values corresponding to the above entries: + * + * Note, selectors also need to have a correct RPL, + * expressed with the +3 value for user-space selectors: + */ +#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS*8) +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8) +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) +#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8 + 3) +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) +#define __USER32_DS __USER_DS +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3) +#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU*8 + 3) + +/* TLS indexes for 64-bit - hardcoded in arch_prctl(): */ +#define FS_TLS 0 +#define GS_TLS 1 + +#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3) +#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3) #endif -#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8) -#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) -#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8+3) -#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8+3) #ifndef CONFIG_PARAVIRT -#define get_kernel_rpl() 0 +# define get_kernel_rpl() 0 #endif -#define IDT_ENTRIES 256 -#define NUM_EXCEPTION_VECTORS 32 -/* Bitmask of exception vectors which push an error code on the stack */ -#define EXCEPTION_ERRCODE_MASK 0x00027d00 -#define GDT_SIZE (GDT_ENTRIES * 8) -#define GDT_ENTRY_TLS_ENTRIES 3 -#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8) +#define IDT_ENTRIES 256 +#define NUM_EXCEPTION_VECTORS 32 + +/* Bitmask of exception vectors which push an error code on the stack: */ +#define EXCEPTION_ERRCODE_MASK 0x00027d00 + +#define GDT_SIZE (GDT_ENTRIES*8) +#define GDT_ENTRY_TLS_ENTRIES 3 +#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES* 8) #ifdef __KERNEL__ #ifndef __ASSEMBLY__ + extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][2+2+5]; #ifdef CONFIG_TRACING -#define trace_early_idt_handlers early_idt_handlers +# define trace_early_idt_handlers early_idt_handlers #endif /* @@ -228,37 +260,30 @@ do { \ } while (0) /* - * Save a segment register away + * Save a segment register away: */ #define savesegment(seg, value) \ asm("mov %%" #seg ",%0":"=r" (value) : : "memory") /* - * x86_32 user gs accessors. + * x86-32 user GS accessors: */ #ifdef CONFIG_X86_32 -#ifdef CONFIG_X86_32_LAZY_GS -#define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;}) -#define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) -#define task_user_gs(tsk) ((tsk)->thread.gs) -#define lazy_save_gs(v) savesegment(gs, (v)) -#define lazy_load_gs(v) loadsegment(gs, (v)) -#else /* X86_32_LAZY_GS */ -#define get_user_gs(regs) (u16)((regs)->gs) -#define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0) -#define task_user_gs(tsk) (task_pt_regs(tsk)->gs) -#define lazy_save_gs(v) do { } while (0) -#define lazy_load_gs(v) do { } while (0) -#endif /* X86_32_LAZY_GS */ +# ifdef CONFIG_X86_32_LAZY_GS +# define get_user_gs(regs) (u16)({ unsigned long v; savesegment(gs, v); v; }) +# define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) +# define task_user_gs(tsk) ((tsk)->thread.gs) +# define lazy_save_gs(v) savesegment(gs, (v)) +# define lazy_load_gs(v) loadsegment(gs, (v)) +# else /* X86_32_LAZY_GS */ +# define get_user_gs(regs) (u16)((regs)->gs) +# define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0) +# define task_user_gs(tsk) (task_pt_regs(tsk)->gs) +# define lazy_save_gs(v) do { } while (0) +# define lazy_load_gs(v) do { } while (0) +# endif /* X86_32_LAZY_GS */ #endif /* X86_32 */ -static inline unsigned long get_limit(unsigned long segment) -{ - unsigned long __limit; - asm("lsll %1,%0" : "=r" (__limit) : "r" (segment)); - return __limit + 1; -} - #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index ff4e7b236e21..f69e06b283fb 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -66,6 +66,11 @@ static inline void x86_ce4100_early_setup(void) { } */ extern struct boot_params boot_params; +static inline bool kaslr_enabled(void) +{ + return !!(boot_params.hdr.loadflags & KASLR_FLAG); +} + /* * Do NOT EVER look at the BIOS memory size location. * It does not work on many machines. diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h index 9dfce4e0417d..6fe6b182c998 100644 --- a/arch/x86/include/asm/sigcontext.h +++ b/arch/x86/include/asm/sigcontext.h @@ -57,9 +57,9 @@ struct sigcontext { unsigned long ip; unsigned long flags; unsigned short cs; - unsigned short gs; - unsigned short fs; - unsigned short __pad0; + unsigned short __pad2; /* Was called gs, but was always zero. */ + unsigned short __pad1; /* Was called fs, but was always zero. */ + unsigned short ss; unsigned long err; unsigned long trapno; unsigned long oldmask; diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h index 7a958164088c..89db46752a8f 100644 --- a/arch/x86/include/asm/sighandling.h +++ b/arch/x86/include/asm/sighandling.h @@ -13,9 +13,7 @@ X86_EFLAGS_CF | X86_EFLAGS_RF) void signal_fault(struct pt_regs *regs, void __user *frame, char *where); - -int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, - unsigned long *pax); +int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc); int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned long mask); diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index 8d3120f4e270..ba665ebd17bb 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -27,23 +27,11 @@ #ifdef CONFIG_X86_SMAP -#define ASM_CLAC \ - 661: ASM_NOP3 ; \ - .pushsection .altinstr_replacement, "ax" ; \ - 662: __ASM_CLAC ; \ - .popsection ; \ - .pushsection .altinstructions, "a" ; \ - altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \ - .popsection - -#define ASM_STAC \ - 661: ASM_NOP3 ; \ - .pushsection .altinstr_replacement, "ax" ; \ - 662: __ASM_STAC ; \ - .popsection ; \ - .pushsection .altinstructions, "a" ; \ - altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \ - .popsection +#define ASM_CLAC \ + ALTERNATIVE "", __stringify(__ASM_CLAC), X86_FEATURE_SMAP + +#define ASM_STAC \ + ALTERNATIVE "", __stringify(__ASM_STAC), X86_FEATURE_SMAP #else /* CONFIG_X86_SMAP */ @@ -61,20 +49,20 @@ static __always_inline void clac(void) { /* Note: a barrier is implicit in alternative() */ - alternative(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP); + alternative("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP); } static __always_inline void stac(void) { /* Note: a barrier is implicit in alternative() */ - alternative(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP); + alternative("", __stringify(__ASM_STAC), X86_FEATURE_SMAP); } /* These macros can be used in asm() statements */ #define ASM_CLAC \ - ALTERNATIVE(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP) + ALTERNATIVE("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP) #define ASM_STAC \ - ALTERNATIVE(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP) + ALTERNATIVE("", __stringify(__ASM_STAC), X86_FEATURE_SMAP) #else /* CONFIG_X86_SMAP */ diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 8cd1cc3bc835..81d02fc7dafa 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -154,6 +154,7 @@ void cpu_die_common(unsigned int cpu); void native_smp_prepare_boot_cpu(void); void native_smp_prepare_cpus(unsigned int max_cpus); void native_smp_cpus_done(unsigned int max_cpus); +void common_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_cpu_disable(void); void native_cpu_die(unsigned int cpu); diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 6a4b00fafb00..aeb4666e0c0a 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -4,6 +4,8 @@ #ifdef __KERNEL__ +#include <asm/nops.h> + static inline void native_clts(void) { asm volatile("clts"); @@ -199,6 +201,28 @@ static inline void clflushopt(volatile void *__p) "+m" (*(volatile char __force *)__p)); } +static inline void clwb(volatile void *__p) +{ + volatile struct { char x[64]; } *p = __p; + + asm volatile(ALTERNATIVE_2( + ".byte " __stringify(NOP_DS_PREFIX) "; clflush (%[pax])", + ".byte 0x66; clflush (%[pax])", /* clflushopt (%%rax) */ + X86_FEATURE_CLFLUSHOPT, + ".byte 0x66, 0x0f, 0xae, 0x30", /* clwb (%%rax) */ + X86_FEATURE_CLWB) + : [p] "+m" (*p) + : [pax] "a" (p)); +} + +static inline void pcommit_sfence(void) +{ + alternative(ASM_NOP7, + ".byte 0x66, 0x0f, 0xae, 0xf8\n\t" /* pcommit */ + "sfence", + X86_FEATURE_PCOMMIT); +} + #define nop() asm volatile ("nop") diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 1d4e4f279a32..ea2dbe82cba3 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -13,6 +13,33 @@ #include <asm/types.h> /* + * TOP_OF_KERNEL_STACK_PADDING is a number of unused bytes that we + * reserve at the top of the kernel stack. We do it because of a nasty + * 32-bit corner case. On x86_32, the hardware stack frame is + * variable-length. Except for vm86 mode, struct pt_regs assumes a + * maximum-length frame. If we enter from CPL 0, the top 8 bytes of + * pt_regs don't actually exist. Ordinarily this doesn't matter, but it + * does in at least one case: + * + * If we take an NMI early enough in SYSENTER, then we can end up with + * pt_regs that extends above sp0. On the way out, in the espfix code, + * we can read the saved SS value, but that value will be above sp0. + * Without this offset, that can result in a page fault. (We are + * careful that, in this case, the value we read doesn't matter.) + * + * In vm86 mode, the hardware frame is much longer still, but we neither + * access the extra members from NMI context, nor do we write such a + * frame at sp0 at all. + * + * x86_64 has a fixed-length stack frame. + */ +#ifdef CONFIG_X86_32 +# define TOP_OF_KERNEL_STACK_PADDING 8 +#else +# define TOP_OF_KERNEL_STACK_PADDING 0 +#endif + +/* * low level task data that entry.S needs immediate access to * - this struct should fit entirely inside of one cache line * - this struct shares the supervisor stack pages @@ -145,7 +172,6 @@ struct thread_info { #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) #define STACK_WARN (THREAD_SIZE/8) -#define KERNEL_STACK_OFFSET (5*(BITS_PER_LONG/8)) /* * macros/functions for gaining access to the thread information structure @@ -158,10 +184,7 @@ DECLARE_PER_CPU(unsigned long, kernel_stack); static inline struct thread_info *current_thread_info(void) { - struct thread_info *ti; - ti = (void *)(this_cpu_read_stable(kernel_stack) + - KERNEL_STACK_OFFSET - THREAD_SIZE); - return ti; + return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE); } static inline unsigned long current_stack_pointer(void) @@ -177,16 +200,37 @@ static inline unsigned long current_stack_pointer(void) #else /* !__ASSEMBLY__ */ -/* how to get the thread information struct from ASM */ +/* Load thread_info address into "reg" */ #define GET_THREAD_INFO(reg) \ _ASM_MOV PER_CPU_VAR(kernel_stack),reg ; \ - _ASM_SUB $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg ; + _ASM_SUB $(THREAD_SIZE),reg ; /* - * Same if PER_CPU_VAR(kernel_stack) is, perhaps with some offset, already in - * a certain register (to be used in assembler memory operands). + * ASM operand which evaluates to a 'thread_info' address of + * the current task, if it is known that "reg" is exactly "off" + * bytes below the top of the stack currently. + * + * ( The kernel stack's size is known at build time, it is usually + * 2 or 4 pages, and the bottom of the kernel stack contains + * the thread_info structure. So to access the thread_info very + * quickly from assembly code we can calculate down from the + * top of the kernel stack to the bottom, using constant, + * build-time calculations only. ) + * + * For example, to fetch the current thread_info->flags value into %eax + * on x86-64 defconfig kernels, in syscall entry code where RSP is + * currently at exactly SIZEOF_PTREGS bytes away from the top of the + * stack: + * + * mov ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS), %eax + * + * will translate to: + * + * 8b 84 24 b8 c0 ff ff mov -0x3f48(%rsp), %eax + * + * which is below the current RSP by almost 16K. */ -#define THREAD_INFO(reg, off) KERNEL_STACK_OFFSET+(off)-THREAD_SIZE(reg) +#define ASM_THREAD_INFO(field, reg, off) ((field)+(off)-THREAD_SIZE)(reg) #endif @@ -236,6 +280,16 @@ static inline bool is_ia32_task(void) #endif return false; } + +/* + * Force syscall return via IRET by making it look as if there was + * some work pending. IRET is our most capable (but slowest) syscall + * return path, which is able to restore modified SS, CS and certain + * EFLAGS values that other (fast) syscall return instructions + * are not able to restore properly. + */ +#define force_iret() set_thread_flag(TIF_NOTIFY_RESUME) + #endif /* !__ASSEMBLY__ */ #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 225b0988043a..ab456dc233b5 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -15,6 +15,7 @@ /* loadflags */ #define LOADED_HIGH (1<<0) +#define KASLR_FLAG (1<<1) #define QUIET_FLAG (1<<5) #define KEEP_SEGMENTS (1<<6) #define CAN_USE_HEAP (1<<7) diff --git a/arch/x86/include/uapi/asm/ptrace-abi.h b/arch/x86/include/uapi/asm/ptrace-abi.h index 7b0a55a88851..580aee3072e0 100644 --- a/arch/x86/include/uapi/asm/ptrace-abi.h +++ b/arch/x86/include/uapi/asm/ptrace-abi.h @@ -25,13 +25,17 @@ #else /* __i386__ */ #if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS) +/* + * C ABI says these regs are callee-preserved. They aren't saved on kernel entry + * unless syscall needs a complete, fully filled "struct pt_regs". + */ #define R15 0 #define R14 8 #define R13 16 #define R12 24 #define RBP 32 #define RBX 40 -/* arguments: interrupts/non tracing syscalls only save up to here*/ +/* These regs are callee-clobbered. Always saved on kernel entry. */ #define R11 48 #define R10 56 #define R9 64 @@ -41,15 +45,17 @@ #define RDX 96 #define RSI 104 #define RDI 112 -#define ORIG_RAX 120 /* = ERROR */ -/* end of arguments */ -/* cpu exception frame or undefined in case of fast syscall. */ +/* + * On syscall entry, this is syscall#. On CPU exception, this is error code. + * On hw interrupt, it's IRQ number: + */ +#define ORIG_RAX 120 +/* Return frame for iretq */ #define RIP 128 #define CS 136 #define EFLAGS 144 #define RSP 152 #define SS 160 -#define ARGOFFSET R11 #endif /* __ASSEMBLY__ */ /* top of stack page */ diff --git a/arch/x86/include/uapi/asm/ptrace.h b/arch/x86/include/uapi/asm/ptrace.h index ac4b9aa4d999..bc16115af39b 100644 --- a/arch/x86/include/uapi/asm/ptrace.h +++ b/arch/x86/include/uapi/asm/ptrace.h @@ -41,13 +41,17 @@ struct pt_regs { #ifndef __KERNEL__ struct pt_regs { +/* + * C ABI says these regs are callee-preserved. They aren't saved on kernel entry + * unless syscall needs a complete, fully filled "struct pt_regs". + */ unsigned long r15; unsigned long r14; unsigned long r13; unsigned long r12; unsigned long rbp; unsigned long rbx; -/* arguments: non interrupts/non tracing syscalls only save up to here*/ +/* These regs are callee-clobbered. Always saved on kernel entry. */ unsigned long r11; unsigned long r10; unsigned long r9; @@ -57,9 +61,12 @@ struct pt_regs { unsigned long rdx; unsigned long rsi; unsigned long rdi; +/* + * On syscall entry, this is syscall#. On CPU exception, this is error code. + * On hw interrupt, it's IRQ number: + */ unsigned long orig_rax; -/* end of arguments */ -/* cpu exception frame or undefined */ +/* Return frame for iretq */ unsigned long rip; unsigned long cs; unsigned long eflags; diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index d8b9f9081e86..16dc4e8a2cd3 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -177,9 +177,24 @@ struct sigcontext { __u64 rip; __u64 eflags; /* RFLAGS */ __u16 cs; - __u16 gs; - __u16 fs; - __u16 __pad0; + + /* + * Prior to 2.5.64 ("[PATCH] x86-64 updates for 2.5.64-bk3"), + * Linux saved and restored fs and gs in these slots. This + * was counterproductive, as fsbase and gsbase were never + * saved, so arch_prctl was presumably unreliable. + * + * If these slots are ever needed for any other purpose, there + * is some risk that very old 64-bit binaries could get + * confused. I doubt that many such binaries still work, + * though, since the same patch in 2.5.64 also removed the + * 64-bit set_thread_area syscall, so it appears that there is + * no TLS API that works in both pre- and post-2.5.64 kernels. + */ + __u16 __pad2; /* Was gs. */ + __u16 __pad1; /* Was fs. */ + + __u16 ss; __u64 err; __u64 trapno; __u64 oldmask; |