Diffstat (limited to 'arch/x86')
79 files changed, 1184 insertions, 836 deletions
diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S
index 246c67006ed0..8c1fcb6bad21 100644
--- a/arch/x86/crypto/blowfish-x86_64-asm_64.S
+++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S
@@ -33,7 +33,7 @@
 #define s3	((16 + 2 + (3 * 256)) * 4)
 
 /* register macros */
-#define CTX %rdi
+#define CTX %r12
 #define RIO %rsi
 
 #define RX0 %rax
@@ -56,12 +56,12 @@
 #define RX2bh %ch
 #define RX3bh %dh
 
-#define RT0 %rbp
+#define RT0 %rdi
 #define RT1 %rsi
 #define RT2 %r8
 #define RT3 %r9
 
-#define RT0d %ebp
+#define RT0d %edi
 #define RT1d %esi
 #define RT2d %r8d
 #define RT3d %r9d
@@ -120,13 +120,14 @@
 
 ENTRY(__blowfish_enc_blk)
 	/* input:
-	 *	%rdi: ctx, CTX
+	 *	%rdi: ctx
 	 *	%rsi: dst
 	 *	%rdx: src
 	 *	%rcx: bool, if true: xor output
 	 */
-	movq %rbp, %r11;
+	movq %r12, %r11;
+	movq %rdi, CTX;
 
 	movq %rsi, %r10;
 	movq %rdx, RIO;
@@ -142,7 +143,7 @@ ENTRY(__blowfish_enc_blk)
 	round_enc(14);
 	add_roundkey_enc(16);
 
-	movq %r11, %rbp;
+	movq %r11, %r12;
 
 	movq %r10, RIO;
 	test %cl, %cl;
@@ -157,12 +158,13 @@ ENDPROC(__blowfish_enc_blk)
 
 ENTRY(blowfish_dec_blk)
 	/* input:
-	 *	%rdi: ctx, CTX
+	 *	%rdi: ctx
 	 *	%rsi: dst
 	 *	%rdx: src
 	 */
-	movq %rbp, %r11;
+	movq %r12, %r11;
+	movq %rdi, CTX;
 
 	movq %rsi, %r10;
 	movq %rdx, RIO;
@@ -181,7 +183,7 @@ ENTRY(blowfish_dec_blk)
 	movq %r10, RIO;
 	write_block();
 
-	movq %r11, %rbp;
+	movq %r11, %r12;
 
 	ret;
 ENDPROC(blowfish_dec_blk)
@@ -298,20 +300,21 @@ ENDPROC(blowfish_dec_blk)
 
 ENTRY(__blowfish_enc_blk_4way)
 	/* input:
-	 *	%rdi: ctx, CTX
+	 *	%rdi: ctx
 	 *	%rsi: dst
 	 *	%rdx: src
 	 *	%rcx: bool, if true: xor output
 	 */
-	pushq %rbp;
+	pushq %r12;
 	pushq %rbx;
 	pushq %rcx;
 
-	preload_roundkey_enc(0);
-
+	movq %rdi, CTX
 	movq %rsi, %r11;
 	movq %rdx, RIO;
 
+	preload_roundkey_enc(0);
+
 	read_block4();
 
 	round_enc4(0);
@@ -324,39 +327,40 @@ ENTRY(__blowfish_enc_blk_4way)
 	round_enc4(14);
 	add_preloaded_roundkey4();
 
-	popq %rbp;
+	popq %r12;
 	movq %r11, RIO;
 
-	test %bpl, %bpl;
+	test %r12b, %r12b;
 	jnz .L__enc_xor4;
 
 	write_block4();
 
 	popq %rbx;
-	popq %rbp;
+	popq %r12;
 	ret;
 
.L__enc_xor4:
 	xor_block4();
 
 	popq %rbx;
-	popq %rbp;
+	popq %r12;
 	ret;
 ENDPROC(__blowfish_enc_blk_4way)
 
 ENTRY(blowfish_dec_blk_4way)
 	/* input:
-	 *	%rdi: ctx, CTX
+	 *	%rdi: ctx
 	 *	%rsi: dst
 	 *	%rdx: src
 	 */
-	pushq %rbp;
+	pushq %r12;
 	pushq %rbx;
 
-	preload_roundkey_dec(17);
-	movq %rsi, %r11;
+	movq %rdi, CTX;
+	movq %rsi, %r11
 	movq %rdx, RIO;
 
+	preload_roundkey_dec(17);
+
 	read_block4();
 
 	round_dec4(17);
@@ -373,7 +377,7 @@ ENTRY(blowfish_dec_blk_4way)
 	write_block4();
 
 	popq %rbx;
-	popq %rbp;
+	popq %r12;
 
 	ret;
 ENDPROC(blowfish_dec_blk_4way)
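[Annotation: the conversions above and below all follow one pattern: %rbp must stay intact so the frame-pointer unwinder can walk the stack, so a callee-saved register such as %r12 takes over as scratch, bracketed by pushq/popq. A minimal userspace C sketch of the same idea - not kernel code - where naming %r12 in the clobber list makes the compiler emit the save/restore that the .S files above do by hand:]

#include <stdint.h>
#include <stdio.h>

/* Use %r12 as an asm scratch register; because it is callee-saved and
 * listed as clobbered, GCC emits the push/pop around this function,
 * while %rbp stays untouched for the unwinder. */
static uint64_t rotate_via_r12(uint64_t x)
{
	uint64_t out;

	asm("movq %1, %%r12\n\t"
	    "rolq $32, %%r12\n\t"
	    "movq %%r12, %0"
	    : "=r" (out)
	    : "r" (x)
	    : "r12");
	return out;
}

int main(void)
{
	/* prints 0x5566778811223344 */
	printf("%#llx\n", (unsigned long long)rotate_via_r12(0x1122334455667788ULL));
	return 0;
}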
diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S
index 310319c601ed..95ba6956a7f6 100644
--- a/arch/x86/crypto/camellia-x86_64-asm_64.S
+++ b/arch/x86/crypto/camellia-x86_64-asm_64.S
@@ -75,17 +75,17 @@
 #define RCD1bh %dh
 
 #define RT0 %rsi
-#define RT1 %rbp
+#define RT1 %r12
 #define RT2 %r8
 
 #define RT0d %esi
-#define RT1d %ebp
+#define RT1d %r12d
 #define RT2d %r8d
 
 #define RT2bl %r8b
 
 #define RXOR %r9
-#define RRBP %r10
+#define RR12 %r10
 #define RDST %r11
 
 #define RXORd %r9d
@@ -197,7 +197,7 @@ ENTRY(__camellia_enc_blk)
 	 *	%rdx: src
 	 *	%rcx: bool xor
 	 */
-	movq %rbp, RRBP;
+	movq %r12, RR12;
 
 	movq %rcx, RXOR;
 	movq %rsi, RDST;
@@ -227,13 +227,13 @@ ENTRY(__camellia_enc_blk)
 
 	enc_outunpack(mov, RT1);
 
-	movq RRBP, %rbp;
+	movq RR12, %r12;
 	ret;
 
.L__enc_xor:
 	enc_outunpack(xor, RT1);
 
-	movq RRBP, %rbp;
+	movq RR12, %r12;
 	ret;
 ENDPROC(__camellia_enc_blk)
 
@@ -248,7 +248,7 @@ ENTRY(camellia_dec_blk)
 	movl $24, RXORd;
 	cmovel RXORd, RT2d; /* max */
 
-	movq %rbp, RRBP;
+	movq %r12, RR12;
 	movq %rsi, RDST;
 	movq %rdx, RIO;
@@ -271,7 +271,7 @@ ENTRY(camellia_dec_blk)
 
 	dec_outunpack();
 
-	movq RRBP, %rbp;
+	movq RR12, %r12;
 	ret;
 ENDPROC(camellia_dec_blk)
 
@@ -433,7 +433,7 @@ ENTRY(__camellia_enc_blk_2way)
 	 */
 	pushq %rbx;
 
-	movq %rbp, RRBP;
+	movq %r12, RR12;
 	movq %rcx, RXOR;
 	movq %rsi, RDST;
 	movq %rdx, RIO;
@@ -461,14 +461,14 @@ ENTRY(__camellia_enc_blk_2way)
 
 	enc_outunpack2(mov, RT2);
 
-	movq RRBP, %rbp;
+	movq RR12, %r12;
 	popq %rbx;
 	ret;
 
.L__enc2_xor:
 	enc_outunpack2(xor, RT2);
 
-	movq RRBP, %rbp;
+	movq RR12, %r12;
 	popq %rbx;
 	ret;
 ENDPROC(__camellia_enc_blk_2way)
 
@@ -485,7 +485,7 @@ ENTRY(camellia_dec_blk_2way)
 	cmovel RXORd, RT2d; /* max */
 
 	movq %rbx, RXOR;
-	movq %rbp, RRBP;
+	movq %r12, RR12;
 	movq %rsi, RDST;
 	movq %rdx, RIO;
@@ -508,7 +508,7 @@ ENTRY(camellia_dec_blk_2way)
 
 	dec_outunpack2();
 
-	movq RRBP, %rbp;
+	movq RR12, %r12;
 	movq RXOR, %rbx;
 	ret;
 ENDPROC(camellia_dec_blk_2way)
diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
index b4a8806234ea..86107c961bb4 100644
--- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -47,7 +47,7 @@
 /**********************************************************************
   16-way AVX cast5
  **********************************************************************/
-#define CTX %rdi
+#define CTX %r15
 
 #define RL1 %xmm0
 #define RR1 %xmm1
@@ -70,8 +70,8 @@
 
 #define RTMP %xmm15
 
-#define RID1  %rbp
-#define RID1d %ebp
+#define RID1  %rdi
+#define RID1d %edi
 #define RID2  %rsi
 #define RID2d %esi
@@ -226,7 +226,7 @@
.align 16
__cast5_enc_blk16:
 	/* input:
-	 *	%rdi: ctx, CTX
+	 *	%rdi: ctx
 	 *	RL1: blocks 1 and 2
 	 *	RR1: blocks 3 and 4
 	 *	RL2: blocks 5 and 6
@@ -246,9 +246,11 @@ __cast5_enc_blk16:
 	 *	RR4: encrypted blocks 15 and 16
 	 */
 
-	pushq %rbp;
+	pushq %r15;
 	pushq %rbx;
 
+	movq %rdi, CTX;
+
 	vmovdqa .Lbswap_mask, RKM;
 	vmovd .Lfirst_mask, R1ST;
 	vmovd .L32_mask, R32;
@@ -283,7 +285,7 @@ __cast5_enc_blk16:
 
.L__skip_enc:
 	popq %rbx;
-	popq %rbp;
+	popq %r15;
 
 	vmovdqa .Lbswap_mask, RKM;
 
@@ -298,7 +300,7 @@ ENDPROC(__cast5_enc_blk16)
.align 16
__cast5_dec_blk16:
 	/* input:
-	 *	%rdi: ctx, CTX
+	 *	%rdi: ctx
 	 *	RL1: encrypted blocks 1 and 2
 	 *	RR1: encrypted blocks 3 and 4
 	 *	RL2: encrypted blocks 5 and 6
@@ -318,9 +320,11 @@ __cast5_dec_blk16:
 	 *	RR4: decrypted blocks 15 and 16
 	 */
 
-	pushq %rbp;
+	pushq %r15;
 	pushq %rbx;
 
+	movq %rdi, CTX;
+
 	vmovdqa .Lbswap_mask, RKM;
 	vmovd .Lfirst_mask, R1ST;
 	vmovd .L32_mask, R32;
@@ -356,7 +360,7 @@ __cast5_dec_blk16:
 	vmovdqa .Lbswap_mask, RKM;
 
 	popq %rbx;
-	popq %rbp;
+	popq %r15;
 
 	outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
 	outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
@@ -372,12 +376,14 @@ ENDPROC(__cast5_dec_blk16)
 
 ENTRY(cast5_ecb_enc_16way)
 	/* input:
-	 *	%rdi: ctx, CTX
+	 *	%rdi: ctx
 	 *	%rsi: dst
 	 *	%rdx: src
 	 */
 	FRAME_BEGIN
+	pushq %r15;
 
+	movq %rdi, CTX;
 	movq %rsi, %r11;
 
 	vmovdqu (0*4*4)(%rdx), RL1;
@@ -400,18 +406,22 @@ ENTRY(cast5_ecb_enc_16way)
 	vmovdqu RR4, (6*4*4)(%r11);
 	vmovdqu RL4, (7*4*4)(%r11);
 
+	popq %r15;
 	FRAME_END
 	ret;
 ENDPROC(cast5_ecb_enc_16way)
 
 ENTRY(cast5_ecb_dec_16way)
 	/* input:
-	 *	%rdi: ctx, CTX
+	 *	%rdi: ctx
 	 *	%rsi: dst
 	 *	%rdx: src
 	 */
 	FRAME_BEGIN
+	pushq %r15;
+
+	movq %rdi, CTX;
 	movq %rsi, %r11;
 
 	vmovdqu (0*4*4)(%rdx), RL1;
@@ -434,20 +444,22 @@ ENTRY(cast5_ecb_dec_16way)
 	vmovdqu RR4, (6*4*4)(%r11);
 	vmovdqu RL4, (7*4*4)(%r11);
 
+	popq %r15;
 	FRAME_END
 	ret;
 ENDPROC(cast5_ecb_dec_16way)
 
 ENTRY(cast5_cbc_dec_16way)
 	/* input:
-	 *	%rdi: ctx, CTX
+	 *	%rdi: ctx
 	 *	%rsi: dst
 	 *	%rdx: src
 	 */
 	FRAME_BEGIN
-
 	pushq %r12;
+	pushq %r15;
 
+	movq %rdi, CTX;
 	movq %rsi, %r11;
 	movq %rdx, %r12;
@@ -483,23 +495,24 @@ ENTRY(cast5_cbc_dec_16way)
 	vmovdqu RR4, (6*16)(%r11);
 	vmovdqu RL4, (7*16)(%r11);
 
+	popq %r15;
 	popq %r12;
-
 	FRAME_END
 	ret;
 ENDPROC(cast5_cbc_dec_16way)
 
 ENTRY(cast5_ctr_16way)
 	/* input:
-	 *	%rdi: ctx, CTX
+	 *	%rdi: ctx
 	 *	%rsi: dst
 	 *	%rdx: src
 	 *	%rcx: iv (big endian, 64bit)
 	 */
 	FRAME_BEGIN
-
 	pushq %r12;
+	pushq %r15;
 
+	movq %rdi, CTX;
 	movq %rsi, %r11;
 	movq %rdx, %r12;
@@ -558,8 +571,8 @@ ENTRY(cast5_ctr_16way)
 	vmovdqu RR4, (6*16)(%r11);
 	vmovdqu RL4, (7*16)(%r11);
 
+	popq %r15;
 	popq %r12;
-
 	FRAME_END
 	ret;
 ENDPROC(cast5_ctr_16way)
diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
index 952d3156a933..7f30b6f0d72c 100644
--- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -47,7 +47,7 @@
 /**********************************************************************
   8-way AVX cast6
  **********************************************************************/
-#define CTX %rdi
+#define CTX %r15
 
 #define RA1 %xmm0
 #define RB1 %xmm1
@@ -70,8 +70,8 @@
 
 #define RTMP %xmm15
 
-#define RID1  %rbp
-#define RID1d %ebp
+#define RID1  %rdi
+#define RID1d %edi
 #define RID2  %rsi
 #define RID2d %esi
@@ -264,15 +264,17 @@
.align 8
__cast6_enc_blk8:
 	/* input:
-	 *	%rdi: ctx, CTX
+	 *	%rdi: ctx
 	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
 	 * output:
 	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
 	 */
 
-	pushq %rbp;
+	pushq %r15;
 	pushq %rbx;
 
+	movq %rdi, CTX;
+
 	vmovdqa .Lbswap_mask, RKM;
 	vmovd .Lfirst_mask, R1ST;
 	vmovd .L32_mask, R32;
@@ -297,7 +299,7 @@ __cast6_enc_blk8:
 	QBAR(11);
 
 	popq %rbx;
-	popq %rbp;
+	popq %r15;
 
 	vmovdqa .Lbswap_mask, RKM;
 
@@ -310,15 +312,17 @@ ENDPROC(__cast6_enc_blk8)
.align 8
__cast6_dec_blk8:
 	/* input:
-	 *	%rdi: ctx, CTX
+	 *	%rdi: ctx
 	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
 	 * output:
 	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
 	 */
 
-	pushq %rbp;
+	pushq %r15;
 	pushq %rbx;
 
+	movq %rdi, CTX;
+
 	vmovdqa .Lbswap_mask, RKM;
 	vmovd .Lfirst_mask, R1ST;
 	vmovd .L32_mask, R32;
@@ -343,7 +347,7 @@ __cast6_dec_blk8:
 	QBAR(0);
 
 	popq %rbx;
-	popq %rbp;
+	popq %r15;
 
 	vmovdqa .Lbswap_mask, RKM;
 
 	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
@@ -354,12 +358,14 @@ ENDPROC(__cast6_dec_blk8)
 
 ENTRY(cast6_ecb_enc_8way)
 	/* input:
-	 *	%rdi: ctx, CTX
+	 *	%rdi: ctx
 	 *	%rsi: dst
 	 *	%rdx: src
 	 */
 	FRAME_BEGIN
+	pushq %r15;
 
+	movq %rdi, CTX;
 	movq %rsi, %r11;
 
 	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
@@ -368,18 +374,21 @@ ENTRY(cast6_ecb_enc_8way)
 
 	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
 
+	popq %r15;
 	FRAME_END
 	ret;
 ENDPROC(cast6_ecb_enc_8way)
 
 ENTRY(cast6_ecb_dec_8way)
 	/* input:
-	 *	%rdi: ctx, CTX
+	 *	%rdi: ctx
 	 *	%rsi: dst
 	 *	%rdx: src
 	 */
 	FRAME_BEGIN
+	pushq %r15;
 
+	movq %rdi, CTX;
 	movq %rsi, %r11;
 
 	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
@@ -388,20 +397,22 @@ ENTRY(cast6_ecb_dec_8way)
 
 	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
 
+	popq %r15;
 	FRAME_END
 	ret;
 ENDPROC(cast6_ecb_dec_8way)
 
 ENTRY(cast6_cbc_dec_8way)
 	/* input:
-	 *	%rdi: ctx, CTX
+	 *	%rdi: ctx
 	 *	%rsi: dst
 	 *	%rdx: src
 	 */
 	FRAME_BEGIN
-
 	pushq %r12;
+	pushq %r15;
 
+	movq %rdi, CTX;
 	movq %rsi, %r11;
 	movq %rdx, %r12;
 
@@ -411,8 +422,8 @@ ENTRY(cast6_cbc_dec_8way)
 
 	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
 
+	popq %r15;
 	popq %r12;
-
 	FRAME_END
 	ret;
 ENDPROC(cast6_cbc_dec_8way)
@@ -425,9 +436,10 @@ ENTRY(cast6_ctr_8way)
 	 *	%rcx: iv (little endian, 128bit)
 	 */
 	FRAME_BEGIN
-
 	pushq %r12;
+	pushq %r15
 
+	movq %rdi, CTX;
 	movq %rsi, %r11;
 	movq %rdx, %r12;
 
@@ -438,8 +450,8 @@ ENTRY(cast6_ctr_8way)
 
 	store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
 
+	popq %r15;
 	popq %r12;
-
 	FRAME_END
 	ret;
 ENDPROC(cast6_ctr_8way)
@@ -452,7 +464,9 @@ ENTRY(cast6_xts_enc_8way)
 	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
 	 */
 	FRAME_BEGIN
+	pushq %r15;
 
+	movq %rdi, CTX
 	movq %rsi, %r11;
 
 	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
@@ -464,6 +478,7 @@ ENTRY(cast6_xts_enc_8way)
 	/* dst <= regs xor IVs(in dst) */
 	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
 
+	popq %r15;
 	FRAME_END
 	ret;
 ENDPROC(cast6_xts_enc_8way)
@@ -476,7 +491,9 @@ ENTRY(cast6_xts_dec_8way)
 	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
 	 */
 	FRAME_BEGIN
+	pushq %r15;
 
+	movq %rdi, CTX
 	movq %rsi, %r11;
 
 	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
@@ -488,6 +505,7 @@ ENTRY(cast6_xts_dec_8way)
 	/* dst <= regs xor IVs(in dst) */
 	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
 
+	popq %r15;
 	FRAME_END
 	ret;
 ENDPROC(cast6_xts_dec_8way)
diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S
index f3e91647ca27..8e49ce117494 100644
--- a/arch/x86/crypto/des3_ede-asm_64.S
+++ b/arch/x86/crypto/des3_ede-asm_64.S
@@ -64,12 +64,12 @@
 #define RW2bh %ch
 
 #define RT0 %r15
-#define RT1 %rbp
+#define RT1 %rsi
 #define RT2 %r14
 #define RT3 %rdx
 
 #define RT0d %r15d
-#define RT1d %ebp
+#define RT1d %esi
 #define RT2d %r14d
 #define RT3d %edx
@@ -177,13 +177,14 @@ ENTRY(des3_ede_x86_64_crypt_blk)
 	 * %rsi: dst
 	 * %rdx: src
 	 */
-	pushq %rbp;
 	pushq %rbx;
 	pushq %r12;
 	pushq %r13;
 	pushq %r14;
 	pushq %r15;
 
+	pushq %rsi; /* dst */
+
 	read_block(%rdx, RL0, RR0);
 	initial_permutation(RL0, RR0);
@@ -241,6 +242,8 @@ ENTRY(des3_ede_x86_64_crypt_blk)
 	round1(32+15, RL0, RR0, dummy2);
 
 	final_permutation(RR0, RL0);
+
+	popq %rsi /* dst */
 	write_block(%rsi, RR0, RL0);
 
 	popq %r15;
@@ -248,7 +251,6 @@ ENTRY(des3_ede_x86_64_crypt_blk)
 	popq %r13;
 	popq %r12;
 	popq %rbx;
-	popq %rbp;
 
 	ret;
 ENDPROC(des3_ede_x86_64_crypt_blk)
@@ -432,13 +434,14 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
 	 * %rdx: src (3 blocks)
 	 */
 
-	pushq %rbp;
 	pushq %rbx;
 	pushq %r12;
 	pushq %r13;
 	pushq %r14;
 	pushq %r15;
 
+	pushq %rsi /* dst */
+
 	/* load input */
 	movl 0 * 4(%rdx), RL0d;
 	movl 1 * 4(%rdx), RR0d;
@@ -520,6 +523,7 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
 	bswapl RR2d;
 	bswapl RL2d;
 
+	popq %rsi /* dst */
 	movl RR0d, 0 * 4(%rsi);
 	movl RL0d, 1 * 4(%rsi);
 	movl RR1d, 2 * 4(%rsi);
@@ -532,7 +536,6 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
 	popq %r13;
 	popq %r12;
 	popq %rbx;
-	popq %rbp;
 
 	ret;
 ENDPROC(des3_ede_x86_64_crypt_blk_3way)
diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
index 1eab79c9ac48..9f712a7dfd79 100644
--- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S
+++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
@@ -89,7 +89,7 @@
 #define	REG_RE	%rdx
 #define	REG_RTA	%r12
 #define	REG_RTB	%rbx
-#define	REG_T1	%ebp
+#define	REG_T1	%r11d
 #define	xmm_mov	vmovups
 #define	avx2_zeroupper	vzeroupper
 #define	RND_F1	1
@@ -637,7 +637,6 @@ _loop3:
 	ENTRY(\name)
 
 	push	%rbx
-	push	%rbp
 	push	%r12
 	push	%r13
 	push	%r14
@@ -673,7 +672,6 @@ _loop3:
 	pop	%r14
 	pop	%r13
 	pop	%r12
-	pop	%rbp
 	pop	%rbx
 
 	ret
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
index a4109506a5e8..6204bd53528c 100644
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ b/arch/x86/crypto/sha1_ssse3_asm.S
@@ -37,7 +37,7 @@
 #define REG_A			%ecx
 #define REG_B			%esi
 #define REG_C			%edi
-#define REG_D			%ebp
+#define REG_D			%r12d
 #define REG_E			%edx
 
 #define REG_T1			%eax
@@ -74,10 +74,10 @@
 	ENTRY(\name)
 
 	push	%rbx
-	push	%rbp
 	push	%r12
+	push	%rbp
+	mov	%rsp, %rbp
 
-	mov	%rsp, %r12
 	sub	$64, %rsp		# allocate workspace
 	and	$~15, %rsp		# align stack
@@ -99,10 +99,9 @@
 	xor	%rax, %rax
 	rep stosq
 
-	mov	%r12, %rsp		# deallocate workspace
-
-	pop	%r12
+	mov	%rbp, %rsp		# deallocate workspace
 	pop	%rbp
+	pop	%r12
 	pop	%rbx
 	ret
diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S
index e08888a1a5f2..001bbcf93c79 100644
--- a/arch/x86/crypto/sha256-avx-asm.S
+++ b/arch/x86/crypto/sha256-avx-asm.S
@@ -103,7 +103,7 @@ SRND = %rsi       # clobbers INP
 c = %ecx
 d = %r8d
 e = %edx
-TBL = %rbp
+TBL = %r12
 a = %eax
 b = %ebx
@@ -350,13 +350,13 @@ a = TMP_
 ENTRY(sha256_transform_avx)
.align 32
 	pushq	%rbx
-	pushq	%rbp
+	pushq	%r12
 	pushq	%r13
 	pushq	%r14
 	pushq	%r15
-	pushq	%r12
+	pushq	%rbp
+	movq	%rsp, %rbp
 
-	mov	%rsp, %r12
 	subq	$STACK_SIZE, %rsp	# allocate stack space
 	and	$~15, %rsp		# align stack pointer
@@ -452,13 +452,12 @@ loop2:
 
 done_hash:
 
-	mov	%r12, %rsp
-
-	popq	%r12
+	mov	%rbp, %rsp
+	popq	%rbp
 	popq	%r15
 	popq	%r14
 	popq	%r13
-	popq	%rbp
+	popq	%r12
 	popq	%rbx
 	ret
 ENDPROC(sha256_transform_avx)
diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S
index 89c8f09787d2..1420db15dcdd 100644
--- a/arch/x86/crypto/sha256-avx2-asm.S
+++ b/arch/x86/crypto/sha256-avx2-asm.S
@@ -98,8 +98,6 @@ d = %r8d
 e = %edx	# clobbers NUM_BLKS
 y3 = %esi	# clobbers INP
 
-
-TBL = %rbp
 SRND = CTX	# SRND is same register as CTX
 
 a = %eax
@@ -531,7 +529,6 @@ STACK_SIZE = _RSP + _RSP_SIZE
 ENTRY(sha256_transform_rorx)
.align 32
 	pushq	%rbx
-	pushq	%rbp
 	pushq	%r12
 	pushq	%r13
 	pushq	%r14
@@ -568,8 +565,6 @@ ENTRY(sha256_transform_rorx)
 	mov	CTX, _CTX(%rsp)
 
loop0:
-	lea	K256(%rip), TBL
-
 	## Load first 16 dwords from two blocks
 	VMOVDQ	0*32(INP),XTMP0
 	VMOVDQ	1*32(INP),XTMP1
@@ -597,19 +592,19 @@ last_block_enter:
.align 16
loop1:
-	vpaddd	0*32(TBL, SRND), X0, XFER
+	vpaddd	K256+0*32(SRND), X0, XFER
 	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
 	FOUR_ROUNDS_AND_SCHED	_XFER + 0*32
 
-	vpaddd	1*32(TBL, SRND), X0, XFER
+	vpaddd	K256+1*32(SRND), X0, XFER
 	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
 	FOUR_ROUNDS_AND_SCHED	_XFER + 1*32
 
-	vpaddd	2*32(TBL, SRND), X0, XFER
+	vpaddd	K256+2*32(SRND), X0, XFER
 	vmovdqa	XFER, 2*32+_XFER(%rsp, SRND)
 	FOUR_ROUNDS_AND_SCHED	_XFER + 2*32
 
-	vpaddd	3*32(TBL, SRND), X0, XFER
+	vpaddd	K256+3*32(SRND), X0, XFER
 	vmovdqa	XFER, 3*32+_XFER(%rsp, SRND)
 	FOUR_ROUNDS_AND_SCHED	_XFER + 3*32
 
@@ -619,10 +614,11 @@ loop1:
 
loop2:
 	## Do last 16 rounds with no scheduling
-	vpaddd	0*32(TBL, SRND), X0, XFER
+	vpaddd	K256+0*32(SRND), X0, XFER
 	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
 	DO_4ROUNDS	_XFER + 0*32
-	vpaddd	1*32(TBL, SRND), X1, XFER
+
+	vpaddd	K256+1*32(SRND), X1, XFER
 	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
 	DO_4ROUNDS	_XFER + 1*32
 	add	$2*32, SRND
@@ -676,9 +672,6 @@ loop3:
 	ja	done_hash
 
do_last_block:
-	#### do last block
-	lea	K256(%rip), TBL
-
 	VMOVDQ	0*16(INP),XWORD0
 	VMOVDQ	1*16(INP),XWORD1
 	VMOVDQ	2*16(INP),XWORD2
@@ -718,7 +711,6 @@ done_hash:
 	popq	%r14
 	popq	%r13
 	popq	%r12
-	popq	%rbp
 	popq	%rbx
 	ret
 ENDPROC(sha256_transform_rorx)
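[Annotation: in the sha256-avx2 hunks above, "vpaddd K256+0*32(SRND)" folds the table's link-time address into the instruction displacement, so no register is needed for the base. My understanding - stated as an assumption, not taken from the patch text - is that this works because kernel symbols are linked in the top 2 GB of the address space and fit a sign-extended 32-bit displacement. A rough C rendering of base-plus-displacement indexing:]

#include <stdint.h>

/* Tiny stand-in for the real 64-entry K256 round-constant table. */
static const uint32_t K256[64] = { 0x428a2f98, 0x71374491 /* ... */ };

/* "K256 + blk*32 + srnd" is computed entirely in the address, the way
 * the displacement form frees the TBL register above. */
static inline const uint32_t *k256_block(uintptr_t srnd_bytes, unsigned int blk)
{
	return (const uint32_t *)((const uint8_t *)K256 + blk * 32 + srnd_bytes);
}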
diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S
index 39b83c93e7fd..c6c05ed2c16a 100644
--- a/arch/x86/crypto/sha256-ssse3-asm.S
+++ b/arch/x86/crypto/sha256-ssse3-asm.S
@@ -95,7 +95,7 @@ SRND = %rsi       # clobbers INP
 c = %ecx
 d = %r8d
 e = %edx
-TBL = %rbp
+TBL = %r12
 a = %eax
 b = %ebx
@@ -356,13 +356,13 @@ a = TMP_
 ENTRY(sha256_transform_ssse3)
.align 32
 	pushq	%rbx
-	pushq	%rbp
+	pushq	%r12
 	pushq	%r13
 	pushq	%r14
 	pushq	%r15
-	pushq	%r12
+	pushq	%rbp
+	mov	%rsp, %rbp
 
-	mov	%rsp, %r12
 	subq	$STACK_SIZE, %rsp
 	and	$~15, %rsp
@@ -462,13 +462,12 @@ loop2:
 
done_hash:
 
-	mov	%r12, %rsp
-
-	popq	%r12
+	mov	%rbp, %rsp
+	popq	%rbp
 	popq	%r15
 	popq	%r14
 	popq	%r13
-	popq	%rbp
+	popq	%r12
 	popq	%rbx
 	ret
diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
index 7f5f6c6ec72e..b16d56005162 100644
--- a/arch/x86/crypto/sha512-avx2-asm.S
+++ b/arch/x86/crypto/sha512-avx2-asm.S
@@ -69,8 +69,9 @@ XFER  = YTMP0
 
 BYTE_FLIP_MASK  = %ymm9
 
-# 1st arg
-CTX         = %rdi
+# 1st arg is %rdi, which is saved to the stack and accessed later via %r12
+CTX1        = %rdi
+CTX2        = %r12
 # 2nd arg
 INP         = %rsi
 # 3rd arg
@@ -81,7 +82,7 @@ d           = %r8
 e           = %rdx
 y3          = %rsi
 
-TBL   = %rbp
+TBL   = %rdi # clobbers CTX1
 
 a     = %rax
 b     = %rbx
@@ -91,26 +92,26 @@ g     = %r10
 h     = %r11
 old_h = %r11
 
-T1    = %r12
+T1    = %r12 # clobbers CTX2
 y0    = %r13
 y1    = %r14
 y2    = %r15
 
-y4    = %r12
-
 # Local variables (stack frame)
 XFER_SIZE = 4*8
 SRND_SIZE = 1*8
 INP_SIZE = 1*8
 INPEND_SIZE = 1*8
+CTX_SIZE = 1*8
 RSPSAVE_SIZE = 1*8
-GPRSAVE_SIZE = 6*8
+GPRSAVE_SIZE = 5*8
 
 frame_XFER = 0
 frame_SRND = frame_XFER + XFER_SIZE
 frame_INP = frame_SRND + SRND_SIZE
 frame_INPEND = frame_INP + INP_SIZE
-frame_RSPSAVE = frame_INPEND + INPEND_SIZE
+frame_CTX = frame_INPEND + INPEND_SIZE
+frame_RSPSAVE = frame_CTX + CTX_SIZE
 frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
 frame_size = frame_GPRSAVE + GPRSAVE_SIZE
@@ -576,12 +577,11 @@ ENTRY(sha512_transform_rorx)
 	mov	%rax, frame_RSPSAVE(%rsp)
 
 	# Save GPRs
-	mov	%rbp, frame_GPRSAVE(%rsp)
-	mov	%rbx, 8*1+frame_GPRSAVE(%rsp)
-	mov	%r12, 8*2+frame_GPRSAVE(%rsp)
-	mov	%r13, 8*3+frame_GPRSAVE(%rsp)
-	mov	%r14, 8*4+frame_GPRSAVE(%rsp)
-	mov	%r15, 8*5+frame_GPRSAVE(%rsp)
+	mov	%rbx, 8*0+frame_GPRSAVE(%rsp)
+	mov	%r12, 8*1+frame_GPRSAVE(%rsp)
+	mov	%r13, 8*2+frame_GPRSAVE(%rsp)
+	mov	%r14, 8*3+frame_GPRSAVE(%rsp)
+	mov	%r15, 8*4+frame_GPRSAVE(%rsp)
 
 	shl	$7, NUM_BLKS	# convert to bytes
 	jz	done_hash
@@ -589,14 +589,17 @@ ENTRY(sha512_transform_rorx)
 	mov	NUM_BLKS, frame_INPEND(%rsp)
 
 	## load initial digest
-	mov	8*0(CTX),a
-	mov	8*1(CTX),b
-	mov	8*2(CTX),c
-	mov	8*3(CTX),d
-	mov	8*4(CTX),e
-	mov	8*5(CTX),f
-	mov	8*6(CTX),g
-	mov	8*7(CTX),h
+	mov	8*0(CTX1), a
+	mov	8*1(CTX1), b
+	mov	8*2(CTX1), c
+	mov	8*3(CTX1), d
+	mov	8*4(CTX1), e
+	mov	8*5(CTX1), f
+	mov	8*6(CTX1), g
+	mov	8*7(CTX1), h
+
+	# save %rdi (CTX) before it gets clobbered
+	mov	%rdi, frame_CTX(%rsp)
 
 	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
@@ -652,14 +655,15 @@ loop2:
 	subq	$1, frame_SRND(%rsp)
 	jne	loop2
 
-	addm	8*0(CTX),a
-	addm	8*1(CTX),b
-	addm	8*2(CTX),c
-	addm	8*3(CTX),d
-	addm	8*4(CTX),e
-	addm	8*5(CTX),f
-	addm	8*6(CTX),g
-	addm	8*7(CTX),h
+	mov	frame_CTX(%rsp), CTX2
+	addm	8*0(CTX2), a
+	addm	8*1(CTX2), b
+	addm	8*2(CTX2), c
+	addm	8*3(CTX2), d
+	addm	8*4(CTX2), e
+	addm	8*5(CTX2), f
+	addm	8*6(CTX2), g
+	addm	8*7(CTX2), h
 
 	mov	frame_INP(%rsp), INP
 	add	$128, INP
@@ -669,12 +673,11 @@ loop2:
done_hash:
 
 	# Restore GPRs
-	mov	frame_GPRSAVE(%rsp)     ,%rbp
-	mov	8*1+frame_GPRSAVE(%rsp) ,%rbx
-	mov	8*2+frame_GPRSAVE(%rsp) ,%r12
-	mov	8*3+frame_GPRSAVE(%rsp) ,%r13
-	mov	8*4+frame_GPRSAVE(%rsp) ,%r14
-	mov	8*5+frame_GPRSAVE(%rsp) ,%r15
+	mov	8*0+frame_GPRSAVE(%rsp), %rbx
+	mov	8*1+frame_GPRSAVE(%rsp), %r12
+	mov	8*2+frame_GPRSAVE(%rsp), %r13
+	mov	8*3+frame_GPRSAVE(%rsp), %r14
+	mov	8*4+frame_GPRSAVE(%rsp), %r15
 
 	# Restore Stack Pointer
 	mov	frame_RSPSAVE(%rsp), %rsp
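[Annotation: the SHA transforms above all carve an aligned scratch workspace out of the stack: save the incoming %rsp (into %rbp, or into a frame slot for sha512), subtract the workspace size, mask off the low bits, then restore with a single mov on exit. A hedged C sketch of the same layout trick:]

#include <stdint.h>
#include <string.h>

/* Round a raw buffer address up to "align", mirroring the effect of
 * "sub $STACK_SIZE, %rsp; and $~15, %rsp". Sketch only. */
static void *align_workspace(void *raw, uintptr_t align)
{
	return (void *)(((uintptr_t)raw + align - 1) & ~(align - 1));
}

void process_block(const uint8_t *in)
{
	uint8_t raw[64 + 31];			/* over-allocate for alignment */
	uint8_t *ws = align_workspace(raw, 32);

	memcpy(ws, in, 64);
	/* ... vector-friendly processing of the aligned workspace ... */
	memset(ws, 0, 64);			/* burn the stack, as the asm's rep stosq does */
}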
a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -76,8 +76,8 @@
 #define RT %xmm14
 #define RR %xmm15
 
-#define RID1  %rbp
-#define RID1d %ebp
+#define RID1  %r13
+#define RID1d %r13d
 #define RID2  %rsi
 #define RID2d %esi
@@ -259,7 +259,7 @@ __twofish_enc_blk8:
 
 	vmovdqu w(CTX), RK1;
 
-	pushq %rbp;
+	pushq %r13;
 	pushq %rbx;
 	pushq %rcx;
@@ -282,7 +282,7 @@ __twofish_enc_blk8:
 
 	popq %rcx;
 	popq %rbx;
-	popq %rbp;
+	popq %r13;
 
 	outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
 	outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
@@ -301,7 +301,7 @@ __twofish_dec_blk8:
 
 	vmovdqu (w+4*4)(CTX), RK1;
 
-	pushq %rbp;
+	pushq %r13;
 	pushq %rbx;
 
 	inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
@@ -322,7 +322,7 @@ __twofish_dec_blk8:
 	vmovdqu (w)(CTX), RK1;
 
 	popq %rbx;
-	popq %rbp;
+	popq %r13;
 
 	outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
 	outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 8a13d468635a..50e0d2bc4528 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -176,7 +176,7 @@
 /*
  * This is a sneaky trick to help the unwinder find pt_regs on the stack.  The
  * frame pointer is replaced with an encoded pointer to pt_regs.  The encoding
- * is just setting the LSB, which makes it an invalid stack address and is also
+ * is just clearing the MSB, which makes it an invalid stack address and is also
  * a signal to the unwinder that it's a pt_regs pointer in disguise.
  *
  * NOTE: This macro must be used *after* SAVE_ALL because it corrupts the
@@ -185,7 +185,7 @@
.macro ENCODE_FRAME_POINTER
 #ifdef CONFIG_FRAME_POINTER
 	mov %esp, %ebp
-	orl $0x1, %ebp
+	andl $0x7fffffff, %ebp
 #endif
.endm
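[Annotation: a worked sketch of the new 32-bit frame-pointer encoding. The helpers are hypothetical; only the mask follows the "andl $0x7fffffff" above. Kernel stack addresses on x86_32 have the top bit set, so clearing it yields a value that cannot be a stack address, and a decoder can OR the bit back in:]

#include <assert.h>
#include <stdint.h>

struct pt_regs;	/* opaque here */

static inline uint32_t encode_frame_pointer(struct pt_regs *regs)
{
	return (uint32_t)(uintptr_t)regs & 0x7fffffff;	/* clear the MSB */
}

static inline struct pt_regs *decode_frame_pointer(uint32_t ebp)
{
	return (struct pt_regs *)(uintptr_t)(ebp | 0x80000000);	/* set it back */
}

int main(void)
{
	struct pt_regs *regs = (struct pt_regs *)(uintptr_t)0xc1234560;

	assert(decode_frame_pointer(encode_frame_pointer(regs)) == regs);
	return 0;
}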
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 829e89cfcee2..9fb9a1f1e47b 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4409,10 +4409,9 @@ static __init int fixup_ht_bug(void)
 		return 0;
 	}
 
-	if (lockup_detector_suspend() != 0) {
-		pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n");
-		return 0;
-	}
+	cpus_read_lock();
+
+	hardlockup_detector_perf_stop();
 
 	x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
 
@@ -4420,9 +4419,7 @@ static __init int fixup_ht_bug(void)
 	x86_pmu.commit_scheduling = NULL;
 	x86_pmu.stop_scheduling = NULL;
 
-	lockup_detector_resume();
-
-	cpus_read_lock();
+	hardlockup_detector_perf_restart();
 
 	for_each_online_cpu(c)
 		free_excl_cntrs(c);
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index 4cf100ff2a37..72db0664a53d 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -552,6 +552,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
 
 	X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_MOBILE,  snb_cstates),
 	X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_DESKTOP, snb_cstates),
+	X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_X, snb_cstates),
 
 	X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_MOBILE,  snb_cstates),
 	X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_DESKTOP, snb_cstates),
@@ -560,6 +561,9 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
 	X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNM, knl_cstates),
 
 	X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT, glm_cstates),
+	X86_CSTATES_MODEL(INTEL_FAM6_ATOM_DENVERTON, glm_cstates),
+
+	X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GEMINI_LAKE, glm_cstates),
 	{ },
 };
 MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match);
diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c
index 8e2457cb6b4a..005908ee9333 100644
--- a/arch/x86/events/intel/rapl.c
+++ b/arch/x86/events/intel/rapl.c
@@ -775,6 +775,9 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = {
 	X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, skl_rapl_init),
 
 	X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, hsw_rapl_init),
+	X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_DENVERTON, hsw_rapl_init),
+
+	X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GEMINI_LAKE, hsw_rapl_init),
 	{},
 };
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 1c5390f1cf09..d45e06346f14 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -822,7 +822,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid)
 		pmus[i].type	= type;
 		pmus[i].boxes	= kzalloc(size, GFP_KERNEL);
 		if (!pmus[i].boxes)
-			return -ENOMEM;
+			goto err;
 	}
 
 	type->pmus = pmus;
@@ -836,7 +836,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid)
 		attr_group = kzalloc(sizeof(struct attribute *) * (i + 1) +
 					sizeof(*attr_group), GFP_KERNEL);
 		if (!attr_group)
-			return -ENOMEM;
+			goto err;
 
 		attrs = (struct attribute **)(attr_group + 1);
 		attr_group->name = "events";
@@ -849,7 +849,15 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid)
 	}
 
 	type->pmu_group = &uncore_pmu_attr_group;
+
 	return 0;
+
+err:
+	for (i = 0; i < type->num_boxes; i++)
+		kfree(pmus[i].boxes);
+	kfree(pmus);
+
+	return -ENOMEM;
 }
 
 static int __init
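[Annotation: the uncore_type_init() change above swaps bare "return -ENOMEM" for a goto-based unwind so already-allocated boxes are freed on failure. The same shape as a standalone sketch, with hypothetical types:]

#include <stdlib.h>

struct pmu_stub { int *boxes; };

static int pmus_init(struct pmu_stub *pmus, int n, size_t size)
{
	int i;

	for (i = 0; i < n; i++) {
		pmus[i].boxes = calloc(1, size);
		if (!pmus[i].boxes)
			goto err;	/* unwind instead of returning directly */
	}
	return 0;

err:
	while (--i >= 0)		/* free everything allocated so far */
		free(pmus[i].boxes);
	return -1;
}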
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index db1fe377e6dd..a7196818416a 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -3462,7 +3462,7 @@ static struct intel_uncore_ops skx_uncore_iio_ops = {
 static struct intel_uncore_type skx_uncore_iio = {
 	.name			= "iio",
 	.num_counters		= 4,
-	.num_boxes		= 5,
+	.num_boxes		= 6,
 	.perf_ctr_bits		= 48,
 	.event_ctl		= SKX_IIO0_MSR_PMON_CTL0,
 	.perf_ctr		= SKX_IIO0_MSR_PMON_CTR0,
@@ -3492,7 +3492,7 @@ static const struct attribute_group skx_uncore_format_group = {
 static struct intel_uncore_type skx_uncore_irp = {
 	.name			= "irp",
 	.num_counters		= 2,
-	.num_boxes		= 5,
+	.num_boxes		= 6,
 	.perf_ctr_bits		= 48,
 	.event_ctl		= SKX_IRP0_MSR_PMON_CTL0,
 	.perf_ctr		= SKX_IRP0_MSR_PMON_CTR0,
diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
index 4bb3ec69e8ea..06723671ae4e 100644
--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@@ -63,6 +63,14 @@ static bool test_intel(int idx)
 	case INTEL_FAM6_ATOM_SILVERMONT1:
 	case INTEL_FAM6_ATOM_SILVERMONT2:
 	case INTEL_FAM6_ATOM_AIRMONT:
+
+	case INTEL_FAM6_ATOM_GOLDMONT:
+	case INTEL_FAM6_ATOM_DENVERTON:
+
+	case INTEL_FAM6_ATOM_GEMINI_LAKE:
+
+	case INTEL_FAM6_XEON_PHI_KNL:
+	case INTEL_FAM6_XEON_PHI_KNM:
 		if (idx == PERF_MSR_SMI)
 			return true;
 		break;
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 1a8eb550c40f..a5db63f728a2 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -85,6 +85,8 @@ EXPORT_SYMBOL_GPL(hyperv_cs);
 u32 *hv_vp_index;
 EXPORT_SYMBOL_GPL(hv_vp_index);
 
+u32 hv_max_vp_index;
+
 static int hv_cpu_init(unsigned int cpu)
 {
 	u64 msr_vp_index;
@@ -93,6 +95,9 @@ static int hv_cpu_init(unsigned int cpu)
 
 	hv_vp_index[smp_processor_id()] = msr_vp_index;
 
+	if (msr_vp_index > hv_max_vp_index)
+		hv_max_vp_index = msr_vp_index;
+
 	return 0;
 }
diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
index 39e7f6e50919..9cc9e1c1e2db 100644
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -36,9 +36,9 @@ struct hv_flush_pcpu_ex {
 /* Each gva in gva_list encodes up to 4096 pages to flush */
 #define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE)
 
-static struct hv_flush_pcpu __percpu *pcpu_flush;
+static struct hv_flush_pcpu __percpu **pcpu_flush;
 
-static struct hv_flush_pcpu_ex __percpu *pcpu_flush_ex;
+static struct hv_flush_pcpu_ex __percpu **pcpu_flush_ex;
 
 /*
  * Fills in gva_list starting from offset. Returns the number of items added.
@@ -76,6 +76,18 @@ static inline int cpumask_to_vp_set(struct hv_flush_pcpu_ex *flush,
 {
 	int cpu, vcpu, vcpu_bank, vcpu_offset, nr_bank = 1;
 
+	/* valid_bank_mask can represent up to 64 banks */
+	if (hv_max_vp_index / 64 >= 64)
+		return 0;
+
+	/*
+	 * Clear all banks up to the maximum possible bank as hv_flush_pcpu_ex
+	 * structs are not cleared between calls, we risk flushing unneeded
+	 * vCPUs otherwise.
+	 */
+	for (vcpu_bank = 0; vcpu_bank <= hv_max_vp_index / 64; vcpu_bank++)
+		flush->hv_vp_set.bank_contents[vcpu_bank] = 0;
+
 	/*
	 * Some banks may end up being empty but this is acceptable.
 	 */
@@ -83,11 +95,6 @@ static inline int cpumask_to_vp_set(struct hv_flush_pcpu_ex *flush,
 		vcpu = hv_cpu_number_to_vp_number(cpu);
 		vcpu_bank = vcpu / 64;
 		vcpu_offset = vcpu % 64;
-
-		/* valid_bank_mask can represent up to 64 banks */
-		if (vcpu_bank >= 64)
-			return 0;
-
 		__set_bit(vcpu_offset, (unsigned long *)
 			  &flush->hv_vp_set.bank_contents[vcpu_bank]);
 		if (vcpu_bank >= nr_bank)
@@ -102,6 +109,7 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
 				    const struct flush_tlb_info *info)
 {
 	int cpu, vcpu, gva_n, max_gvas;
+	struct hv_flush_pcpu **flush_pcpu;
 	struct hv_flush_pcpu *flush;
 	u64 status = U64_MAX;
 	unsigned long flags;
@@ -116,7 +124,17 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
 
 	local_irq_save(flags);
 
-	flush = this_cpu_ptr(pcpu_flush);
+	flush_pcpu = this_cpu_ptr(pcpu_flush);
+
+	if (unlikely(!*flush_pcpu))
+		*flush_pcpu = page_address(alloc_page(GFP_ATOMIC));
+
+	flush = *flush_pcpu;
+
+	if (unlikely(!flush)) {
+		local_irq_restore(flags);
+		goto do_native;
+	}
 
 	if (info->mm) {
 		flush->address_space = virt_to_phys(info->mm->pgd);
@@ -173,6 +191,7 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
 				       const struct flush_tlb_info *info)
 {
 	int nr_bank = 0, max_gvas, gva_n;
+	struct hv_flush_pcpu_ex **flush_pcpu;
 	struct hv_flush_pcpu_ex *flush;
 	u64 status = U64_MAX;
 	unsigned long flags;
@@ -187,7 +206,17 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
 
 	local_irq_save(flags);
 
-	flush = this_cpu_ptr(pcpu_flush_ex);
+	flush_pcpu = this_cpu_ptr(pcpu_flush_ex);
+
+	if (unlikely(!*flush_pcpu))
+		*flush_pcpu = page_address(alloc_page(GFP_ATOMIC));
+
+	flush = *flush_pcpu;
+
+	if (unlikely(!flush)) {
+		local_irq_restore(flags);
+		goto do_native;
+	}
 
 	if (info->mm) {
 		flush->address_space = virt_to_phys(info->mm->pgd);
@@ -222,18 +251,18 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
 		flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
 		status = hv_do_rep_hypercall(
 			HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX,
-			0, nr_bank + 2, flush, NULL);
+			0, nr_bank, flush, NULL);
 	} else if (info->end &&
 		   ((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) {
 		status = hv_do_rep_hypercall(
 			HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX,
-			0, nr_bank + 2, flush, NULL);
+			0, nr_bank, flush, NULL);
 	} else {
 		gva_n = fill_gva_list(flush->gva_list, nr_bank,
 				      info->start, info->end);
 		status = hv_do_rep_hypercall(
 			HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX,
-			gva_n, nr_bank + 2, flush, NULL);
+			gva_n, nr_bank, flush, NULL);
 	}
 
 	local_irq_restore(flags);
@@ -266,7 +295,7 @@ void hyper_alloc_mmu(void)
 		return;
 
 	if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
-		pcpu_flush = __alloc_percpu(PAGE_SIZE, PAGE_SIZE);
+		pcpu_flush = alloc_percpu(struct hv_flush_pcpu *);
 	else
-		pcpu_flush_ex = __alloc_percpu(PAGE_SIZE, PAGE_SIZE);
+		pcpu_flush_ex = alloc_percpu(struct hv_flush_pcpu_ex *);
 }
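[Annotation: the bank arithmetic used by cpumask_to_vp_set() above, as a standalone sketch with illustrative names: VP n occupies bit n % 64 of bank n / 64, valid_bank_mask can describe at most 64 banks (4096 VPs), and banks up to the highest possible one must be cleared first because the per-cpu buffer is reused between calls:]

#include <stdint.h>

struct vp_set_sketch {
	uint64_t valid_bank_mask;
	uint64_t bank_contents[64];	/* 64 banks x 64 bits = 4096 VPs */
};

static int vp_set_prepare(struct vp_set_sketch *s, uint32_t max_vp)
{
	uint32_t b;

	if (max_vp / 64 >= 64)		/* representation limit, as above */
		return -1;
	for (b = 0; b <= max_vp / 64; b++)	/* wipe stale data from the last call */
		s->bank_contents[b] = 0;
	return 0;
}

static void vp_set_add(struct vp_set_sketch *s, uint32_t vp)
{
	s->valid_bank_mask |= 1ULL << (vp / 64);
	s->bank_contents[vp / 64] |= 1ULL << (vp % 64);
}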
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index e0bb46c02857..0e2a5edbce00 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -231,7 +231,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs,
 		 ksig->ka.sa.sa_restorer)
 		sp = (unsigned long) ksig->ka.sa.sa_restorer;
 
-	if (fpu->fpstate_active) {
+	if (fpu->initialized) {
 		unsigned long fx_aligned, math_size;
 
 		sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size);
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index e7636bac7372..6c98821fef5e 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -62,8 +62,10 @@
 #define new_len2		145f-144f
 
 /*
- * max without conditionals. Idea adapted from:
+ * gas compatible max based on the idea from:
  * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
+ *
+ * The additional "-" is needed because gas uses a "true" value of -1.
 */
 #define alt_max_short(a, b)	((a) ^ (((a) ^ (b)) & -(-((a) < (b)))))
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 1b020381ab38..ccbe24e697c4 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -103,12 +103,12 @@ static inline int alternatives_text_reserved(void *start, void *end)
 	alt_end_marker ":\n"
 
 /*
- * max without conditionals. Idea adapted from:
+ * gas compatible max based on the idea from:
  * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
 *
- * The additional "-" is needed because gas works with s32s.
+ * The additional "-" is needed because gas uses a "true" value of -1.
 */
-#define alt_max_short(a, b)	"((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") - (" b ")))))"
+#define alt_max_short(a, b)	"((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") < (" b ")))))"
 
 /*
 * Pad the second replacement alternative with additional NOPs if it is
@@ -218,10 +218,9 @@ static inline int alternatives_text_reserved(void *start, void *end)
 #define alternative_call_2(oldfunc, newfunc1, feature1, newfunc2, feature2,   \
 			   output, input...)				      \
 {									      \
-	register void *__sp asm(_ASM_SP);				      \
 	asm volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\
 		"call %P[new2]", feature2)				      \
-		: output, "+r" (__sp)					      \
+		: output, ASM_CALL_CONSTRAINT				      \
 		: [old] "i" (oldfunc), [new1] "i" (newfunc1),		      \
 		  [new2] "i" (newfunc2), ## input);			      \
 }
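[Annotation: the alt_max_short() comment rewrite above is worth a worked example. A C comparison yields 1 for true, so a single negation builds the all-ones mask; gas yields -1 for true, hence the extra inner minus in the assembler form:]

#include <assert.h>

/* C twin of alt_max_short(): max(a, b) without a branch. */
static unsigned int max_branchless(unsigned int a, unsigned int b)
{
	return a ^ ((a ^ b) & -(unsigned int)(a < b));
}

int main(void)
{
	assert(max_branchless(3, 7) == 7);
	assert(max_branchless(7, 3) == 7);
	assert(max_branchless(5, 5) == 5);
	return 0;
}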
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 676ee5807d86..b0dc91f4bedc 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -11,10 +11,12 @@
 # define __ASM_FORM_COMMA(x) " " #x ","
 #endif
 
-#ifdef CONFIG_X86_32
+#ifndef __x86_64__
+/* 32 bit */
 # define __ASM_SEL(a,b) __ASM_FORM(a)
 # define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a)
 #else
+/* 64 bit */
 # define __ASM_SEL(a,b) __ASM_FORM(b)
 # define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b)
 #endif
@@ -132,4 +134,15 @@
 /* For C file, we already have NOKPROBE_SYMBOL macro */
 #endif
 
+#ifndef __ASSEMBLY__
+/*
+ * This output constraint should be used for any inline asm which has a "call"
+ * instruction.  Otherwise the asm may be inserted before the frame pointer
+ * gets set up by the containing function.  If you forget to do this, objtool
+ * may print a "call without frame pointer save/setup" warning.
+ */
+register unsigned long current_stack_pointer asm(_ASM_SP);
+#define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer)
+#endif
+
 #endif /* _ASM_X86_ASM_H */
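[Annotation: a minimal usage sketch for the new constraint. The callee here is hypothetical; the real conversions appear later in this diff, in paravirt_types.h, preempt.h, uaccess.h and the hypercall wrappers:]

extern void foo(void);	/* hypothetical asm-called helper */

static inline void call_foo(void)
{
	/* The "+r" (current_stack_pointer) output makes this asm depend on
	 * the stack pointer, so the compiler cannot schedule it before the
	 * function's frame-pointer setup. */
	asm volatile("call foo" : ASM_CALL_CONSTRAINT : : "memory");
}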
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 554cdb205d17..e3221ffa304e 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -23,11 +23,9 @@
 /*
 * High level FPU state handling functions:
 */
-extern void fpu__activate_curr(struct fpu *fpu);
-extern void fpu__activate_fpstate_read(struct fpu *fpu);
-extern void fpu__activate_fpstate_write(struct fpu *fpu);
-extern void fpu__current_fpstate_write_begin(void);
-extern void fpu__current_fpstate_write_end(void);
+extern void fpu__initialize(struct fpu *fpu);
+extern void fpu__prepare_read(struct fpu *fpu);
+extern void fpu__prepare_write(struct fpu *fpu);
 extern void fpu__save(struct fpu *fpu);
 extern void fpu__restore(struct fpu *fpu);
 extern int  fpu__restore_sig(void __user *buf, int ia32_frame);
@@ -120,20 +118,11 @@ extern void fpstate_sanitize_xstate(struct fpu *fpu);
 	err;								\
 })
 
-#define check_insn(insn, output, input...)				\
-({									\
-	int err;							\
+#define kernel_insn(insn, output, input...)				\
 	asm volatile("1:" #insn "\n\t"					\
 		     "2:\n"						\
-		     ".section .fixup,\"ax\"\n"				\
-		     "3:  movl $-1,%[err]\n"				\
-		     "    jmp  2b\n"					\
-		     ".previous\n"					\
-		     _ASM_EXTABLE(1b, 3b)				\
-		     : [err] "=r" (err), output				\
-		     : "0"(0), input);					\
-	err;								\
-})
+		     _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_fprestore)	\
+		     : output : input)
 
 static inline int copy_fregs_to_user(struct fregs_state __user *fx)
 {
@@ -153,20 +142,16 @@ static inline int copy_fxregs_to_user(struct fxregs_state __user *fx)
 
 static inline void copy_kernel_to_fxregs(struct fxregs_state *fx)
 {
-	int err;
-
 	if (IS_ENABLED(CONFIG_X86_32)) {
-		err = check_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+		kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
 	} else {
 		if (IS_ENABLED(CONFIG_AS_FXSAVEQ)) {
-			err = check_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
+			kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
 		} else {
 			/* See comment in copy_fxregs_to_kernel() below. */
-			err = check_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), "m" (*fx));
+			kernel_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), "m" (*fx));
 		}
 	}
-	/* Copying from a kernel buffer to FPU registers should never fail: */
-	WARN_ON_FPU(err);
 }
 
 static inline int copy_user_to_fxregs(struct fxregs_state __user *fx)
@@ -183,9 +168,7 @@ static inline int copy_user_to_fxregs(struct fxregs_state __user *fx)
 
 static inline void copy_kernel_to_fregs(struct fregs_state *fx)
 {
-	int err = check_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
-
-	WARN_ON_FPU(err);
+	kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
 }
 
 static inline int copy_user_to_fregs(struct fregs_state __user *fx)
@@ -281,18 +264,13 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu)
 * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact
 * XSAVE area format.
 */
-#define XSTATE_XRESTORE(st, lmask, hmask, err)				\
+#define XSTATE_XRESTORE(st, lmask, hmask)				\
 	asm volatile(ALTERNATIVE(XRSTOR,				\
 				 XRSTORS, X86_FEATURE_XSAVES)		\
 		     "\n"						\
-		     "xor %[err], %[err]\n"				\
 		     "3:\n"						\
-		     ".pushsection .fixup,\"ax\"\n"			\
-		     "4: movl $-2, %[err]\n"				\
-		     "jmp 3b\n"						\
-		     ".popsection\n"					\
-		     _ASM_EXTABLE(661b, 4b)				\
-		     : [err] "=r" (err)					\
+		     _ASM_EXTABLE_HANDLE(661b, 3b, ex_handler_fprestore)\
+		     :							\
 		     : "D" (st), "m" (*st), "a" (lmask), "d" (hmask)	\
 		     : "memory")
 
@@ -336,7 +314,10 @@ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate)
 	else
 		XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
 
-	/* We should never fault when copying from a kernel buffer: */
+	/*
+	 * We should never fault when copying from a kernel buffer, and the FPU
+	 * state we set at boot time should be valid.
+	 */
 	WARN_ON_FPU(err);
 }
 
@@ -350,7 +331,7 @@ static inline void copy_xregs_to_kernel(struct xregs_state *xstate)
 	u32 hmask = mask >> 32;
 	int err;
 
-	WARN_ON(!alternatives_patched);
+	WARN_ON_FPU(!alternatives_patched);
 
 	XSTATE_XSAVE(xstate, lmask, hmask, err);
 
@@ -365,12 +346,8 @@ static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask)
 {
 	u32 lmask = mask;
 	u32 hmask = mask >> 32;
-	int err;
-
-	XSTATE_XRESTORE(xstate, lmask, hmask, err);
 
-	/* We should never fault when copying from a kernel buffer: */
-	WARN_ON_FPU(err);
+	XSTATE_XRESTORE(xstate, lmask, hmask);
 }
 
 /*
@@ -526,38 +503,17 @@ static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu)
 */
 static inline void fpregs_deactivate(struct fpu *fpu)
 {
-	WARN_ON_FPU(!fpu->fpregs_active);
-
-	fpu->fpregs_active = 0;
 	this_cpu_write(fpu_fpregs_owner_ctx, NULL);
 	trace_x86_fpu_regs_deactivated(fpu);
 }
 
 static inline void fpregs_activate(struct fpu *fpu)
 {
-	WARN_ON_FPU(fpu->fpregs_active);
-
-	fpu->fpregs_active = 1;
 	this_cpu_write(fpu_fpregs_owner_ctx, fpu);
 	trace_x86_fpu_regs_activated(fpu);
 }
 
 /*
- * The question "does this thread have fpu access?"
- * is slightly racy, since preemption could come in
- * and revoke it immediately after the test.
- *
- * However, even in that very unlikely scenario,
- * we can just assume we have FPU access - typically
- * to save the FP state - we'll just take a #NM
- * fault and get the FPU access back.
- */
-static inline int fpregs_active(void)
-{
-	return current->thread.fpu.fpregs_active;
-}
-
-/*
 * FPU state switching for scheduling.
 *
 * This is a two-stage process:
@@ -571,14 +527,13 @@ static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu)
 static inline void
 switch_fpu_prepare(struct fpu *old_fpu, int cpu)
 {
-	if (old_fpu->fpregs_active) {
+	if (old_fpu->initialized) {
 		if (!copy_fpregs_to_fpstate(old_fpu))
 			old_fpu->last_cpu = -1;
 		else
 			old_fpu->last_cpu = cpu;
 
 		/* But leave fpu_fpregs_owner_ctx! */
-		old_fpu->fpregs_active = 0;
 		trace_x86_fpu_regs_deactivated(old_fpu);
 	} else
 		old_fpu->last_cpu = -1;
@@ -595,7 +550,7 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu)
 static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu)
 {
 	bool preload = static_cpu_has(X86_FEATURE_FPU) &&
-		       new_fpu->fpstate_active;
+		       new_fpu->initialized;
 
 	if (preload) {
 		if (!fpregs_state_valid(new_fpu, cpu))
@@ -617,8 +572,7 @@ static inline void user_fpu_begin(void)
 	struct fpu *fpu = &current->thread.fpu;
 
 	preempt_disable();
-	if (!fpregs_active())
-		fpregs_activate(fpu);
+	fpregs_activate(fpu);
 	preempt_enable();
 }
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 3c80f5b9c09d..a1520575d86b 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -68,6 +68,9 @@ struct fxregs_state {
 /* Default value for fxregs_state.mxcsr: */
 #define MXCSR_DEFAULT		0x1f80
 
+/* Copy both mxcsr & mxcsr_flags with a single u64 memcpy: */
+#define MXCSR_AND_FLAGS_SIZE sizeof(u64)
+
 /*
 * Software based FPU emulation state. This is arbitrary really,
 * it matches the x87 format to make it easier to understand:
@@ -290,36 +293,13 @@ struct fpu {
 	unsigned int			last_cpu;
 
 	/*
-	 * @fpstate_active:
+	 * @initialized:
 	 *
-	 * This flag indicates whether this context is active: if the task
+	 * This flag indicates whether this context is initialized: if the task
 	 * is not running then we can restore from this context, if the task
 	 * is running then we should save into this context.
 	 */
-	unsigned char			fpstate_active;
-
-	/*
-	 * @fpregs_active:
-	 *
-	 * This flag determines whether a given context is actively
-	 * loaded into the FPU's registers and that those registers
-	 * represent the task's current FPU state.
-	 *
-	 * Note the interaction with fpstate_active:
-	 *
-	 *   # task does not use the FPU:
-	 *   fpstate_active == 0
-	 *
-	 *   # task uses the FPU and regs are active:
-	 *   fpstate_active == 1 && fpregs_active == 1
-	 *
-	 *   # the regs are inactive but still match fpstate:
-	 *   fpstate_active == 1 && fpregs_active == 0 && fpregs_owner == fpu
-	 *
-	 * The third state is what we use for the lazy restore optimization
-	 * on lazy-switching CPUs.
-	 */
-	unsigned char			fpregs_active;
+	unsigned char			initialized;
 
 	/*
	 * @state:
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 1b2799e0699a..83fee2469eb7 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -48,8 +48,12 @@ void fpu__xstate_clear_all_cpu_caps(void);
 void *get_xsave_addr(struct xregs_state *xsave, int xstate);
 const void *get_xsave_field_ptr(int xstate_field);
 int using_compacted_format(void);
-int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf,
-			void __user *ubuf, struct xregs_state *xsave);
-int copyin_to_xsaves(const void *kbuf, const void __user *ubuf,
-		     struct xregs_state *xsave);
+int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size);
+int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size);
+int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf);
+int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf);
+
+/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
+extern int validate_xstate_header(const struct xstate_header *hdr);
+
 #endif
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index bc62e7cbf1b1..59ad3d132353 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -88,7 +88,7 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
 bool kvm_para_available(void);
 unsigned int kvm_arch_para_features(void);
 void __init kvm_guest_init(void);
-void kvm_async_pf_task_wait(u32 token);
+void kvm_async_pf_task_wait(u32 token, int interrupt_kernel);
 void kvm_async_pf_task_wake(u32 token);
 u32 kvm_read_and_reset_pf_reason(void);
 extern void kvm_disable_steal_time(void);
@@ -103,7 +103,7 @@ static inline void kvm_spinlock_init(void)
 #else /* CONFIG_KVM_GUEST */
 #define kvm_guest_init() do {} while (0)
-#define kvm_async_pf_task_wait(T) do {} while(0)
+#define kvm_async_pf_task_wait(T, I) do {} while(0)
 #define kvm_async_pf_task_wake(T) do {} while(0)
 
 static inline bool kvm_para_available(void)
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 181264989db5..8edac1de2e35 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -187,7 +187,6 @@ struct mca_msr_regs {
 
 extern struct mce_vendor_flags mce_flags;
 
-extern struct mca_config mca_cfg;
 extern struct mca_msr_regs msr_ops;
 
 enum mce_notifier_prios {
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 7ae318c340d9..3c856a15b98e 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -126,13 +126,7 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
 	DEBUG_LOCKS_WARN_ON(preemptible());
 }
 
-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-	int cpu = smp_processor_id();
-
-	if (cpumask_test_cpu(cpu, mm_cpumask(mm)))
-		cpumask_clear_cpu(cpu, mm_cpumask(mm));
-}
+void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
 
 static inline int init_new_context(struct task_struct *tsk,
 				   struct mm_struct *mm)
@@ -286,6 +280,32 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
 	return __pkru_allows_pkey(vma_pkey(vma), write);
 }
 
+/*
+ * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID
+ * bits.  This serves two purposes.  It prevents a nasty situation in
+ * which PCID-unaware code saves CR3, loads some other value (with PCID
+ * == 0), and then restores CR3, thus corrupting the TLB for ASID 0 if
+ * the saved ASID was nonzero.  It also means that any bugs involving
+ * loading a PCID-enabled CR3 with CR4.PCIDE off will trigger
+ * deterministically.
+ */
+
+static inline unsigned long build_cr3(struct mm_struct *mm, u16 asid)
+{
+	if (static_cpu_has(X86_FEATURE_PCID)) {
+		VM_WARN_ON_ONCE(asid > 4094);
+		return __sme_pa(mm->pgd) | (asid + 1);
+	} else {
+		VM_WARN_ON_ONCE(asid != 0);
+		return __sme_pa(mm->pgd);
+	}
+}
+
+static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid)
+{
+	VM_WARN_ON_ONCE(asid > 4094);
+	return __sme_pa(mm->pgd) | (asid + 1) | CR3_NOFLUSH;
+}
 
 /*
 * This can be used from process context to figure out what the value of
@@ -296,10 +316,8 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
 */
 static inline unsigned long __get_current_cr3_fast(void)
 {
-	unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd);
-
-	if (static_cpu_has(X86_FEATURE_PCID))
-		cr3 |= this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+	unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm),
+		this_cpu_read(cpu_tlbstate.loaded_mm_asid));
 
 	/* For now, be very restrictive about when this can be called. */
 	VM_WARN_ON(in_nmi() || preemptible());
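[Annotation: a worked example of the CR3 value build_cr3() produces, using made-up numbers: with the page-table root at physical 0x1234000 and ASID 0, the PCID field (low 12 bits) becomes 1, and build_cr3_noflush() additionally sets bit 63:]

#include <stdint.h>
#include <stdio.h>

#define CR3_NOFLUSH (1ULL << 63)

static uint64_t sketch_build_cr3(uint64_t pgd_pa, uint16_t asid, int noflush)
{
	uint64_t cr3 = pgd_pa | (uint64_t)(asid + 1);	/* ASID 0 -> PCID 1 */

	return noflush ? (cr3 | CR3_NOFLUSH) : cr3;
}

int main(void)
{
	/* prints 0x1234001: PCID-unaware code that reloads this value with a
	 * zero PCID field can no longer silently corrupt ASID 0's entries. */
	printf("%#llx\n", (unsigned long long)sketch_build_cr3(0x1234000, 0, 0));
	return 0;
}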
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 63cc96f064dc..530f448fddaf 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -179,7 +179,6 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
 	u64 input_address = input ? virt_to_phys(input) : 0;
 	u64 output_address = output ? virt_to_phys(output) : 0;
 	u64 hv_status;
-	register void *__sp asm(_ASM_SP);
 
 #ifdef CONFIG_X86_64
 	if (!hv_hypercall_pg)
@@ -187,7 +186,7 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
 
 	__asm__ __volatile__("mov %4, %%r8\n"
 			     "call *%5"
-			     : "=a" (hv_status), "+r" (__sp),
+			     : "=a" (hv_status), ASM_CALL_CONSTRAINT,
 			       "+c" (control), "+d" (input_address)
 			     :  "r" (output_address), "m" (hv_hypercall_pg)
 			     : "cc", "memory", "r8", "r9", "r10", "r11");
@@ -202,7 +201,7 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
 
 	__asm__ __volatile__("call *%7"
 			     : "=A" (hv_status),
-			       "+c" (input_address_lo), "+r" (__sp)
+			       "+c" (input_address_lo), ASM_CALL_CONSTRAINT
 			     : "A" (control),
 			       "b" (input_address_hi),
 			       "D"(output_address_hi), "S"(output_address_lo),
@@ -224,12 +223,11 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
 static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
 {
 	u64 hv_status, control = (u64)code | HV_HYPERCALL_FAST_BIT;
-	register void *__sp asm(_ASM_SP);
 
 #ifdef CONFIG_X86_64
 	{
 		__asm__ __volatile__("call *%4"
-				     : "=a" (hv_status), "+r" (__sp),
+				     : "=a" (hv_status), ASM_CALL_CONSTRAINT,
 				       "+c" (control), "+d" (input1)
 				     : "m" (hv_hypercall_pg)
 				     : "cc", "r8", "r9", "r10", "r11");
@@ -242,7 +240,7 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
 
 		__asm__ __volatile__ ("call *%5"
 				      : "=A"(hv_status),
-					"+c"(input1_lo), "+r"(__sp)
+					"+c"(input1_lo), ASM_CALL_CONSTRAINT
 				      :	"A" (control),
 					"b" (input1_hi),
 					"m" (hv_hypercall_pg)
@@ -291,6 +289,7 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
 * to this information.
 */
 extern u32 *hv_vp_index;
+extern u32 hv_max_vp_index;
 
 /**
 * hv_cpu_number_to_vp_number() - Map CPU to VP.
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 42873edd9f9d..280d94c36dad 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -459,8 +459,8 @@ int paravirt_disable_iospace(void);
 */
 #ifdef CONFIG_X86_32
 #define PVOP_VCALL_ARGS							\
-	unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx;	\
-	register void *__sp asm("esp")
+	unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx;
+
 #define PVOP_CALL_ARGS			PVOP_VCALL_ARGS
 
 #define PVOP_CALL_ARG1(x)		"a" ((unsigned long)(x))
@@ -480,8 +480,8 @@ int paravirt_disable_iospace(void);
 /* [re]ax isn't an arg, but the return val */
 #define PVOP_VCALL_ARGS						\
 	unsigned long __edi = __edi, __esi = __esi,		\
-		__edx = __edx, __ecx = __ecx, __eax = __eax;	\
-	register void *__sp asm("rsp")
+		__edx = __edx, __ecx = __ecx, __eax = __eax;
+
 #define PVOP_CALL_ARGS		PVOP_VCALL_ARGS
 
 #define PVOP_CALL_ARG1(x)		"D" ((unsigned long)(x))
@@ -532,7 +532,7 @@ int paravirt_disable_iospace(void);
 	asm volatile(pre					\
 		     paravirt_alt(PARAVIRT_CALL)		\
 		     post					\
-		     : call_clbr, "+r" (__sp)			\
+		     : call_clbr, ASM_CALL_CONSTRAINT		\
 		     : paravirt_type(op),			\
 		       paravirt_clobber(clbr),			\
 		       ##__VA_ARGS__				\
@@ -542,7 +542,7 @@ int paravirt_disable_iospace(void);
 	asm volatile(pre					\
 		     paravirt_alt(PARAVIRT_CALL)		\
 		     post					\
-		     : call_clbr, "+r" (__sp)			\
+		     : call_clbr, ASM_CALL_CONSTRAINT		\
 		     : paravirt_type(op),			\
 		       paravirt_clobber(clbr),			\
 		       ##__VA_ARGS__				\
@@ -569,7 +569,7 @@ int paravirt_disable_iospace(void);
 	asm volatile(pre					\
 		     paravirt_alt(PARAVIRT_CALL)		\
 		     post					\
-		     : call_clbr, "+r" (__sp)			\
+		     : call_clbr, ASM_CALL_CONSTRAINT		\
 		     : paravirt_type(op),			\
 		       paravirt_clobber(clbr),			\
 		       ##__VA_ARGS__				\
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index ec1f3c651150..4f44505dbf87 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -100,19 +100,14 @@ static __always_inline bool should_resched(int preempt_offset)
 
 #ifdef CONFIG_PREEMPT
 extern asmlinkage void ___preempt_schedule(void);
-# define __preempt_schedule()					\
-({								\
-	register void *__sp asm(_ASM_SP);			\
-	asm volatile ("call ___preempt_schedule" : "+r"(__sp));	\
-})
+# define __preempt_schedule() \
+	asm volatile ("call ___preempt_schedule" : ASM_CALL_CONSTRAINT)
 
 extern asmlinkage void preempt_schedule(void);
 extern asmlinkage void ___preempt_schedule_notrace(void);
-# define __preempt_schedule_notrace()				\
-({								\
-	register void *__sp asm(_ASM_SP);			\
-	asm volatile ("call ___preempt_schedule_notrace" : "+r"(__sp)); \
-})
+# define __preempt_schedule_notrace() \
+	asm volatile ("call ___preempt_schedule_notrace" : ASM_CALL_CONSTRAINT)
+
 extern asmlinkage void preempt_schedule_notrace(void);
 #endif
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 3fa26a61eabc..b390ff76e58f 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -677,8 +677,6 @@ static inline void sync_core(void)
 	 * Like all of Linux's memory ordering operations, this is a
 	 * compiler barrier as well.
 	 */
-	register void *__sp asm(_ASM_SP);
-
 #ifdef CONFIG_X86_32
 	asm volatile (
 		"pushfl\n\t"
@@ -686,7 +684,7 @@ static inline void sync_core(void)
 		"pushl $1f\n\t"
 		"iret\n\t"
 		"1:"
-		: "+r" (__sp) : : "memory");
+		: ASM_CALL_CONSTRAINT : : "memory");
 #else
 	unsigned int tmp;
 
@@ -703,7 +701,7 @@ static inline void sync_core(void)
 		"iretq\n\t"
 		UNWIND_HINT_RESTORE
 		"1:"
-		: "=&r" (tmp), "+r" (__sp) : : "cc", "memory");
+		: "=&r" (tmp), ASM_CALL_CONSTRAINT : : "cc", "memory");
 #endif
 }
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index a34e0d4b957d..7116b7931c7b 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -103,7 +103,6 @@ static inline bool __down_read_trylock(struct rw_semaphore *sem)
 ({							\
 	long tmp;					\
 	struct rw_semaphore* ret;			\
-	register void *__sp asm(_ASM_SP);		\
 							\
 	asm volatile("# beginning down_write\n\t"	\
 		     LOCK_PREFIX "  xadd      %1,(%4)\n\t"	\
@@ -114,7 +113,8 @@ static inline bool __down_read_trylock(struct rw_semaphore *sem)
 		     "  call " slow_path "\n"		\
 		     "1:\n"				\
 		     "# ending down_write"		\
-		     : "+m" (sem->count), "=d" (tmp), "=a" (ret), "+r" (__sp) \
+		     : "+m" (sem->count), "=d" (tmp),	\
+		       "=a" (ret), ASM_CALL_CONSTRAINT	\
 		     : "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS) \
 		     : "memory", "cc");			\
 	ret;						\
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 5161da1a0fa0..89e7eeb5cec1 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -158,17 +158,6 @@ struct thread_info {
 */
 #ifndef __ASSEMBLY__
 
-static inline unsigned long current_stack_pointer(void)
-{
-	unsigned long sp;
-#ifdef CONFIG_X86_64
-	asm("mov %%rsp,%0" : "=g" (sp));
-#else
-	asm("mov %%esp,%0" : "=g" (sp));
-#endif
-	return sp;
-}
-
 /*
 * Walks up the stack frames to make sure that the specified object is
 * entirely contained by a single stack frame.
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 4893abf7f74f..d362161d3291 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -83,6 +83,13 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 #endif
 
 /*
+ * If tlb_use_lazy_mode is true, then we try to avoid switching CR3 to point
+ * to init_mm when we switch to a kernel thread (e.g. the idle thread).  If
+ * it's false, then we immediately switch CR3 when entering a kernel thread.
+ */
+DECLARE_STATIC_KEY_TRUE(tlb_use_lazy_mode);
+
+/*
 * 6 because 6 should be plenty and struct tlb_state will fit in
 * two cache lines.
 */
@@ -105,6 +112,23 @@ struct tlb_state {
 	u16 next_asid;
 
 	/*
+	 * We can be in one of several states:
+	 *
+	 *  - Actively using an mm.  Our CPU's bit will be set in
+	 *    mm_cpumask(loaded_mm) and is_lazy == false;
+	 *
+	 *  - Not using a real mm.  loaded_mm == &init_mm.  Our CPU's bit
+	 *    will not be set in mm_cpumask(&init_mm) and is_lazy == false.
+	 *
+	 *  - Lazily using a real mm.  loaded_mm != &init_mm, our bit
+	 *    is set in mm_cpumask(loaded_mm), but is_lazy == true.
+	 *    We're heuristically guessing that the CR3 load we
+	 *    skipped more than makes up for the overhead added by
+	 *    lazy mode.
+	 */
+	bool is_lazy;
+
+	/*
	 * Access to this CR4 shadow and to H/W CR4 is protected by
	 * disabling interrupts when modifying either one.
	 */
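[Annotation: the three situations enumerated in the tlb_state comment above, condensed into a tiny classifier. An illustrative model only, not kernel code:]

#include <stdbool.h>

struct mm;				/* opaque */
extern struct mm init_mm_sketch;	/* stand-in for init_mm */

enum tlb_mode { TLB_ACTIVE_MM, TLB_KERNEL_MM, TLB_LAZY_MM };

static enum tlb_mode classify(const struct mm *loaded_mm, bool is_lazy)
{
	if (loaded_mm == &init_mm_sketch)
		return TLB_KERNEL_MM;		/* not using a real mm */
	return is_lazy ? TLB_LAZY_MM		/* CR3 load skipped, bit still set */
		       : TLB_ACTIVE_MM;		/* bit set in mm_cpumask, not lazy */
}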
*/ diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h index 342e59789fcd..39f7a27bef13 100644 --- a/arch/x86/include/asm/trace/fpu.h +++ b/arch/x86/include/asm/trace/fpu.h @@ -12,25 +12,22 @@ DECLARE_EVENT_CLASS(x86_fpu, TP_STRUCT__entry( __field(struct fpu *, fpu) - __field(bool, fpregs_active) - __field(bool, fpstate_active) + __field(bool, initialized) __field(u64, xfeatures) __field(u64, xcomp_bv) ), TP_fast_assign( __entry->fpu = fpu; - __entry->fpregs_active = fpu->fpregs_active; - __entry->fpstate_active = fpu->fpstate_active; + __entry->initialized = fpu->initialized; if (boot_cpu_has(X86_FEATURE_OSXSAVE)) { __entry->xfeatures = fpu->state.xsave.header.xfeatures; __entry->xcomp_bv = fpu->state.xsave.header.xcomp_bv; } ), - TP_printk("x86/fpu: %p fpregs_active: %d fpstate_active: %d xfeatures: %llx xcomp_bv: %llx", + TP_printk("x86/fpu: %p initialized: %d xfeatures: %llx xcomp_bv: %llx", __entry->fpu, - __entry->fpregs_active, - __entry->fpstate_active, + __entry->initialized, __entry->xfeatures, __entry->xcomp_bv ) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 184eb9894dae..4b892917edeb 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -166,11 +166,11 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) ({ \ int __ret_gu; \ register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \ - register void *__sp asm(_ASM_SP); \ __chk_user_ptr(ptr); \ might_fault(); \ asm volatile("call __get_user_%P4" \ - : "=a" (__ret_gu), "=r" (__val_gu), "+r" (__sp) \ + : "=a" (__ret_gu), "=r" (__val_gu), \ + ASM_CALL_CONSTRAINT \ : "0" (ptr), "i" (sizeof(*(ptr)))); \ (x) = (__force __typeof__(*(ptr))) __val_gu; \ __builtin_expect(__ret_gu, 0); \ @@ -337,7 +337,7 @@ do { \ _ASM_EXTABLE(1b, 4b) \ _ASM_EXTABLE(2b, 4b) \ : "=r" (retval), "=&A"(x) \ - : "m" (__m(__ptr)), "m" __m(((u32 *)(__ptr)) + 1), \ + : "m" (__m(__ptr)), "m" __m(((u32 __user *)(__ptr)) + 1), \ "i" (errret), "0" (retval)); \ }) diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 9606688caa4b..7cb282e9e587 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -113,10 +113,9 @@ extern struct { char _entry[32]; } hypercall_page[]; register unsigned long __arg2 asm(__HYPERCALL_ARG2REG) = __arg2; \ register unsigned long __arg3 asm(__HYPERCALL_ARG3REG) = __arg3; \ register unsigned long __arg4 asm(__HYPERCALL_ARG4REG) = __arg4; \ - register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5; \ - register void *__sp asm(_ASM_SP); + register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5; -#define __HYPERCALL_0PARAM "=r" (__res), "+r" (__sp) +#define __HYPERCALL_0PARAM "=r" (__res), ASM_CALL_CONSTRAINT #define __HYPERCALL_1PARAM __HYPERCALL_0PARAM, "+r" (__arg1) #define __HYPERCALL_2PARAM __HYPERCALL_1PARAM, "+r" (__arg2) #define __HYPERCALL_3PARAM __HYPERCALL_2PARAM, "+r" (__arg3) @@ -552,13 +551,13 @@ static inline void MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, struct desc_struct desc) { - u32 *p = (u32 *) &desc; - mcl->op = __HYPERVISOR_update_descriptor; if (sizeof(maddr) == sizeof(long)) { mcl->args[0] = maddr; mcl->args[1] = *(unsigned long *)&desc; } else { + u32 *p = (u32 *)&desc; + mcl->args[0] = maddr; mcl->args[1] = maddr >> 32; mcl->args[2] = *p++; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index d705c769f77d..ff891772c9f8 100644 --- a/arch/x86/kernel/apic/apic.c +++ 
b/arch/x86/kernel/apic/apic.c @@ -573,11 +573,21 @@ static u32 bdx_deadline_rev(void) return ~0U; } +static u32 skx_deadline_rev(void) +{ + switch (boot_cpu_data.x86_mask) { + case 0x03: return 0x01000136; + case 0x04: return 0x02000014; + } + + return ~0U; +} + static const struct x86_cpu_id deadline_match[] = { DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_HASWELL_X, hsx_deadline_rev), DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_X, 0x0b000020), DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_XEON_D, bdx_deadline_rev), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_X, 0x02000014), + DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_SKYLAKE_X, skx_deadline_rev), DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_CORE, 0x22), DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_ULT, 0x20), @@ -600,7 +610,8 @@ static void apic_check_deadline_errata(void) const struct x86_cpu_id *m; u32 rev; - if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER) || + boot_cpu_has(X86_FEATURE_HYPERVISOR)) return; m = x86_match_cpu(deadline_match); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 9862e2cd6d93..d58184b7cd44 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -763,6 +763,16 @@ static void init_amd_bd(struct cpuinfo_x86 *c) } } +static void init_amd_zn(struct cpuinfo_x86 *c) +{ + /* + * Fix erratum 1076: CPB feature bit not being set in CPUID. It affects + * all up to and including B1. + */ + if (c->x86_model <= 1 && c->x86_mask <= 1) + set_cpu_cap(c, X86_FEATURE_CPB); +} + static void init_amd(struct cpuinfo_x86 *c) { early_init_amd(c); @@ -791,6 +801,7 @@ static void init_amd(struct cpuinfo_x86 *c) case 0x10: init_amd_gh(c); break; case 0x12: init_amd_ln(c); break; case 0x15: init_amd_bd(c); break; + case 0x17: init_amd_zn(c); break; } /* Enable workaround for FXSAVE leak */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index db684880d74a..0af86d9242da 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -21,14 +21,6 @@ void __init check_bugs(void) { -#ifdef CONFIG_X86_32 - /* - * Regardless of whether PCID is enumerated, the SDM says - * that it can't be enabled in 32-bit mode. - */ - setup_clear_cpu_cap(X86_FEATURE_PCID); -#endif - identify_boot_cpu(); if (!IS_ENABLED(CONFIG_SMP)) { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 775f10100d7f..c9176bae7fd8 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -904,6 +904,14 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_force_cpu_cap(X86_FEATURE_ALWAYS); fpu__init_system(c); + +#ifdef CONFIG_X86_32 + /* + * Regardless of whether PCID is enumerated, the SDM says + * that it can't be enabled in 32-bit mode. 
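The Skylake-X entry is converted from a flat revision to a callback because the required microcode level differs per stepping, as skx_deadline_rev() shows. The table itself is the generic x86_cpu_id mechanism; a hedged usage sketch (entry layout abbreviated, without the DEADLINE_* wrapper macros used above):

    static const struct x86_cpu_id example_match[] = {
            /* vendor, family, model, feature, driver_data */
            { X86_VENDOR_INTEL, 6, INTEL_FAM6_SKYLAKE_X, X86_FEATURE_ANY, 0 },
            {}      /* zero terminator */
    };

    const struct x86_cpu_id *m = x86_match_cpu(example_match);
    if (m)
            ;       /* boot CPU matched; payload is in m->driver_data */
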
+ */ + setup_clear_cpu_cap(X86_FEATURE_PCID); +#endif } void __init early_cpu_init(void) diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 098530a93bb7..debb974fd17d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -1,3 +1,6 @@ +#ifndef __X86_MCE_INTERNAL_H__ +#define __X86_MCE_INTERNAL_H__ + #include <linux/device.h> #include <asm/mce.h> @@ -108,3 +111,7 @@ static inline void mce_work_trigger(void) { } static inline void mce_register_injector_chain(struct notifier_block *nb) { } static inline void mce_unregister_injector_chain(struct notifier_block *nb) { } #endif + +extern struct mca_config mca_cfg; + +#endif /* __X86_MCE_INTERNAL_H__ */ diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 40e28ed77fbf..486f640b02ef 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -28,6 +28,8 @@ #include <asm/msr.h> #include <asm/trace/irq_vectors.h> +#include "mce-internal.h" + #define NR_BLOCKS 5 #define THRESHOLD_MAX 0xFFF #define INT_TYPE_APIC 0x00020000 diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 86e8f0b2537b..c4fa4a85d4cb 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -122,9 +122,6 @@ static bool __init check_loader_disabled_bsp(void) bool *res = &dis_ucode_ldr; #endif - if (!have_cpuid_p()) - return *res; - /* * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not * completely accurate as xen pv guests don't see that CPUID bit set but @@ -166,24 +163,36 @@ bool get_builtin_firmware(struct cpio_data *cd, const char *name) void __init load_ucode_bsp(void) { unsigned int cpuid_1_eax; + bool intel = true; - if (check_loader_disabled_bsp()) + if (!have_cpuid_p()) return; cpuid_1_eax = native_cpuid_eax(1); switch (x86_cpuid_vendor()) { case X86_VENDOR_INTEL: - if (x86_family(cpuid_1_eax) >= 6) - load_ucode_intel_bsp(); + if (x86_family(cpuid_1_eax) < 6) + return; break; + case X86_VENDOR_AMD: - if (x86_family(cpuid_1_eax) >= 0x10) - load_ucode_amd_bsp(cpuid_1_eax); + if (x86_family(cpuid_1_eax) < 0x10) + return; + intel = false; break; + default: - break; + return; } + + if (check_loader_disabled_bsp()) + return; + + if (intel) + load_ucode_intel_bsp(); + else + load_ucode_amd_bsp(cpuid_1_eax); } static bool check_loader_disabled_ap(void) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index e1114f070c2d..f92a6593de1e 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -100,7 +100,7 @@ void __kernel_fpu_begin(void) kernel_fpu_disable(); - if (fpu->fpregs_active) { + if (fpu->initialized) { /* * Ignore return value -- we don't care if reg state * is clobbered. 
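__kernel_fpu_begin() here, and __kernel_fpu_end() in the next hunk, now key off the single fpu->initialized flag instead of fpregs_active: the user's register image is saved and restored only if the task ever had FPU state. The caller-side contract is unchanged; a short usage sketch:

    #include <asm/fpu/api.h>

    static void example_simd_section(void)
    {
            kernel_fpu_begin();     /* preemption off; user FPU state saved
                                     * iff current->thread.fpu.initialized */
            /* ... SSE/AVX-using code goes here ... */
            kernel_fpu_end();       /* user FPU state restored if needed */
    }
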
@@ -116,7 +116,7 @@ void __kernel_fpu_end(void) { struct fpu *fpu = &current->thread.fpu; - if (fpu->fpregs_active) + if (fpu->initialized) copy_kernel_to_fpregs(&fpu->state); kernel_fpu_enable(); @@ -148,7 +148,7 @@ void fpu__save(struct fpu *fpu) preempt_disable(); trace_x86_fpu_before_save(fpu); - if (fpu->fpregs_active) { + if (fpu->initialized) { if (!copy_fpregs_to_fpstate(fpu)) { copy_kernel_to_fpregs(&fpu->state); } @@ -189,10 +189,9 @@ EXPORT_SYMBOL_GPL(fpstate_init); int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) { - dst_fpu->fpregs_active = 0; dst_fpu->last_cpu = -1; - if (!src_fpu->fpstate_active || !static_cpu_has(X86_FEATURE_FPU)) + if (!src_fpu->initialized || !static_cpu_has(X86_FEATURE_FPU)) return 0; WARN_ON_FPU(src_fpu != &current->thread.fpu); @@ -206,26 +205,14 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) /* * Save current FPU registers directly into the child * FPU context, without any memory-to-memory copying. - * In lazy mode, if the FPU context isn't loaded into - * fpregs, CR0.TS will be set and do_device_not_available - * will load the FPU context. * - * We have to do all this with preemption disabled, - * mostly because of the FNSAVE case, because in that - * case we must not allow preemption in the window - * between the FNSAVE and us marking the context lazy. - * - * It shouldn't be an issue as even FNSAVE is plenty - * fast in terms of critical section length. + * ( The function 'fails' in the FNSAVE case, which destroys + * register contents so we have to copy them back. ) */ - preempt_disable(); if (!copy_fpregs_to_fpstate(dst_fpu)) { - memcpy(&src_fpu->state, &dst_fpu->state, - fpu_kernel_xstate_size); - + memcpy(&src_fpu->state, &dst_fpu->state, fpu_kernel_xstate_size); copy_kernel_to_fpregs(&src_fpu->state); } - preempt_enable(); trace_x86_fpu_copy_src(src_fpu); trace_x86_fpu_copy_dst(dst_fpu); @@ -237,45 +224,48 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) * Activate the current task's in-memory FPU context, * if it has not been used before: */ -void fpu__activate_curr(struct fpu *fpu) +void fpu__initialize(struct fpu *fpu) { WARN_ON_FPU(fpu != &current->thread.fpu); - if (!fpu->fpstate_active) { + if (!fpu->initialized) { fpstate_init(&fpu->state); trace_x86_fpu_init_state(fpu); trace_x86_fpu_activate_state(fpu); /* Safe to do for the current task: */ - fpu->fpstate_active = 1; + fpu->initialized = 1; } } -EXPORT_SYMBOL_GPL(fpu__activate_curr); +EXPORT_SYMBOL_GPL(fpu__initialize); /* * This function must be called before we read a task's fpstate. * - * If the task has not used the FPU before then initialize its - * fpstate. + * There's two cases where this gets called: + * + * - for the current task (when coredumping), in which case we have + * to save the latest FPU registers into the fpstate, + * + * - or it's called for stopped tasks (ptrace), in which case the + * registers were already saved by the context-switch code when + * the task scheduled out - we only have to initialize the registers + * if they've never been initialized. * * If the task has used the FPU before then save it. 
*/ -void fpu__activate_fpstate_read(struct fpu *fpu) +void fpu__prepare_read(struct fpu *fpu) { - /* - * If fpregs are active (in the current CPU), then - * copy them to the fpstate: - */ - if (fpu->fpregs_active) { + if (fpu == &current->thread.fpu) { fpu__save(fpu); } else { - if (!fpu->fpstate_active) { + if (!fpu->initialized) { fpstate_init(&fpu->state); trace_x86_fpu_init_state(fpu); trace_x86_fpu_activate_state(fpu); /* Safe to do for current and for stopped child tasks: */ - fpu->fpstate_active = 1; + fpu->initialized = 1; } } } @@ -283,17 +273,17 @@ void fpu__activate_fpstate_read(struct fpu *fpu) /* * This function must be called before we write a task's fpstate. * - * If the task has used the FPU before then unlazy it. + * If the task has used the FPU before then invalidate any cached FPU registers. * If the task has not used the FPU before then initialize its fpstate. * * After this function call, after registers in the fpstate are * modified and the child task has woken up, the child task will * restore the modified FPU state from the modified context. If we - * didn't clear its lazy status here then the lazy in-registers + * didn't clear its cached status here then the cached in-registers * state pending on its former CPU could be restored, corrupting * the modifications. */ -void fpu__activate_fpstate_write(struct fpu *fpu) +void fpu__prepare_write(struct fpu *fpu) { /* * Only stopped child tasks can be used to modify the FPU @@ -301,8 +291,8 @@ */ WARN_ON_FPU(fpu == &current->thread.fpu); - if (fpu->fpstate_active) { - /* Invalidate any lazy state: */ + if (fpu->initialized) { + /* Invalidate any cached state: */ __fpu_invalidate_fpregs_state(fpu); } else { fpstate_init(&fpu->state); @@ -310,74 +300,11 @@ trace_x86_fpu_activate_state(fpu); /* Safe to do for stopped child tasks: */ - fpu->fpstate_active = 1; + fpu->initialized = 1; } } /* - * This function must be called before we write the current - * task's fpstate. - * - * This call gets the current FPU register state and moves - * it in to the 'fpstate'. Preemption is disabled so that - * no writes to the 'fpstate' can occur from context - * swiches. - * - * Must be followed by a fpu__current_fpstate_write_end(). - */ -void fpu__current_fpstate_write_begin(void) -{ - struct fpu *fpu = &current->thread.fpu; - - /* - * Ensure that the context-switching code does not write - * over the fpstate while we are doing our update. - */ - preempt_disable(); - - /* - * Move the fpregs in to the fpu's 'fpstate'. - */ - fpu__activate_fpstate_read(fpu); - - /* - * The caller is about to write to 'fpu'. Ensure that no - * CPU thinks that its fpregs match the fpstate. This - * ensures we will not be lazy and skip a XRSTOR in the - * future. - */ - __fpu_invalidate_fpregs_state(fpu); -} - -/* - * This function must be paired with fpu__current_fpstate_write_begin() - * - * This will ensure that the modified fpstate gets placed back in - * the fpregs if necessary. - * - * Note: This function may be called whether or not an _actual_ - * write to the fpstate occurred. - */ -void fpu__current_fpstate_write_end(void) -{ - struct fpu *fpu = &current->thread.fpu; - - /* - * 'fpu' now has an updated copy of the state, but the - * registers may still be out of date. Update them with - * an XRSTOR if they are active. - */ - if (fpregs_active()) - copy_kernel_to_fpregs(&fpu->state); - - /* - * Our update is done and the fpregs/fpstate are in sync - * if necessary. 
Context switches can happen again. - */ - preempt_enable(); -} - -/* * 'fpu__restore()' is called to copy FPU registers from * the FPU fpstate to the live hw registers and to activate * access to the hardware registers, so that FPU instructions @@ -389,7 +316,7 @@ void fpu__current_fpstate_write_end(void) */ void fpu__restore(struct fpu *fpu) { - fpu__activate_curr(fpu); + fpu__initialize(fpu); /* Avoid __kernel_fpu_begin() right after fpregs_activate() */ kernel_fpu_disable(); @@ -414,15 +341,17 @@ void fpu__drop(struct fpu *fpu) { preempt_disable(); - if (fpu->fpregs_active) { - /* Ignore delayed exceptions from user space */ - asm volatile("1: fwait\n" - "2:\n" - _ASM_EXTABLE(1b, 2b)); - fpregs_deactivate(fpu); + if (fpu == &current->thread.fpu) { + if (fpu->initialized) { + /* Ignore delayed exceptions from user space */ + asm volatile("1: fwait\n" + "2:\n" + _ASM_EXTABLE(1b, 2b)); + fpregs_deactivate(fpu); + } } - fpu->fpstate_active = 0; + fpu->initialized = 0; trace_x86_fpu_dropped(fpu); @@ -462,9 +391,11 @@ void fpu__clear(struct fpu *fpu) * Make sure fpstate is cleared and initialized. */ if (static_cpu_has(X86_FEATURE_FPU)) { - fpu__activate_curr(fpu); + preempt_disable(); + fpu__initialize(fpu); user_fpu_begin(); copy_init_fpstate_to_fpregs(); + preempt_enable(); } } diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index d5d44c452624..7affb7e3d9a5 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -240,7 +240,7 @@ static void __init fpu__init_system_ctx_switch(void) WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; - WARN_ON_FPU(current->thread.fpu.fpstate_active); + WARN_ON_FPU(current->thread.fpu.initialized); } /* diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index b188b16841e3..3ea151372389 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -16,14 +16,14 @@ int regset_fpregs_active(struct task_struct *target, const struct user_regset *r { struct fpu *target_fpu = &target->thread.fpu; - return target_fpu->fpstate_active ? regset->n : 0; + return target_fpu->initialized ? 
regset->n : 0; } int regset_xregset_fpregs_active(struct task_struct *target, const struct user_regset *regset) { struct fpu *target_fpu = &target->thread.fpu; - if (boot_cpu_has(X86_FEATURE_FXSR) && target_fpu->fpstate_active) + if (boot_cpu_has(X86_FEATURE_FXSR) && target_fpu->initialized) return regset->n; else return 0; @@ -38,7 +38,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, if (!boot_cpu_has(X86_FEATURE_FXSR)) return -ENODEV; - fpu__activate_fpstate_read(fpu); + fpu__prepare_read(fpu); fpstate_sanitize_xstate(fpu); return user_regset_copyout(&pos, &count, &kbuf, &ubuf, @@ -55,7 +55,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, if (!boot_cpu_has(X86_FEATURE_FXSR)) return -ENODEV; - fpu__activate_fpstate_write(fpu); + fpu__prepare_write(fpu); fpstate_sanitize_xstate(fpu); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, @@ -89,10 +89,13 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, xsave = &fpu->state.xsave; - fpu__activate_fpstate_read(fpu); + fpu__prepare_read(fpu); if (using_compacted_format()) { - ret = copyout_from_xsaves(pos, count, kbuf, ubuf, xsave); + if (kbuf) + ret = copy_xstate_to_kernel(kbuf, xsave, pos, count); + else + ret = copy_xstate_to_user(ubuf, xsave, pos, count); } else { fpstate_sanitize_xstate(fpu); /* @@ -129,28 +132,29 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, xsave = &fpu->state.xsave; - fpu__activate_fpstate_write(fpu); + fpu__prepare_write(fpu); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - ret = copyin_to_xsaves(kbuf, ubuf, xsave); - else + if (using_compacted_format()) { + if (kbuf) + ret = copy_kernel_to_xstate(xsave, kbuf); + else + ret = copy_user_to_xstate(xsave, ubuf); + } else { ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); - - /* - * In case of failure, mark all states as init: - */ - if (ret) - fpstate_init(&fpu->state); + if (!ret) + ret = validate_xstate_header(&xsave->header); + } /* * mxcsr reserved bits must be masked to zero for security reasons. */ xsave->i387.mxcsr &= mxcsr_feature_mask; - xsave->header.xfeatures &= xfeatures_mask; + /* - * These bits must be zero. 
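The regset read/write paths above stop going through the kbuf/ubuf-agnostic copyout_from_xsaves()/copyin_to_xsaves() helpers and instead dispatch explicitly on both the kernel buffer format and the destination pointer type. Distilled from the xstateregs_get() hunk (xstateregs_set() has the same shape with the copy direction reversed):

    if (using_compacted_format()) {
            /* Kernel buffer holds compacted XSAVES data: convert on copy. */
            if (kbuf)
                    ret = copy_xstate_to_kernel(kbuf, xsave, pos, count);
            else
                    ret = copy_xstate_to_user(ubuf, xsave, pos, count);
    } else {
            /* Standard layout already: a flat regset copy suffices. */
            ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
                                      xsave, 0, -1);
    }
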
+ * In case of failure, mark all states as init: */ - memset(&xsave->header.reserved, 0, 48); + if (ret) + fpstate_init(&fpu->state); return ret; } @@ -299,7 +303,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, struct fpu *fpu = &target->thread.fpu; struct user_i387_ia32_struct env; - fpu__activate_fpstate_read(fpu); + fpu__prepare_read(fpu); if (!boot_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); @@ -329,7 +333,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, struct user_i387_ia32_struct env; int ret; - fpu__activate_fpstate_write(fpu); + fpu__prepare_write(fpu); fpstate_sanitize_xstate(fpu); if (!boot_cpu_has(X86_FEATURE_FPU)) @@ -369,7 +373,7 @@ int dump_fpu(struct pt_regs *regs, struct user_i387_struct *ufpu) struct fpu *fpu = &tsk->thread.fpu; int fpvalid; - fpvalid = fpu->fpstate_active; + fpvalid = fpu->initialized; if (fpvalid) fpvalid = !fpregs_get(tsk, NULL, 0, sizeof(struct user_i387_ia32_struct), diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 83c23c230b4c..fb639e70048f 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -155,7 +155,8 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) */ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) { - struct xregs_state *xsave = &current->thread.fpu.state.xsave; + struct fpu *fpu = &current->thread.fpu; + struct xregs_state *xsave = &fpu->state.xsave; struct task_struct *tsk = current; int ia32_fxstate = (buf != buf_fx); @@ -170,13 +171,13 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) sizeof(struct user_i387_ia32_struct), NULL, (struct _fpstate_32 __user *) buf) ? -1 : 1; - if (fpregs_active() || using_compacted_format()) { + if (fpu->initialized || using_compacted_format()) { /* Save the live register state to the user directly. */ if (copy_fpregs_to_sigframe(buf_fx)) return -1; /* Update the thread's fxstate to save the fsave header. */ if (ia32_fxstate) - copy_fxregs_to_kernel(&tsk->thread.fpu); + copy_fxregs_to_kernel(fpu); } else { /* * It is a *bug* if kernel uses compacted-format for xsave @@ -189,7 +190,7 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) return -1; } - fpstate_sanitize_xstate(&tsk->thread.fpu); + fpstate_sanitize_xstate(fpu); if (__copy_to_user(buf_fx, xsave, fpu_user_xstate_size)) return -1; } @@ -213,8 +214,11 @@ sanitize_restored_xstate(struct task_struct *tsk, struct xstate_header *header = &xsave->header; if (use_xsave()) { - /* These bits must be zero. */ - memset(header->reserved, 0, 48); + /* + * Note: we don't need to zero the reserved bits in the + * xstate_header here because we either didn't copy them at all, + * or we checked earlier that they aren't set. 
+ */ /* * Init the state that is not present in the memory @@ -223,7 +227,7 @@ sanitize_restored_xstate(struct task_struct *tsk, if (fx_only) header->xfeatures = XFEATURE_MASK_FPSSE; else - header->xfeatures &= (xfeatures_mask & xfeatures); + header->xfeatures &= xfeatures; } if (use_fxsr()) { @@ -279,7 +283,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) if (!access_ok(VERIFY_READ, buf, size)) return -EACCES; - fpu__activate_curr(fpu); + fpu__initialize(fpu); if (!static_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_set(current, NULL, @@ -307,28 +311,29 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) /* * For 32-bit frames with fxstate, copy the user state to the * thread's fpu state, reconstruct fxstate from the fsave - * header. Sanitize the copied state etc. + * header. Validate and sanitize the copied state. */ struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; int err = 0; /* - * Drop the current fpu which clears fpu->fpstate_active. This ensures + * Drop the current fpu which clears fpu->initialized. This ensures * that any context-switch during the copy of the new state, * avoids the intermediate state from getting restored/saved. * Thus avoiding the new restored state from getting corrupted. * We will be ready to restore/save the state only after - * fpu->fpstate_active is again set. + * fpu->initialized is again set. */ fpu__drop(fpu); if (using_compacted_format()) { - err = copyin_to_xsaves(NULL, buf_fx, - &fpu->state.xsave); + err = copy_user_to_xstate(&fpu->state.xsave, buf_fx); } else { - err = __copy_from_user(&fpu->state.xsave, - buf_fx, state_size); + err = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); + + if (!err && state_size > offsetof(struct xregs_state, header)) + err = validate_xstate_header(&fpu->state.xsave.header); } if (err || __copy_from_user(&env, buf, sizeof(env))) { @@ -339,7 +344,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) sanitize_restored_xstate(tsk, &env, xfeatures, fx_only); } - fpu->fpstate_active = 1; + fpu->initialized = 1; preempt_disable(); fpu__restore(fpu); preempt_enable(); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c24ac1efb12d..f1d5476c9022 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -483,6 +483,30 @@ int using_compacted_format(void) return boot_cpu_has(X86_FEATURE_XSAVES); } +/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ +int validate_xstate_header(const struct xstate_header *hdr) +{ + /* No unknown or supervisor features may be set */ + if (hdr->xfeatures & (~xfeatures_mask | XFEATURE_MASK_SUPERVISOR)) + return -EINVAL; + + /* Userspace must use the uncompacted format */ + if (hdr->xcomp_bv) + return -EINVAL; + + /* + * If 'reserved' is shrunken to add a new field, make sure to validate + * that new field here! + */ + BUILD_BUG_ON(sizeof(hdr->reserved) != 48); + + /* No reserved bits may be set */ + if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved))) + return -EINVAL; + + return 0; +} + static void __xstate_dump_leaves(void) { int i; @@ -867,7 +891,7 @@ const void *get_xsave_field_ptr(int xsave_state) { struct fpu *fpu = &current->thread.fpu; - if (!fpu->fpstate_active) + if (!fpu->initialized) return NULL; /* * fpu__save() takes the CPU's xstate registers @@ -921,38 +945,129 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, #endif /* ! 
CONFIG_ARCH_HAS_PKEYS */ /* + * Weird legacy quirk: SSE and YMM states store information in the + * MXCSR and MXCSR_FLAGS fields of the FP area. That means if the FP + * area is marked as unused in the xfeatures header, we need to copy + * MXCSR and MXCSR_FLAGS if either SSE or YMM are in use. + */ +static inline bool xfeatures_mxcsr_quirk(u64 xfeatures) +{ + if (!(xfeatures & (XFEATURE_MASK_SSE|XFEATURE_MASK_YMM))) + return false; + + if (xfeatures & XFEATURE_MASK_FP) + return false; + + return true; +} + +/* * This is similar to user_regset_copyout(), but will not add offset to * the source data pointer or increment pos, count, kbuf, and ubuf. */ -static inline int xstate_copyout(unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf, - const void *data, const int start_pos, - const int end_pos) +static inline void +__copy_xstate_to_kernel(void *kbuf, const void *data, + unsigned int offset, unsigned int size, unsigned int size_total) { - if ((count == 0) || (pos < start_pos)) - return 0; + if (offset < size_total) { + unsigned int copy = min(size, size_total - offset); - if (end_pos < 0 || pos < end_pos) { - unsigned int copy = (end_pos < 0 ? count : min(count, end_pos - pos)); + memcpy(kbuf + offset, data, copy); + } +} - if (kbuf) { - memcpy(kbuf + pos, data, copy); - } else { - if (__copy_to_user(ubuf + pos, data, copy)) - return -EFAULT; +/* + * Convert from kernel XSAVES compacted format to standard format and copy + * to a kernel-space ptrace buffer. + * + * It supports partial copy but pos always starts from zero. This is called + * from xstateregs_get() and there we check the CPU has XSAVES. + */ +int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total) +{ + unsigned int offset, size; + struct xstate_header header; + int i; + + /* + * Currently copy_regset_to_user() starts from pos 0: + */ + if (unlikely(offset_start != 0)) + return -EFAULT; + + /* + * The destination is a ptrace buffer; we put in only user xstates: + */ + memset(&header, 0, sizeof(header)); + header.xfeatures = xsave->header.xfeatures; + header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR; + + /* + * Copy xregs_state->header: + */ + offset = offsetof(struct xregs_state, header); + size = sizeof(header); + + __copy_xstate_to_kernel(kbuf, &header, offset, size, size_total); + + for (i = 0; i < XFEATURE_MAX; i++) { + /* + * Copy only in-use xstates: + */ + if ((header.xfeatures >> i) & 1) { + void *src = __raw_xsave_addr(xsave, 1 << i); + + offset = xstate_offsets[i]; + size = xstate_sizes[i]; + + /* The next component has to fit fully into the output buffer: */ + if (offset + size > size_total) + break; + + __copy_xstate_to_kernel(kbuf, src, offset, size, size_total); } + + } + + if (xfeatures_mxcsr_quirk(header.xfeatures)) { + offset = offsetof(struct fxregs_state, mxcsr); + size = MXCSR_AND_FLAGS_SIZE; + __copy_xstate_to_kernel(kbuf, &xsave->i387.mxcsr, offset, size, size_total); + } + + /* + * Fill xsave->i387.sw_reserved value for ptrace frame: + */ + offset = offsetof(struct fxregs_state, sw_reserved); + size = sizeof(xstate_fx_sw_bytes); + + __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, size_total); + + return 0; +} + +static inline int +__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int offset, unsigned int size, unsigned int size_total) +{ + if (!size) + return 0; + + if (offset < size_total) { + unsigned int copy = min(size, size_total - offset); + + if (__copy_to_user(ubuf + offset, data, copy)) + 
return -EFAULT; } return 0; } /* * Convert from kernel XSAVES compacted format to standard format and copy - * to a ptrace buffer. It supports partial copy but pos always starts from + * to a user-space buffer. It supports partial copy but pos always starts from * zero. This is called from xstateregs_get() and there we check the CPU * has XSAVES. */ -int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf, - void __user *ubuf, struct xregs_state *xsave) +int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total) { unsigned int offset, size; int ret, i; @@ -961,7 +1076,7 @@ int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf, /* * Currently copy_regset_to_user() starts from pos 0: */ - if (unlikely(pos != 0)) + if (unlikely(offset_start != 0)) return -EFAULT; /* @@ -977,8 +1092,7 @@ int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf, offset = offsetof(struct xregs_state, header); size = sizeof(header); - ret = xstate_copyout(offset, size, kbuf, ubuf, &header, 0, count); - + ret = __copy_xstate_to_user(ubuf, &header, offset, size, size_total); if (ret) return ret; @@ -992,25 +1106,30 @@ int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf, offset = xstate_offsets[i]; size = xstate_sizes[i]; - ret = xstate_copyout(offset, size, kbuf, ubuf, src, 0, count); + /* The next component has to fit fully into the output buffer: */ + if (offset + size > size_total) + break; + ret = __copy_xstate_to_user(ubuf, src, offset, size, size_total); if (ret) return ret; - - if (offset + size >= count) - break; } } + if (xfeatures_mxcsr_quirk(header.xfeatures)) { + offset = offsetof(struct fxregs_state, mxcsr); + size = MXCSR_AND_FLAGS_SIZE; + __copy_xstate_to_user(ubuf, &xsave->i387.mxcsr, offset, size, size_total); + } + /* * Fill xsave->i387.sw_reserved value for ptrace frame: */ offset = offsetof(struct fxregs_state, sw_reserved); size = sizeof(xstate_fx_sw_bytes); - ret = xstate_copyout(offset, size, kbuf, ubuf, xstate_fx_sw_bytes, 0, count); - + ret = __copy_xstate_to_user(ubuf, xstate_fx_sw_bytes, offset, size, size_total); if (ret) return ret; @@ -1018,55 +1137,98 @@ int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf, } /* - * Convert from a ptrace standard-format buffer to kernel XSAVES format - * and copy to the target thread. This is called from xstateregs_set() and - * there we check the CPU has XSAVES and a whole standard-sized buffer - * exists. + * Convert from a ptrace standard-format kernel buffer to kernel XSAVES format + * and copy to the target thread. This is called from xstateregs_set(). 
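Every buffer that carries an xstate_header in from userspace -- the ptrace and sigreturn paths above -- is now checked by validate_xstate_header() before the task's xsave area is touched, so supervisor, unknown, or reserved bits can no longer be smuggled in. The copy-in helpers defined just below use it like this:

    struct xstate_header hdr;

    memcpy(&hdr, kbuf + offsetof(struct xregs_state, header), sizeof(hdr));
    if (validate_xstate_header(&hdr))
            return -EINVAL; /* unknown/supervisor xfeatures, nonzero
                             * xcomp_bv (compacted), or reserved bits */
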
*/ -int copyin_to_xsaves(const void *kbuf, const void __user *ubuf, - struct xregs_state *xsave) +int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) { unsigned int offset, size; int i; - u64 xfeatures; - u64 allowed_features; + struct xstate_header hdr; offset = offsetof(struct xregs_state, header); - size = sizeof(xfeatures); + size = sizeof(hdr); - if (kbuf) { - memcpy(&xfeatures, kbuf + offset, size); - } else { - if (__copy_from_user(&xfeatures, ubuf + offset, size)) - return -EFAULT; + memcpy(&hdr, kbuf + offset, size); + + if (validate_xstate_header(&hdr)) + return -EINVAL; + + for (i = 0; i < XFEATURE_MAX; i++) { + u64 mask = ((u64)1 << i); + + if (hdr.xfeatures & mask) { + void *dst = __raw_xsave_addr(xsave, 1 << i); + + offset = xstate_offsets[i]; + size = xstate_sizes[i]; + + memcpy(dst, kbuf + offset, size); + } + } + + if (xfeatures_mxcsr_quirk(hdr.xfeatures)) { + offset = offsetof(struct fxregs_state, mxcsr); + size = MXCSR_AND_FLAGS_SIZE; + memcpy(&xsave->i387.mxcsr, kbuf + offset, size); } /* - * Reject if the user sets any disabled or supervisor features: + * The state that came in from userspace was user-state only. + * Mask all the user states out of 'xfeatures': + */ + xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR; + + /* + * Add back in the features that came in from userspace: */ - allowed_features = xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR; + xsave->header.xfeatures |= hdr.xfeatures; - if (xfeatures & ~allowed_features) + return 0; +} + +/* + * Convert from a ptrace or sigreturn standard-format user-space buffer to + * kernel XSAVES format and copy to the target thread. This is called from + * xstateregs_set(), as well as potentially from the sigreturn() and + * rt_sigreturn() system calls. + */ +int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) +{ + unsigned int offset, size; + int i; + struct xstate_header hdr; + + offset = offsetof(struct xregs_state, header); + size = sizeof(hdr); + + if (__copy_from_user(&hdr, ubuf + offset, size)) + return -EFAULT; + + if (validate_xstate_header(&hdr)) return -EINVAL; for (i = 0; i < XFEATURE_MAX; i++) { u64 mask = ((u64)1 << i); - if (xfeatures & mask) { + if (hdr.xfeatures & mask) { void *dst = __raw_xsave_addr(xsave, 1 << i); offset = xstate_offsets[i]; size = xstate_sizes[i]; - if (kbuf) { - memcpy(dst, kbuf + offset, size); - } else { - if (__copy_from_user(dst, ubuf + offset, size)) - return -EFAULT; - } + if (__copy_from_user(dst, ubuf + offset, size)) + return -EFAULT; } } + if (xfeatures_mxcsr_quirk(hdr.xfeatures)) { + offset = offsetof(struct fxregs_state, mxcsr); + size = MXCSR_AND_FLAGS_SIZE; + if (__copy_from_user(&xsave->i387.mxcsr, ubuf + offset, size)) + return -EFAULT; + } + /* * The state that came in from userspace was user-state only. 
* Mask all the user states out of 'xfeatures': @@ -1076,7 +1238,7 @@ int copyin_to_xsaves(const void *kbuf, const void __user *ubuf, /* * Add back in the features that came in from userspace: */ - xsave->header.xfeatures |= xfeatures; + xsave->header.xfeatures |= hdr.xfeatures; return 0; } diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 1f38d9a4d9de..d4eb450144fd 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -64,7 +64,7 @@ static void call_on_stack(void *func, void *stack) static inline void *current_stack(void) { - return (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1)); + return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1)); } static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) @@ -88,7 +88,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) /* Save the next esp at the bottom of the stack */ prev_esp = (u32 *)irqstk; - *prev_esp = current_stack_pointer(); + *prev_esp = current_stack_pointer; if (unlikely(overflow)) call_on_stack(print_stack_overflow, isp); @@ -139,7 +139,7 @@ void do_softirq_own_stack(void) /* Push the previous esp onto the stack */ prev_esp = (u32 *)irqstk; - *prev_esp = current_stack_pointer(); + *prev_esp = current_stack_pointer; call_on_stack(__do_softirq, isp); } diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h index db2182d63ed0..3fc0f9a794cb 100644 --- a/arch/x86/kernel/kprobes/common.h +++ b/arch/x86/kernel/kprobes/common.h @@ -3,6 +3,15 @@ /* Kprobes and Optprobes common header */ +#include <asm/asm.h> + +#ifdef CONFIG_FRAME_POINTER +# define SAVE_RBP_STRING " push %" _ASM_BP "\n" \ + " mov %" _ASM_SP ", %" _ASM_BP "\n" +#else +# define SAVE_RBP_STRING " push %" _ASM_BP "\n" +#endif + #ifdef CONFIG_X86_64 #define SAVE_REGS_STRING \ /* Skip cs, ip, orig_ax. */ \ @@ -17,7 +26,7 @@ " pushq %r10\n" \ " pushq %r11\n" \ " pushq %rbx\n" \ - " pushq %rbp\n" \ + SAVE_RBP_STRING \ " pushq %r12\n" \ " pushq %r13\n" \ " pushq %r14\n" \ @@ -48,7 +57,7 @@ " pushl %es\n" \ " pushl %ds\n" \ " pushl %eax\n" \ - " pushl %ebp\n" \ + SAVE_RBP_STRING \ " pushl %edi\n" \ " pushl %esi\n" \ " pushl %edx\n" \ diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index f0153714ddac..0742491cbb73 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -1080,8 +1080,6 @@ int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) * raw stack chunk with redzones: */ __memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, MIN_STACK_SIZE(addr)); - regs->flags &= ~X86_EFLAGS_IF; - trace_hardirqs_off(); regs->ip = (unsigned long)(jp->entry); /* diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index 4b0592ca9e47..8c1cc08f514f 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -299,7 +299,7 @@ static int __init create_setup_data_nodes(struct kobject *parent) return 0; out_clean_nodes: - for (j = i - 1; j > 0; j--) + for (j = i - 1; j >= 0; j--) cleanup_setup_data_node(*(kobjp + j)); kfree(kobjp); out_setup_data_kobj: diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index aa60a08b65b1..8bb9594d0761 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -117,7 +117,11 @@ static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b, return NULL; } -void kvm_async_pf_task_wait(u32 token) +/* + * @interrupt_kernel: Is this called from a routine which interrupts the kernel + * (other than user space)? 
+ */ +void kvm_async_pf_task_wait(u32 token, int interrupt_kernel) { u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; @@ -140,7 +144,10 @@ void kvm_async_pf_task_wait(u32 token) n.token = token; n.cpu = smp_processor_id(); - n.halted = is_idle_task(current) || preempt_count() > 1; + n.halted = is_idle_task(current) || + (IS_ENABLED(CONFIG_PREEMPT_COUNT) + ? preempt_count() > 1 || rcu_preempt_depth() + : interrupt_kernel); init_swait_queue_head(&n.wq); hlist_add_head(&n.link, &b->list); raw_spin_unlock(&b->lock); @@ -268,7 +275,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) case KVM_PV_REASON_PAGE_NOT_PRESENT: /* page is swapped out by the host. */ prev_state = exception_enter(); - kvm_async_pf_task_wait((u32)read_cr2()); + kvm_async_pf_task_wait((u32)read_cr2(), !user_mode(regs)); exception_exit(prev_state); break; case KVM_PV_REASON_PAGE_READY: diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 54180fa6f66f..add33f600531 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -105,6 +105,10 @@ void __noreturn machine_real_restart(unsigned int type) load_cr3(initial_page_table); #else write_cr3(real_mode_header->trampoline_pgd); + + /* Exiting long mode will fail if CR4.PCIDE is set. */ + if (static_cpu_has(X86_FEATURE_PCID)) + cr4_clear_bits(X86_CR4_PCIDE); #endif /* Jump to the identity-mapped low memory code */ diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index e04442345fc0..4e188fda5961 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -263,7 +263,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, sp = (unsigned long) ka->sa.sa_restorer; } - if (fpu->fpstate_active) { + if (fpu->initialized) { sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32), &buf_fx, &math_size); *fpstate = (void __user *)sp; @@ -279,7 +279,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, return (void __user *)-1L; /* save i387 and extended state */ - if (fpu->fpstate_active && + if (fpu->initialized && copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size) < 0) return (void __user *)-1L; @@ -755,7 +755,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) /* * Ensure the signal handler starts with the new fpu state. 
*/ - if (fpu->fpstate_active) + if (fpu->initialized) fpu__clear(fpu); } signal_setup_done(failed, ksig, stepping); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 0854ff169274..ad59edd84de7 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -232,12 +232,6 @@ static void notrace start_secondary(void *unused) */ if (boot_cpu_has(X86_FEATURE_PCID)) __write_cr4(__read_cr4() | X86_CR4_PCIDE); - cpu_init(); - x86_cpuinit.early_percpu_clock_init(); - preempt_disable(); - smp_callin(); - - enable_start_cpu0 = 0; #ifdef CONFIG_X86_32 /* switch away from the initial page table */ @@ -245,6 +239,13 @@ static void notrace start_secondary(void *unused) __flush_tlb_all(); #endif + cpu_init(); + x86_cpuinit.early_percpu_clock_init(); + preempt_disable(); + smp_callin(); + + enable_start_cpu0 = 0; + /* otherwise gcc will move up smp_processor_id before the cpu_init */ barrier(); /* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 34ea3651362e..67db4f43309e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -142,7 +142,7 @@ void ist_begin_non_atomic(struct pt_regs *regs) * from double_fault. */ BUG_ON((unsigned long)(current_top_of_stack() - - current_stack_pointer()) >= THREAD_SIZE); + current_stack_pointer) >= THREAD_SIZE); preempt_enable_no_resched(); } diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index d145a0b1f529..3dc26f95d46e 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -44,7 +44,8 @@ static void unwind_dump(struct unwind_state *state) state->stack_info.type, state->stack_info.next_sp, state->stack_mask, state->graph_idx); - for (sp = state->orig_sp; sp; sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { + for (sp = PTR_ALIGN(state->orig_sp, sizeof(long)); sp; + sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { if (get_stack_info(sp, state->task, &stack_info, &visit_mask)) break; @@ -174,6 +175,7 @@ static bool is_last_task_frame(struct unwind_state *state) * This determines if the frame pointer actually contains an encoded pointer to * pt_regs on the stack. See ENCODE_FRAME_POINTER. 
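The unwinder change that follows splits decode_frame_pointer() per word size because ENCODE_FRAME_POINTER tags a pt_regs-carrying frame differently on each: 64-bit sets the (otherwise clear) low bit of the aligned pointer, while 32-bit clears bit 31, which is always set in a real kernel address. A side-by-side sketch of the encodings implied by the decoders below:

    /* x86-64: tag with bit 0 */
    encoded = (unsigned long)regs | 0x1;

    /* x86-32: tag by clearing the top bit */
    encoded = (unsigned long)regs & ~0x80000000UL;
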
*/ +#ifdef CONFIG_X86_64 static struct pt_regs *decode_frame_pointer(unsigned long *bp) { unsigned long regs = (unsigned long)bp; @@ -183,6 +185,23 @@ static struct pt_regs *decode_frame_pointer(unsigned long *bp) return (struct pt_regs *)(regs & ~0x1); } +#else +static struct pt_regs *decode_frame_pointer(unsigned long *bp) +{ + unsigned long regs = (unsigned long)bp; + + if (regs & 0x80000000) + return NULL; + + return (struct pt_regs *)(regs | 0x80000000); +} +#endif + +#ifdef CONFIG_X86_32 +#define KERNEL_REGS_SIZE (sizeof(struct pt_regs) - 2*sizeof(long)) +#else +#define KERNEL_REGS_SIZE (sizeof(struct pt_regs)) +#endif static bool update_stack_state(struct unwind_state *state, unsigned long *next_bp) @@ -202,7 +221,7 @@ static bool update_stack_state(struct unwind_state *state, regs = decode_frame_pointer(next_bp); if (regs) { frame = (unsigned long *)regs; - len = regs_size(regs); + len = KERNEL_REGS_SIZE; state->got_irq = true; } else { frame = next_bp; @@ -226,6 +245,14 @@ static bool update_stack_state(struct unwind_state *state, frame < prev_frame_end) return false; + /* + * On 32-bit with user mode regs, make sure the last two regs are safe + * to access: + */ + if (IS_ENABLED(CONFIG_X86_32) && regs && user_mode(regs) && + !on_stack(info, frame, len + 2*sizeof(long))) + return false; + /* Move state to the next frame: */ if (regs) { state->regs = regs; @@ -328,6 +355,13 @@ bad_address: state->regs->sp < (unsigned long)task_pt_regs(state->task)) goto the_end; + /* + * There are some known frame pointer issues on 32-bit. Disable + * unwinder warnings on 32-bit until it gets objtool support. + */ + if (IS_ENABLED(CONFIG_X86_32)) + goto the_end; + if (state->regs) { printk_deferred_once(KERN_WARNING "WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n", diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 3ea624452f93..3c48bc8bf08c 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -23,6 +23,7 @@ config KVM depends on HIGH_RES_TIMERS # for TASKSTATS/TASK_DELAY_ACCT: depends on NET && MULTIUSER + depends on X86_LOCAL_APIC select PREEMPT_NOTIFIERS select MMU_NOTIFIER select ANON_INODES diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 16bf6655aa85..d90cdc77e077 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -425,8 +425,10 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); #op " %al \n\t" \ FOP_RET -asm(".global kvm_fastop_exception \n" - "kvm_fastop_exception: xor %esi, %esi; ret"); +asm(".pushsection .fixup, \"ax\"\n" + ".global kvm_fastop_exception \n" + "kvm_fastop_exception: xor %esi, %esi; ret\n" + ".popsection"); FOP_START(setcc) FOP_SETCC(seto) @@ -4102,10 +4104,12 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt) ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); if (efer & EFER_LMA) { u64 maxphyaddr; - u32 eax = 0x80000008; + u32 eax, ebx, ecx, edx; - if (ctxt->ops->get_cpuid(ctxt, &eax, NULL, NULL, - NULL, false)) + eax = 0x80000008; + ecx = 0; + if (ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, + &edx, false)) maxphyaddr = eax & 0xff; else maxphyaddr = 36; @@ -5296,7 +5300,6 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt, static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) { - register void *__sp asm(_ASM_SP); ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; if (!(ctxt->d & ByteOp)) @@ -5304,7 +5307,7 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) asm("push %[flags]; 
popf; call *%[fastop]; pushf; pop %[flags]\n" : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags), - [fastop]"+S"(fop), "+r"(__sp) + [fastop]"+S"(fop), ASM_CALL_CONSTRAINT : "c"(ctxt->src2.val)); ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index eca30c1eb1d9..7a69cf053711 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3837,7 +3837,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, case KVM_PV_REASON_PAGE_NOT_PRESENT: vcpu->arch.apf.host_apf_reason = 0; local_irq_disable(); - kvm_async_pf_task_wait(fault_address); + kvm_async_pf_task_wait(fault_address, 0); local_irq_enable(); break; case KVM_PV_REASON_PAGE_READY: @@ -3974,19 +3974,19 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte) { /* - * PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set - * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means - * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then. - */ - gpte |= level - PT_PAGE_TABLE_LEVEL - 1; - - /* * The RHS has bit 7 set iff level < mmu->last_nonleaf_level. * If it is clear, there are no large pages at this level, so clear * PT_PAGE_SIZE_MASK in gpte if that is the case. */ gpte &= level - mmu->last_nonleaf_level; + /* + * PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set + * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means + * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then. + */ + gpte |= level - PT_PAGE_TABLE_LEVEL - 1; + return gpte & PT_PAGE_SIZE_MASK; } @@ -4555,6 +4555,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, update_permission_bitmask(vcpu, context, true); update_pkru_bitmask(vcpu, context, true); + update_last_nonleaf_level(vcpu, context); reset_rsvds_bits_mask_ept(vcpu, context, execonly); reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 86b68dc5a649..f18d1f8d332b 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -334,10 +334,11 @@ retry_walk: --walker->level; index = PT_INDEX(addr, walker->level); - table_gfn = gpte_to_gfn(pte); offset = index * sizeof(pt_element_t); pte_gpa = gfn_to_gpa(table_gfn) + offset; + + BUG_ON(walker->level < 1); walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 06c0c6d0541e..95a01609d7ee 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -200,6 +200,8 @@ struct loaded_vmcs { int cpu; bool launched; bool nmi_known_unmasked; + unsigned long vmcs_host_cr3; /* May not match real cr3 */ + unsigned long vmcs_host_cr4; /* May not match real cr4 */ struct list_head loaded_vmcss_on_cpu_link; }; @@ -600,8 +602,6 @@ struct vcpu_vmx { int gs_ldt_reload_needed; int fs_reload_needed; u64 msr_host_bndcfgs; - unsigned long vmcs_host_cr3; /* May not match real cr3 */ - unsigned long vmcs_host_cr4; /* May not match real cr4 */ } host_state; struct { int vm86_active; @@ -2202,46 +2202,44 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) struct pi_desc old, new; unsigned int dest; - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) + /* + * In case of hot-plug or hot-unplug, we may have to undo + * vmx_vcpu_pi_put even if there is no assigned device. 
And we + * always keep PI.NDST up to date for simplicity: it makes the + * code easier, and CPU migration is not a fast path. + */ + if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) + return; + + /* + * First handle the simple case where no cmpxchg is necessary; just + * allow posting non-urgent interrupts. + * + * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change + * PI.NDST: pi_post_block will do it for us and the wakeup_handler + * expects the VCPU to be on the blocked_vcpu_list that matches + * PI.NDST. + */ + if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || + vcpu->cpu == cpu) { + pi_clear_sn(pi_desc); return; + } + /* The full case. */ do { old.control = new.control = pi_desc->control; - /* - * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there - * are two possible cases: - * 1. After running 'pre_block', context switch - * happened. For this case, 'sn' was set in - * vmx_vcpu_put(), so we need to clear it here. - * 2. After running 'pre_block', we were blocked, - * and woken up by some other guy. For this case, - * we don't need to do anything, 'pi_post_block' - * will do everything for us. However, we cannot - * check whether it is case #1 or case #2 here - * (maybe, not needed), so we also clear sn here, - * I think it is not a big deal. - */ - if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) { - if (vcpu->cpu != cpu) { - dest = cpu_physical_id(cpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - } + dest = cpu_physical_id(cpu); - /* set 'NV' to 'notification vector' */ - new.nv = POSTED_INTR_VECTOR; - } + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; - /* Allow posting non-urgent interrupts */ new.sn = 0; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); } static void decache_tsc_multiplier(struct vcpu_vmx *vmx) @@ -5077,21 +5075,30 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR; if (vcpu->mode == IN_GUEST_MODE) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - /* - * Currently, we don't support urgent interrupt, - * all interrupts are recognized as non-urgent - * interrupt, so we cannot post interrupts when - * 'SN' is set. + * The vector of interrupt to be delivered to vcpu had + * been set in PIR before this function. + * + * Following cases will be reached in this block, and + * we always send a notification event in all cases as + * explained below. + * + * Case 1: vcpu keeps in non-root mode. Sending a + * notification event posts the interrupt to vcpu. * - * If the vcpu is in guest mode, it means it is - * running instead of being scheduled out and - * waiting in the run queue, and that's the only - * case when 'SN' is set currently, warning if - * 'SN' is set. + * Case 2: vcpu exits to root mode and is still + * runnable. PIR will be synced to vIRR before the + * next vcpu entry. Sending a notification event in + * this case has no effect, as vcpu is not in root + * mode. + * + * Case 3: vcpu exits to root mode and is blocked. + * vcpu_block() has already synced PIR to vIRR and + * never blocks vcpu if vIRR is not cleared. Therefore, + * a blocked vcpu here does not wait for any requested + * interrupts in PIR, and sending a notification event + * which has no effect is safe here. 
*/ - WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc)); apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); return true; @@ -5169,12 +5176,12 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) */ cr3 = __read_cr3(); vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ - vmx->host_state.vmcs_host_cr3 = cr3; + vmx->loaded_vmcs->vmcs_host_cr3 = cr3; /* Save the most likely value for this task's CR4 in the VMCS. */ cr4 = cr4_read_shadow(); vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ - vmx->host_state.vmcs_host_cr4 = cr4; + vmx->loaded_vmcs->vmcs_host_cr4 = cr4; vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ #ifdef CONFIG_X86_64 @@ -9036,7 +9043,6 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) { u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - register void *__sp asm(_ASM_SP); if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK)) == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) { @@ -9065,7 +9071,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_64 [sp]"=&r"(tmp), #endif - "+r"(__sp) + ASM_CALL_CONSTRAINT : [entry]"r"(entry), [ss]"i"(__KERNEL_DS), @@ -9265,15 +9271,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); cr3 = __get_current_cr3_fast(); - if (unlikely(cr3 != vmx->host_state.vmcs_host_cr3)) { + if (unlikely(cr3 != vmx->loaded_vmcs->vmcs_host_cr3)) { vmcs_writel(HOST_CR3, cr3); - vmx->host_state.vmcs_host_cr3 = cr3; + vmx->loaded_vmcs->vmcs_host_cr3 = cr3; } cr4 = cr4_read_shadow(); - if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) { + if (unlikely(cr4 != vmx->loaded_vmcs->vmcs_host_cr4)) { vmcs_writel(HOST_CR4, cr4); - vmx->host_state.vmcs_host_cr4 = cr4; + vmx->loaded_vmcs->vmcs_host_cr4 = cr4; } /* When single-stepping over STI and MOV SS, we must clear the @@ -9583,6 +9589,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED; + /* + * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR + * or POSTED_INTR_WAKEUP_VECTOR. + */ + vmx->pi_desc.nv = POSTED_INTR_VECTOR; + vmx->pi_desc.sn = 1; + return &vmx->vcpu; free_vmcs: @@ -9831,7 +9844,8 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, WARN_ON(!is_guest_mode(vcpu)); - if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code)) { + if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) && + !to_vmx(vcpu)->nested.nested_run_pending) { vmcs12->vm_exit_intr_error_code = fault->error_code; nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, PF_VECTOR | INTR_TYPE_HARD_EXCEPTION | @@ -11283,7 +11297,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, /* Same as above - no reason to call set_cr4_guest_host_mask(). 
*/ vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); - kvm_set_cr4(vcpu, vmcs12->host_cr4); + vmx_set_cr4(vcpu, vmcs12->host_cr4); nested_ept_uninit_mmu_context(vcpu); @@ -11696,6 +11710,37 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); } +static void __pi_post_block(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; + unsigned int dest; + + do { + old.control = new.control = pi_desc->control; + WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR, + "Wakeup handler not enabled while the VCPU is blocked\n"); + + dest = cpu_physical_id(vcpu->cpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + /* set 'NV' to 'notification vector' */ + new.nv = POSTED_INTR_VECTOR; + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); + + if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) { + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + list_del(&vcpu->blocked_vcpu_list); + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + vcpu->pre_pcpu = -1; + } +} + /* * This routine does the following things for vCPU which is going * to be blocked if VT-d PI is enabled. @@ -11711,7 +11756,6 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, */ static int pi_pre_block(struct kvm_vcpu *vcpu) { - unsigned long flags; unsigned int dest; struct pi_desc old, new; struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); @@ -11721,34 +11765,20 @@ static int pi_pre_block(struct kvm_vcpu *vcpu) !kvm_vcpu_apicv_active(vcpu)) return 0; - vcpu->pre_pcpu = vcpu->cpu; - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - list_add_tail(&vcpu->blocked_vcpu_list, - &per_cpu(blocked_vcpu_on_cpu, - vcpu->pre_pcpu)); - spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); + WARN_ON(irqs_disabled()); + local_irq_disable(); + if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) { + vcpu->pre_pcpu = vcpu->cpu; + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + list_add_tail(&vcpu->blocked_vcpu_list, + &per_cpu(blocked_vcpu_on_cpu, + vcpu->pre_pcpu)); + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + } do { old.control = new.control = pi_desc->control; - /* - * We should not block the vCPU if - * an interrupt is posted for it. - */ - if (pi_test_on(pi_desc) == 1) { - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - list_del(&vcpu->blocked_vcpu_list); - spin_unlock_irqrestore( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - vcpu->pre_pcpu = -1; - - return 1; - } - WARN((pi_desc->sn == 1), "Warning: SN field of posted-interrupts " "is set before blocking\n"); @@ -11770,10 +11800,15 @@ static int pi_pre_block(struct kvm_vcpu *vcpu) /* set 'NV' to 'wakeup vector' */ new.nv = POSTED_INTR_WAKEUP_VECTOR; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); - return 0; + /* We should not block the vCPU if an interrupt is posted for it. 
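Both pi_pre_block() and __pi_post_block() above key their list handling off vcpu->pre_pcpu, where -1 means "not on any per-CPU blocked list"; that sentinel makes enqueue and dequeue idempotent and tells the dequeue side which per-CPU lock protects the entry. A simplified userspace model of the sentinel pattern, collapsed to a single list and a pthread mutex instead of per-CPU spinlocks (all names illustrative):

#include <pthread.h>
#include <stdio.h>

struct vcpu {
	int pre_pcpu;		/* -1 when not on a blocked list */
	struct vcpu *next;	/* singly linked blocked list */
};

static pthread_mutex_t blocked_lock = PTHREAD_MUTEX_INITIALIZER;
static struct vcpu *blocked_list;

static void block_vcpu(struct vcpu *v, int cpu)
{
	if (v->pre_pcpu != -1)
		return;		/* already queued, nothing to do */
	v->pre_pcpu = cpu;
	pthread_mutex_lock(&blocked_lock);
	v->next = blocked_list;
	blocked_list = v;
	pthread_mutex_unlock(&blocked_lock);
}

static void unblock_vcpu(struct vcpu *v)
{
	struct vcpu **pp;

	if (v->pre_pcpu == -1)
		return;		/* never queued, nothing to undo */
	pthread_mutex_lock(&blocked_lock);
	for (pp = &blocked_list; *pp; pp = &(*pp)->next) {
		if (*pp == v) {
			*pp = v->next;
			break;
		}
	}
	pthread_mutex_unlock(&blocked_lock);
	v->pre_pcpu = -1;
}

int main(void)
{
	struct vcpu v = { .pre_pcpu = -1 };

	block_vcpu(&v, 0);
	unblock_vcpu(&v);
	unblock_vcpu(&v);	/* safe: dequeue is idempotent */
	printf("pre_pcpu = %d\n", v.pre_pcpu);
	return 0;
}
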
*/ + if (pi_test_on(pi_desc) == 1) + __pi_post_block(vcpu); + + local_irq_enable(); + return (vcpu->pre_pcpu == -1); } static int vmx_pre_block(struct kvm_vcpu *vcpu) @@ -11789,44 +11824,13 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu) static void pi_post_block(struct kvm_vcpu *vcpu) { - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct pi_desc old, new; - unsigned int dest; - unsigned long flags; - - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) + if (vcpu->pre_pcpu == -1) return; - do { - old.control = new.control = pi_desc->control; - - dest = cpu_physical_id(vcpu->cpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - - /* Allow posting non-urgent interrupts */ - new.sn = 0; - - /* set 'NV' to 'notification vector' */ - new.nv = POSTED_INTR_VECTOR; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); - - if(vcpu->pre_pcpu != -1) { - spin_lock_irqsave( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - list_del(&vcpu->blocked_vcpu_list); - spin_unlock_irqrestore( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - vcpu->pre_pcpu = -1; - } + WARN_ON(irqs_disabled()); + local_irq_disable(); + __pi_post_block(vcpu); + local_irq_enable(); } static void vmx_post_block(struct kvm_vcpu *vcpu) @@ -11911,12 +11915,8 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, if (set) ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); - else { - /* suppress notification event before unposting */ - pi_set_sn(vcpu_to_pi_desc(vcpu)); + else ret = irq_set_vcpu_affinity(host_irq, NULL); - pi_clear_sn(vcpu_to_pi_desc(vcpu)); - } if (ret < 0) { printk(KERN_INFO "%s: failed to update PI IRTE\n", diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cd17b7d9a107..03869eb7fcd6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7225,7 +7225,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int r; sigset_t sigsaved; - fpu__activate_curr(fpu); + fpu__initialize(fpu); if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index d4a7df2205b8..220638a4cb94 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -114,7 +114,7 @@ void math_emulate(struct math_emu_info *info) struct desc_struct code_descriptor; struct fpu *fpu = ¤t->thread.fpu; - fpu__activate_curr(fpu); + fpu__initialize(fpu); #ifdef RE_ENTRANT_CHECKING if (emulating) { diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 72bf8c01c6e3..e1f095884386 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,5 +1,12 @@ -# Kernel does not boot with instrumentation of tlb.c. 
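pi_post_block() above becomes a thin wrapper that checks interrupts are enabled, disables them, and calls __pi_post_block(), which assumes they are already off; the double-underscore helper can therefore also be called from pi_pre_block() inside its own interrupts-off section. A toy model of that caller-owns-the-context split, with a plain flag standing in for real interrupt state:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

static bool irqs_off;

static void __post_block(void)
{
	assert(irqs_off);	/* helper relies on the caller's context */
	puts("dequeue under IRQ-off protection");
}

static void post_block(void)
{
	assert(!irqs_off);	/* WARN_ON(irqs_disabled()) analogue */
	irqs_off = true;	/* local_irq_disable() analogue */
	__post_block();
	irqs_off = false;	/* local_irq_enable() analogue */
}

int main(void)
{
	post_block();
	return 0;
}
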
-KCOV_INSTRUMENT_tlb.o := n +# Kernel does not boot with instrumentation of tlb.c and mem_encrypt.c +KCOV_INSTRUMENT_tlb.o := n +KCOV_INSTRUMENT_mem_encrypt.o := n + +KASAN_SANITIZE_mem_encrypt.o := n + +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_mem_encrypt.o = -pg +endif obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ pat.o pgtable.o physaddr.o setup_nx.o tlb.o diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index c076f710de4c..c3521e2be396 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -2,6 +2,7 @@ #include <linux/uaccess.h> #include <linux/sched/debug.h> +#include <asm/fpu/internal.h> #include <asm/traps.h> #include <asm/kdebug.h> @@ -78,6 +79,29 @@ bool ex_handler_refcount(const struct exception_table_entry *fixup, } EXPORT_SYMBOL_GPL(ex_handler_refcount); +/* + * Handler for when we fail to restore a task's FPU state. We should never get + * here because the FPU state of a task using the FPU (task->thread.fpu.state) + * should always be valid. However, past bugs have allowed userspace to set + * reserved bits in the XSAVE area using PTRACE_SETREGSET or sys_rt_sigreturn(). + * These caused XRSTOR to fail when switching to the task, leaking the FPU + * registers of the task previously executing on the CPU. Mitigate this class + * of vulnerability by restoring from the initial state (essentially, zeroing + * out all the FPU registers) if we can't restore from the task's FPU state. + */ +bool ex_handler_fprestore(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + regs->ip = ex_fixup_addr(fixup); + + WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.", + (void *)instruction_pointer(regs)); + + __copy_kernel_to_fpregs(&init_fpstate, -1); + return true; +} +EXPORT_SYMBOL_GPL(ex_handler_fprestore); + bool ex_handler_ext(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr) { diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index b836a7274e12..e2baeaa053a5 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -192,8 +192,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really * faulted on a pte with its pkey=4. */ -static void fill_sig_info_pkey(int si_code, siginfo_t *info, - struct vm_area_struct *vma) +static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey) { /* This is effectively an #ifdef */ if (!boot_cpu_has(X86_FEATURE_OSPKE)) return; @@ -209,7 +208,7 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info, * valid VMA, so we should never reach this without a * valid VMA. */ - if (!vma) { + if (!pkey) { WARN_ONCE(1, "PKU fault with no VMA passed in"); info->si_pkey = 0; return; @@ -219,13 +218,12 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info, * absolutely guaranteed to be 100% accurate because of * the race explained above.
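ex_handler_fprestore() above turns a faulting FPU-state restore into a reset from init_fpstate, so stale register contents from the previous task are never exposed. Here is the same fall-back-to-known-good-state pattern as a self-contained userspace sketch, with an explicit validity check standing in for the XRSTOR fault (all names illustrative):

#include <stdbool.h>
#include <stdio.h>

struct fpstate {
	unsigned int reserved_bits;	/* must be zero to be valid */
	double regs[8];
};

static const struct fpstate init_fpstate;	/* all zeroes: known good */
static struct fpstate live_regs;

static bool restore_ok(const struct fpstate *s)
{
	return s->reserved_bits == 0;	/* XRSTOR-style validity check */
}

static void restore_fpstate(const struct fpstate *s)
{
	if (restore_ok(s)) {
		live_regs = *s;
		return;
	}
	/* Fixup path: reinitialize instead of leaking stale contents. */
	fprintf(stderr, "Bad FPU state detected, reinitializing\n");
	live_regs = init_fpstate;
}

int main(void)
{
	struct fpstate corrupt = { .reserved_bits = 1 };

	restore_fpstate(&corrupt);
	printf("regs[0] = %g\n", live_regs.regs[0]);
	return 0;
}
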
*/ - info->si_pkey = vma_pkey(vma); + info->si_pkey = *pkey; } static void force_sig_info_fault(int si_signo, int si_code, unsigned long address, - struct task_struct *tsk, struct vm_area_struct *vma, - int fault) + struct task_struct *tsk, u32 *pkey, int fault) { unsigned lsb = 0; siginfo_t info; @@ -240,7 +238,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address, lsb = PAGE_SHIFT; info.si_addr_lsb = lsb; - fill_sig_info_pkey(si_code, &info, vma); + fill_sig_info_pkey(si_code, &info, pkey); force_sig_info(si_signo, &info, tsk); } @@ -762,8 +760,6 @@ no_context(struct pt_regs *regs, unsigned long error_code, struct task_struct *tsk = current; unsigned long flags; int sig; - /* No context means no VMA to pass down */ - struct vm_area_struct *vma = NULL; /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs, X86_TRAP_PF)) { @@ -788,7 +784,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, /* XXX: hwpoison faults will set the wrong code. */ force_sig_info_fault(signal, si_code, address, - tsk, vma, 0); + tsk, NULL, 0); } /* @@ -806,7 +802,6 @@ no_context(struct pt_regs *regs, unsigned long error_code, if (is_vmalloc_addr((void *)address) && (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) || address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) { - register void *__sp asm("rsp"); unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *); /* * We're likely to be running with very little stack space @@ -821,7 +816,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, asm volatile ("movq %[stack], %%rsp\n\t" "call handle_stack_overflow\n\t" "1: jmp 1b" - : "+r" (__sp) + : ASM_CALL_CONSTRAINT : "D" ("kernel stack overflow (page fault)"), "S" (regs), "d" (address), [stack] "rm" (stack)); @@ -897,8 +892,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, static void __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, - unsigned long address, struct vm_area_struct *vma, - int si_code) + unsigned long address, u32 *pkey, int si_code) { struct task_struct *tsk = current; @@ -946,7 +940,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_PF; - force_sig_info_fault(SIGSEGV, si_code, address, tsk, vma, 0); + force_sig_info_fault(SIGSEGV, si_code, address, tsk, pkey, 0); return; } @@ -959,9 +953,9 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, static noinline void bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, - unsigned long address, struct vm_area_struct *vma) + unsigned long address, u32 *pkey) { - __bad_area_nosemaphore(regs, error_code, address, vma, SEGV_MAPERR); + __bad_area_nosemaphore(regs, error_code, address, pkey, SEGV_MAPERR); } static void @@ -969,6 +963,10 @@ __bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address, struct vm_area_struct *vma, int si_code) { struct mm_struct *mm = current->mm; + u32 pkey; + + if (vma) + pkey = vma_pkey(vma); /* * Something tried to access memory that isn't in our memory map.. @@ -976,7 +974,8 @@ __bad_area(struct pt_regs *regs, unsigned long error_code, */ up_read(&mm->mmap_sem); - __bad_area_nosemaphore(regs, error_code, address, vma, si_code); + __bad_area_nosemaphore(regs, error_code, address, + (vma) ? 
&pkey : NULL, si_code); } static noinline void @@ -1019,7 +1018,7 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code, static void do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, - struct vm_area_struct *vma, unsigned int fault) + u32 *pkey, unsigned int fault) { struct task_struct *tsk = current; int code = BUS_ADRERR; @@ -1046,13 +1045,12 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, code = BUS_MCEERR_AR; } #endif - force_sig_info_fault(SIGBUS, code, address, tsk, vma, fault); + force_sig_info_fault(SIGBUS, code, address, tsk, pkey, fault); } static noinline void mm_fault_error(struct pt_regs *regs, unsigned long error_code, - unsigned long address, struct vm_area_struct *vma, - unsigned int fault) + unsigned long address, u32 *pkey, unsigned int fault) { if (fatal_signal_pending(current) && !(error_code & PF_USER)) { no_context(regs, error_code, address, 0, 0); @@ -1076,9 +1074,9 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, } else { if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| VM_FAULT_HWPOISON_LARGE)) - do_sigbus(regs, error_code, address, vma, fault); + do_sigbus(regs, error_code, address, pkey, fault); else if (fault & VM_FAULT_SIGSEGV) - bad_area_nosemaphore(regs, error_code, address, vma); + bad_area_nosemaphore(regs, error_code, address, pkey); else BUG(); } @@ -1268,6 +1266,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, struct mm_struct *mm; int fault, major = 0; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + u32 pkey; tsk = current; mm = tsk->mm; @@ -1468,9 +1467,10 @@ good_area: return; } + pkey = vma_pkey(vma); up_read(&mm->mmap_sem); if (unlikely(fault & VM_FAULT_ERROR)) { - mm_fault_error(regs, error_code, address, vma, fault); + mm_fault_error(regs, error_code, address, &pkey, fault); return; } diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 3fcc8e01683b..16c5f37933a2 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -10,6 +10,8 @@ * published by the Free Software Foundation. */ +#define DISABLE_BRANCH_PROFILING + #include <linux/linkage.h> #include <linux/init.h> #include <linux/mm.h> diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index 2dab69a706ec..d7bc0eea20a5 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -18,7 +18,6 @@ #include <asm/cpufeature.h> /* boot_cpu_has, ... */ #include <asm/mmu_context.h> /* vma_pkey() */ -#include <asm/fpu/internal.h> /* fpregs_active() */ int __execute_only_pkey(struct mm_struct *mm) { @@ -45,7 +44,7 @@ int __execute_only_pkey(struct mm_struct *mm) */ preempt_disable(); if (!need_to_set_mm_pkey && - fpregs_active() && + current->thread.fpu.initialized && !__pkru_allows_read(read_pkru(), execute_only_pkey)) { preempt_enable(); return execute_only_pkey; diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 1ab3821f9e26..658bf0090565 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -30,6 +30,8 @@ atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); +DEFINE_STATIC_KEY_TRUE(tlb_use_lazy_mode); + static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, u16 *new_asid, bool *need_flush) { @@ -80,7 +82,7 @@ void leave_mm(int cpu) return; /* Warn if we're not lazy. 
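The fault.c changes above replace the vm_area_struct pointer with a u32 snapshot taken via vma_pkey(vma) before up_read(&mm->mmap_sem): once the lock is dropped, the VMA may be freed, so only plain values may outlive it. A compilable sketch of that snapshot-then-unlock discipline, using a pthread rwlock in place of mmap_sem and illustrative names:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct vma {
	uint16_t pkey;
};

static pthread_rwlock_t mmap_sem = PTHREAD_RWLOCK_INITIALIZER;

static void deliver_signal(uint32_t *pkey)
{
	if (pkey)
		printf("SIGSEGV with si_pkey = %u\n", *pkey);
	else
		printf("SIGSEGV without pkey info\n");
}

static void handle_fault(struct vma *vma)
{
	uint32_t pkey = 0;

	pthread_rwlock_rdlock(&mmap_sem);
	/* Snapshot while the VMA is guaranteed to stay alive. */
	if (vma)
		pkey = vma->pkey;
	pthread_rwlock_unlock(&mmap_sem);

	/* vma must not be dereferenced past this point. */
	deliver_signal(vma ? &pkey : NULL);
}

int main(void)
{
	struct vma v = { .pkey = 4 };

	handle_fault(&v);
	handle_fault(NULL);
	return 0;
}
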
*/ - WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))); + WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy)); switch_mm(NULL, &init_mm, NULL); } @@ -126,8 +128,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * isn't free. */ #ifdef CONFIG_DEBUG_VM - if (WARN_ON_ONCE(__read_cr3() != - (__sme_pa(real_prev->pgd) | prev_asid))) { + if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) { /* * If we were to BUG here, we'd be very likely to kill * the system so hard that we don't see the call trace. @@ -143,45 +144,24 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, __flush_tlb_all(); } #endif + this_cpu_write(cpu_tlbstate.is_lazy, false); if (real_prev == next) { VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != next->context.ctx_id); - if (cpumask_test_cpu(cpu, mm_cpumask(next))) { - /* - * There's nothing to do: we weren't lazy, and we - * aren't changing our mm. We don't need to flush - * anything, nor do we need to update CR3, CR4, or - * LDTR. - */ - return; - } - - /* Resume remote flushes and then read tlb_gen. */ - cpumask_set_cpu(cpu, mm_cpumask(next)); - next_tlb_gen = atomic64_read(&next->context.tlb_gen); - - if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) < - next_tlb_gen) { - /* - * Ideally, we'd have a flush_tlb() variant that - * takes the known CR3 value as input. This would - * be faster on Xen PV and on hypothetical CPUs - * on which INVPCID is fast. - */ - this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen, - next_tlb_gen); - write_cr3(__sme_pa(next->pgd) | prev_asid); - trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, - TLB_FLUSH_ALL); - } - /* - * We just exited lazy mode, which means that CR4 and/or LDTR - * may be stale. (Changes to the required CR4 and LDTR states - * are not reflected in tlb_gen.) + * We don't currently support having a real mm loaded without + * our cpu set in mm_cpumask(). We have all the bookkeeping + * in place to figure out whether we would need to flush + * if our cpu were cleared in mm_cpumask(), but we don't + * currently use it. */ + if (WARN_ON_ONCE(real_prev != &init_mm && + !cpumask_test_cpu(cpu, mm_cpumask(next)))) + cpumask_set_cpu(cpu, mm_cpumask(next)); + + return; } else { u16 new_asid; bool need_flush; @@ -192,7 +172,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * mapped in the new pgd, we'll double-fault. Forcibly * map it. */ - unsigned int index = pgd_index(current_stack_pointer()); + unsigned int index = pgd_index(current_stack_pointer); pgd_t *pgd = next->pgd + index; if (unlikely(pgd_none(*pgd))) @@ -200,10 +180,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, } /* Stop remote flushes for the previous mm */ - if (cpumask_test_cpu(cpu, mm_cpumask(real_prev))) - cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); - - VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); + VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) && + real_prev != &init_mm); + cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); /* * Start remote flushes and then read tlb_gen. 
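The switch_mm_irqs_off() rework above pairs with an explicit per-CPU cpu_tlbstate.is_lazy flag (see enter_lazy_tlb() in the following hunks) instead of inferring laziness from mm_cpumask() membership; a flush request that arrives while lazy can then simply switch to init_mm rather than flushing. A single-CPU userspace model of that state machine, with illustrative names:

#include <stdbool.h>
#include <stdio.h>

struct mm { const char *name; };
static struct mm init_mm = { "init_mm" };

static struct {
	struct mm *loaded_mm;
	bool is_lazy;
} tlbstate = { &init_mm, false };

static void switch_mm(struct mm *next)
{
	tlbstate.is_lazy = false;	/* returning to a real mm */
	if (tlbstate.loaded_mm != next) {
		tlbstate.loaded_mm = next;
		printf("write_cr3 -> %s\n", next->name);
	}
}

static void enter_lazy_tlb(void)
{
	if (tlbstate.loaded_mm != &init_mm)
		tlbstate.is_lazy = true;	/* defer the expensive switch */
}

static void flush_tlb_func(void)
{
	if (tlbstate.is_lazy) {
		/* Barely slower than a minimal flush, and always safe:
		 * stop using the old page tables entirely. */
		switch_mm(&init_mm);
		return;
	}
	printf("flush TLB for %s\n", tlbstate.loaded_mm->name);
}

int main(void)
{
	struct mm task_mm = { "task_mm" };

	switch_mm(&task_mm);
	enter_lazy_tlb();
	flush_tlb_func();	/* lazy: switches to init_mm instead */
	return 0;
}
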
@@ -216,12 +195,12 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, if (need_flush) { this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); - write_cr3(__sme_pa(next->pgd) | new_asid); + write_cr3(build_cr3(next, new_asid)); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } else { /* The new ASID is already up to date. */ - write_cr3(__sme_pa(next->pgd) | new_asid | CR3_NOFLUSH); + write_cr3(build_cr3_noflush(next, new_asid)); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0); } @@ -234,6 +213,37 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, } /* + * enter_lazy_tlb() is a hint from the scheduler that we are entering a + * kernel thread or other context without an mm. Acceptable implementations + * include doing nothing whatsoever, switching to init_mm, or various clever + * lazy tricks to try to minimize TLB flushes. + * + * The scheduler reserves the right to call enter_lazy_tlb() several times + * in a row. It will notify us that we're going back to a real mm by + * calling switch_mm_irqs_off(). + */ +void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) +{ + if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) + return; + + if (static_branch_unlikely(&tlb_use_lazy_mode)) { + /* + * There's a significant optimization that may be possible + * here. We have accurate enough TLB flush tracking that we + * don't need to maintain coherence of TLB per se when we're + * lazy. We do, however, need to maintain coherence of + * paging-structure caches. We could, in principle, leave our + * old mm loaded and only switch to init_mm when + * tlb_remove_page() happens. + */ + this_cpu_write(cpu_tlbstate.is_lazy, true); + } else { + switch_mm(NULL, &init_mm, NULL); + } +} + +/* * Call this when reinitializing a CPU. It fixes the following potential * problems: * @@ -265,7 +275,7 @@ void initialize_tlbstate_and_flush(void) !(cr4_read_shadow() & X86_CR4_PCIDE)); /* Force ASID 0 and force a TLB flush. */ - write_cr3(cr3 & ~CR3_PCID_MASK); + write_cr3(build_cr3(mm, 0)); /* Reinitialize tlbstate. */ this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); @@ -304,16 +314,20 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, /* This code cannot presently handle being reentered. */ VM_WARN_ON(!irqs_disabled()); + if (unlikely(loaded_mm == &init_mm)) + return; + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != loaded_mm->context.ctx_id); - if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) { + if (this_cpu_read(cpu_tlbstate.is_lazy)) { /* - * We're in lazy mode -- don't flush. We can get here on - * remote flushes due to races and on local flushes if a - * kernel thread coincidentally flushes the mm it's lazily - * still using. + * We're in lazy mode. We need to at least flush our + * paging-structure cache to avoid speculatively reading + * garbage into our TLB. Since switching to init_mm is barely + * slower than a minimal flush, just switch to init_mm. */ + switch_mm_irqs_off(NULL, &init_mm, NULL); return; } @@ -612,3 +626,57 @@ static int __init create_tlb_single_page_flush_ceiling(void) return 0; } late_initcall(create_tlb_single_page_flush_ceiling); + +static ssize_t tlblazy_read_file(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[2]; + + buf[0] = static_branch_likely(&tlb_use_lazy_mode) ? 
'1' : '0'; + buf[1] = '\n'; + + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); +} + +static ssize_t tlblazy_write_file(struct file *file, + const char __user *user_buf, size_t count, loff_t *ppos) +{ + bool val; + + if (kstrtobool_from_user(user_buf, count, &val)) + return -EINVAL; + + if (val) + static_branch_enable(&tlb_use_lazy_mode); + else + static_branch_disable(&tlb_use_lazy_mode); + + return count; +} + +static const struct file_operations fops_tlblazy = { + .read = tlblazy_read_file, + .write = tlblazy_write_file, + .llseek = default_llseek, +}; + +static int __init init_tlb_use_lazy_mode(void) +{ + if (boot_cpu_has(X86_FEATURE_PCID)) { + /* + * Heuristic: with PCID on, switching to and from + * init_mm is reasonably fast, but remote flush IPIs + * are as expensive as ever, so turn off lazy TLB mode. + * + * We can't do this in setup_pcid() because static keys + * haven't been initialized yet, and it would blow up + * badly. + */ + static_branch_disable(&tlb_use_lazy_mode); + } + + debugfs_create_file("tlb_use_lazy_mode", S_IRUSR | S_IWUSR, + arch_debugfs_dir, NULL, &fops_tlblazy); + return 0; +} +late_initcall(init_tlb_use_lazy_mode); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 8c9573660d51..0554e8aef4d5 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -284,9 +284,9 @@ static void emit_bpf_tail_call(u8 **pprog) /* if (index >= array->map.max_entries) * goto out; */ - EMIT4(0x48, 0x8B, 0x46, /* mov rax, qword ptr [rsi + 16] */ + EMIT2(0x89, 0xD2); /* mov edx, edx */ + EMIT3(0x39, 0x56, /* cmp dword ptr [rsi + 16], edx */ offsetof(struct bpf_array, map.max_entries)); - EMIT3(0x48, 0x39, 0xD0); /* cmp rax, rdx */ #define OFFSET1 43 /* number of bytes to jump */ EMIT2(X86_JBE, OFFSET1); /* jbe out */ label1 = cnt; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 0e7ef69e8531..d669e9d89001 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -93,11 +93,11 @@ int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int), int rc; rc = cpuhp_setup_state_nocalls(CPUHP_XEN_PREPARE, - "x86/xen/hvm_guest:prepare", + "x86/xen/guest:prepare", cpu_up_prepare_cb, cpu_dead_cb); if (rc >= 0) { rc = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, - "x86/xen/hvm_guest:online", + "x86/xen/guest:online", xen_cpu_up_online, NULL); if (rc < 0) cpuhp_remove_state_nocalls(CPUHP_XEN_PREPARE); diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 509f560bd0c6..71495f1a86d7 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1238,21 +1238,16 @@ static void __init xen_pagetable_cleanhighmap(void) * from _brk_limit way up to the max_pfn_mapped (which is the end of * the ramdisk). We continue on, erasing PMD entries that point to page * tables - do note that they are accessible at this stage via __va. - * For good measure we also round up to the PMD - which means that if + * As Xen is aligning the memory end to a 4MB boundary, for good + * measure we also round up to PMD_SIZE * 2 - which means that if * anybody is using a __ka address to the initial boot-stack - and tries * to use it - they are going to crash. The xen_start_info has been * taken care of already in xen_setup_kernel_pagetable.
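tlblazy_read_file()/tlblazy_write_file() above follow the usual debugfs boolean-knob shape: report '0' or '1' plus a newline on read, parse a user-supplied boolean with kstrtobool_from_user() and flip a switch (here a static key) on write. A standalone sketch of that read/write pair, with a plain bool in place of the static key and illustrative names:

#include <stdbool.h>
#include <stdio.h>

static bool use_lazy_mode = true;

static int knob_read(char *buf, size_t len)
{
	if (len < 3)
		return -1;
	buf[0] = use_lazy_mode ? '1' : '0';
	buf[1] = '\n';
	buf[2] = '\0';
	return 2;	/* bytes produced, like simple_read_from_buffer() */
}

static int knob_write(const char *buf)
{
	/* Minimal kstrtobool analogue: accept 0/1/y/n. */
	switch (buf[0]) {
	case '1': case 'y': case 'Y':
		use_lazy_mode = true;
		return 0;
	case '0': case 'n': case 'N':
		use_lazy_mode = false;
		return 0;
	default:
		return -1;	/* -EINVAL analogue */
	}
}

int main(void)
{
	char buf[4];

	knob_write("0");
	knob_read(buf, sizeof(buf));
	fputs(buf, stdout);	/* prints "0" */
	return 0;
}
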
*/ addr = xen_start_info->pt_base; - size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE); + size = xen_start_info->nr_pt_frames * PAGE_SIZE; - xen_cleanhighmap(addr, addr + size); + xen_cleanhighmap(addr, roundup(addr + size, PMD_SIZE * 2)); xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base)); -#ifdef DEBUG - /* This is superfluous and is not necessary, but you know what - * lets do it. The MODULES_VADDR -> MODULES_END should be clear of - * anything at this stage. */ - xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1); -#endif } #endif @@ -2220,7 +2215,7 @@ static void __init xen_write_cr3_init(unsigned long cr3) * not the first page table in the page table pool. * Iterate through the initial page tables to find the real page table base. */ -static phys_addr_t xen_find_pt_base(pmd_t *pmd) +static phys_addr_t __init xen_find_pt_base(pmd_t *pmd) { phys_addr_t pt_base, paddr; unsigned pmdidx; |
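The xen_pagetable_cleanhighmap() change above stops rounding the size of the cleaned range and instead rounds the end address itself up to PMD_SIZE * 2, matching Xen's 4MB alignment of the memory end. A small arithmetic sketch comparing the two computations (the addresses are made up for illustration):

#include <stdint.h>
#include <stdio.h>

#define PMD_SIZE (2UL << 20)	/* 2MB on x86-64 */

static uint64_t roundup(uint64_t x, uint64_t align)
{
	return (x + align - 1) / align * align;
}

int main(void)
{
	uint64_t addr = 0x40300000;	/* hypothetical pt_base */
	uint64_t size = 5 * 0x1000;	/* 5 page-table frames */

	/* Old: round the size relative to the start address. */
	uint64_t old_end = addr + roundup(size, PMD_SIZE);
	/* New: round the absolute end to a 4MB boundary. */
	uint64_t new_end = roundup(addr + size, 2 * PMD_SIZE);

	printf("old end: %#llx\n", (unsigned long long)old_end);
	printf("new end: %#llx\n", (unsigned long long)new_end);
	return 0;
}

The two results differ whenever pt_base is not itself 4MB-aligned, which is exactly the situation the comment about the Xen-aligned memory end describes.
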