Diffstat (limited to 'arch/x86/crypto')
22 files changed, 509 insertions, 494 deletions
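The hunks below apply two mechanical transformations to the x86 crypto assembly: loads of constants switch from absolute addressing to RIP-relative addressing, which makes the references position-independent, and bare assembler labels gain a ".L" prefix so GAS keeps them local to the object file instead of emitting them into the symbol table. The following is a minimal illustrative sketch of both patterns, not taken from the patch itself; the constant, label, and register choices (.Lexample_mask, .Lenc_loop, %r13) are made up for illustration.

	.section .rodata
	.align	16
.Lexample_mask:				# hypothetical constant
	.octa	0x0f0e0d0c0b0a09080706050403020100

	.text
	# Before: absolute 32-bit displacement; needs a relocation and
	# breaks if the code does not run at its link-time address.
	movdqa	.Lexample_mask, %xmm0

	# After: RIP-relative operand, position-independent.
	movdqa	.Lexample_mask(%rip), %xmm0

	# Labels prefixed with ".L" are assembler-local, so they do not
	# clutter the symbol table or stack traces.
.Lenc_loop:				# hypothetical loop label
	dec	%r13
	jnz	.Lenc_loop

The same pattern also covers table lookups: where an instruction cannot encode "table(, index, 8)" RIP-relatively, the patch first materializes the table address with "leaq table(%rip), reg" and then indexes off that register, as seen in the camellia, cast5/6, and des3_ede hunks.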
diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S index cdf3215ec272..ad7f4c891625 100644 --- a/arch/x86/crypto/aegis128-aesni-asm.S +++ b/arch/x86/crypto/aegis128-aesni-asm.S @@ -201,8 +201,8 @@ SYM_FUNC_START(crypto_aegis128_aesni_init) movdqa KEY, STATE4 /* load the constants: */ - movdqa .Laegis128_const_0, STATE2 - movdqa .Laegis128_const_1, STATE1 + movdqa .Laegis128_const_0(%rip), STATE2 + movdqa .Laegis128_const_1(%rip), STATE1 pxor STATE2, STATE3 pxor STATE1, STATE4 @@ -682,7 +682,7 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail) punpcklbw T0, T0 punpcklbw T0, T0 punpcklbw T0, T0 - movdqa .Laegis128_counter, T1 + movdqa .Laegis128_counter(%rip), T1 pcmpgtb T1, T0 pand T0, MSG diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 837c1e0aa021..3ac7487ecad2 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -288,53 +288,53 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff # Encrypt/Decrypt first few blocks and $(3<<4), %r12 - jz _initial_num_blocks_is_0_\@ + jz .L_initial_num_blocks_is_0_\@ cmp $(2<<4), %r12 - jb _initial_num_blocks_is_1_\@ - je _initial_num_blocks_is_2_\@ -_initial_num_blocks_is_3_\@: + jb .L_initial_num_blocks_is_1_\@ + je .L_initial_num_blocks_is_2_\@ +.L_initial_num_blocks_is_3_\@: INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation sub $48, %r13 - jmp _initial_blocks_\@ -_initial_num_blocks_is_2_\@: + jmp .L_initial_blocks_\@ +.L_initial_num_blocks_is_2_\@: INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation sub $32, %r13 - jmp _initial_blocks_\@ -_initial_num_blocks_is_1_\@: + jmp .L_initial_blocks_\@ +.L_initial_num_blocks_is_1_\@: INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation sub $16, %r13 - jmp _initial_blocks_\@ -_initial_num_blocks_is_0_\@: + jmp .L_initial_blocks_\@ +.L_initial_num_blocks_is_0_\@: INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation -_initial_blocks_\@: +.L_initial_blocks_\@: # Main loop - Encrypt/Decrypt remaining blocks test %r13, %r13 - je _zero_cipher_left_\@ + je .L_zero_cipher_left_\@ sub $64, %r13 - je _four_cipher_left_\@ -_crypt_by_4_\@: + je .L_four_cipher_left_\@ +.L_crypt_by_4_\@: GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \ %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \ %xmm7, %xmm8, enc add $64, %r11 sub $64, %r13 - jne _crypt_by_4_\@ -_four_cipher_left_\@: + jne .L_crypt_by_4_\@ +.L_four_cipher_left_\@: GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 -_zero_cipher_left_\@: +.L_zero_cipher_left_\@: movdqu %xmm8, AadHash(%arg2) movdqu %xmm0, CurCount(%arg2) mov %arg5, %r13 and $15, %r13 # %r13 = arg5 (mod 16) - je _multiple_of_16_bytes_\@ + je .L_multiple_of_16_bytes_\@ mov %r13, PBlockLen(%arg2) @@ -348,14 +348,14 @@ _zero_cipher_left_\@: movdqu %xmm0, PBlockEncKey(%arg2) cmp $16, %arg5 - jge _large_enough_update_\@ + jge .L_large_enough_update_\@ lea (%arg4,%r11,1), %r10 mov %r13, %r12 READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1 - jmp _data_read_\@ + jmp .L_data_read_\@ -_large_enough_update_\@: +.L_large_enough_update_\@: sub $16, %r11 add %r13, %r11 @@ -374,7 
+374,7 @@ _large_enough_update_\@: # shift right 16-r13 bytes pshufb %xmm2, %xmm1 -_data_read_\@: +.L_data_read_\@: lea ALL_F+16(%rip), %r12 sub %r13, %r12 @@ -409,19 +409,19 @@ _data_read_\@: # Output %r13 bytes movq %xmm0, %rax cmp $8, %r13 - jle _less_than_8_bytes_left_\@ + jle .L_less_than_8_bytes_left_\@ mov %rax, (%arg3 , %r11, 1) add $8, %r11 psrldq $8, %xmm0 movq %xmm0, %rax sub $8, %r13 -_less_than_8_bytes_left_\@: +.L_less_than_8_bytes_left_\@: mov %al, (%arg3, %r11, 1) add $1, %r11 shr $8, %rax sub $1, %r13 - jne _less_than_8_bytes_left_\@ -_multiple_of_16_bytes_\@: + jne .L_less_than_8_bytes_left_\@ +.L_multiple_of_16_bytes_\@: .endm # GCM_COMPLETE Finishes update of tag of last partial block @@ -434,11 +434,11 @@ _multiple_of_16_bytes_\@: mov PBlockLen(%arg2), %r12 test %r12, %r12 - je _partial_done\@ + je .L_partial_done\@ GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 -_partial_done\@: +.L_partial_done\@: mov AadLen(%arg2), %r12 # %r13 = aadLen (number of bytes) shl $3, %r12 # convert into number of bits movd %r12d, %xmm15 # len(A) in %xmm15 @@ -457,44 +457,44 @@ _partial_done\@: movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0) pxor %xmm8, %xmm0 -_return_T_\@: +.L_return_T_\@: mov \AUTHTAG, %r10 # %r10 = authTag mov \AUTHTAGLEN, %r11 # %r11 = auth_tag_len cmp $16, %r11 - je _T_16_\@ + je .L_T_16_\@ cmp $8, %r11 - jl _T_4_\@ -_T_8_\@: + jl .L_T_4_\@ +.L_T_8_\@: movq %xmm0, %rax mov %rax, (%r10) add $8, %r10 sub $8, %r11 psrldq $8, %xmm0 test %r11, %r11 - je _return_T_done_\@ -_T_4_\@: + je .L_return_T_done_\@ +.L_T_4_\@: movd %xmm0, %eax mov %eax, (%r10) add $4, %r10 sub $4, %r11 psrldq $4, %xmm0 test %r11, %r11 - je _return_T_done_\@ -_T_123_\@: + je .L_return_T_done_\@ +.L_T_123_\@: movd %xmm0, %eax cmp $2, %r11 - jl _T_1_\@ + jl .L_T_1_\@ mov %ax, (%r10) cmp $2, %r11 - je _return_T_done_\@ + je .L_return_T_done_\@ add $2, %r10 sar $16, %eax -_T_1_\@: +.L_T_1_\@: mov %al, (%r10) - jmp _return_T_done_\@ -_T_16_\@: + jmp .L_return_T_done_\@ +.L_T_16_\@: movdqu %xmm0, (%r10) -_return_T_done_\@: +.L_return_T_done_\@: .endm #ifdef __x86_64__ @@ -563,30 +563,30 @@ _return_T_done_\@: # Clobbers %rax, DLEN and XMM1 .macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst cmp $8, \DLEN - jl _read_lt8_\@ + jl .L_read_lt8_\@ mov (\DPTR), %rax movq %rax, \XMMDst sub $8, \DLEN - jz _done_read_partial_block_\@ + jz .L_done_read_partial_block_\@ xor %eax, %eax -_read_next_byte_\@: +.L_read_next_byte_\@: shl $8, %rax mov 7(\DPTR, \DLEN, 1), %al dec \DLEN - jnz _read_next_byte_\@ + jnz .L_read_next_byte_\@ movq %rax, \XMM1 pslldq $8, \XMM1 por \XMM1, \XMMDst - jmp _done_read_partial_block_\@ -_read_lt8_\@: + jmp .L_done_read_partial_block_\@ +.L_read_lt8_\@: xor %eax, %eax -_read_next_byte_lt8_\@: +.L_read_next_byte_lt8_\@: shl $8, %rax mov -1(\DPTR, \DLEN, 1), %al dec \DLEN - jnz _read_next_byte_lt8_\@ + jnz .L_read_next_byte_lt8_\@ movq %rax, \XMMDst -_done_read_partial_block_\@: +.L_done_read_partial_block_\@: .endm # CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. 
@@ -600,8 +600,8 @@ _done_read_partial_block_\@: pxor \TMP6, \TMP6 cmp $16, %r11 - jl _get_AAD_rest\@ -_get_AAD_blocks\@: + jl .L_get_AAD_rest\@ +.L_get_AAD_blocks\@: movdqu (%r10), \TMP7 pshufb %xmm14, \TMP7 # byte-reflect the AAD data pxor \TMP7, \TMP6 @@ -609,14 +609,14 @@ _get_AAD_blocks\@: add $16, %r10 sub $16, %r11 cmp $16, %r11 - jge _get_AAD_blocks\@ + jge .L_get_AAD_blocks\@ movdqu \TMP6, \TMP7 /* read the last <16B of AAD */ -_get_AAD_rest\@: +.L_get_AAD_rest\@: test %r11, %r11 - je _get_AAD_done\@ + je .L_get_AAD_done\@ READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7 pshufb %xmm14, \TMP7 # byte-reflect the AAD data @@ -624,7 +624,7 @@ _get_AAD_rest\@: GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5 movdqu \TMP7, \TMP6 -_get_AAD_done\@: +.L_get_AAD_done\@: movdqu \TMP6, AadHash(%arg2) .endm @@ -637,21 +637,21 @@ _get_AAD_done\@: AAD_HASH operation mov PBlockLen(%arg2), %r13 test %r13, %r13 - je _partial_block_done_\@ # Leave Macro if no partial blocks + je .L_partial_block_done_\@ # Leave Macro if no partial blocks # Read in input data without over reading cmp $16, \PLAIN_CYPH_LEN - jl _fewer_than_16_bytes_\@ + jl .L_fewer_than_16_bytes_\@ movups (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm - jmp _data_read_\@ + jmp .L_data_read_\@ -_fewer_than_16_bytes_\@: +.L_fewer_than_16_bytes_\@: lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10 mov \PLAIN_CYPH_LEN, %r12 READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1 mov PBlockLen(%arg2), %r13 -_data_read_\@: # Finished reading in data +.L_data_read_\@: # Finished reading in data movdqu PBlockEncKey(%arg2), %xmm9 movdqu HashKey(%arg2), %xmm13 @@ -674,9 +674,9 @@ _data_read_\@: # Finished reading in data sub $16, %r10 # Determine if if partial block is not being filled and # shift mask accordingly - jge _no_extra_mask_1_\@ + jge .L_no_extra_mask_1_\@ sub %r10, %r12 -_no_extra_mask_1_\@: +.L_no_extra_mask_1_\@: movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out bottom r13 bytes of xmm9 @@ -689,17 +689,17 @@ _no_extra_mask_1_\@: pxor %xmm3, \AAD_HASH test %r10, %r10 - jl _partial_incomplete_1_\@ + jl .L_partial_incomplete_1_\@ # GHASH computation for the last <16 Byte block GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 xor %eax, %eax mov %rax, PBlockLen(%arg2) - jmp _dec_done_\@ -_partial_incomplete_1_\@: + jmp .L_dec_done_\@ +.L_partial_incomplete_1_\@: add \PLAIN_CYPH_LEN, PBlockLen(%arg2) -_dec_done_\@: +.L_dec_done_\@: movdqu \AAD_HASH, AadHash(%arg2) .else pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn) @@ -710,9 +710,9 @@ _dec_done_\@: sub $16, %r10 # Determine if if partial block is not being filled and # shift mask accordingly - jge _no_extra_mask_2_\@ + jge .L_no_extra_mask_2_\@ sub %r10, %r12 -_no_extra_mask_2_\@: +.L_no_extra_mask_2_\@: movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out bottom r13 bytes of xmm9 @@ -724,17 +724,17 @@ _no_extra_mask_2_\@: pxor %xmm9, \AAD_HASH test %r10, %r10 - jl _partial_incomplete_2_\@ + jl .L_partial_incomplete_2_\@ # GHASH computation for the last <16 Byte block GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 xor %eax, %eax mov %rax, PBlockLen(%arg2) - jmp _encode_done_\@ -_partial_incomplete_2_\@: + jmp .L_encode_done_\@ +.L_partial_incomplete_2_\@: add \PLAIN_CYPH_LEN, PBlockLen(%arg2) -_encode_done_\@: +.L_encode_done_\@: movdqu \AAD_HASH, AadHash(%arg2) movdqa SHUF_MASK(%rip), %xmm10 @@ -744,32 +744,32 @@ _encode_done_\@: .endif # output encrypted Bytes test %r10, %r10 - jl _partial_fill_\@ + jl 
.L_partial_fill_\@ mov %r13, %r12 mov $16, %r13 # Set r13 to be the number of bytes to write out sub %r12, %r13 - jmp _count_set_\@ -_partial_fill_\@: + jmp .L_count_set_\@ +.L_partial_fill_\@: mov \PLAIN_CYPH_LEN, %r13 -_count_set_\@: +.L_count_set_\@: movdqa %xmm9, %xmm0 movq %xmm0, %rax cmp $8, %r13 - jle _less_than_8_bytes_left_\@ + jle .L_less_than_8_bytes_left_\@ mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) add $8, \DATA_OFFSET psrldq $8, %xmm0 movq %xmm0, %rax sub $8, %r13 -_less_than_8_bytes_left_\@: +.L_less_than_8_bytes_left_\@: movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) add $1, \DATA_OFFSET shr $8, %rax sub $1, %r13 - jne _less_than_8_bytes_left_\@ -_partial_block_done_\@: + jne .L_less_than_8_bytes_left_\@ +.L_partial_block_done_\@: .endm # PARTIAL_BLOCK /* @@ -813,14 +813,14 @@ _partial_block_done_\@: shr $2,%eax # 128->4, 192->6, 256->8 add $5,%eax # 128->9, 192->11, 256->13 -aes_loop_initial_\@: +.Laes_loop_initial_\@: MOVADQ (%r10),\TMP1 .irpc index, \i_seq aesenc \TMP1, %xmm\index .endr add $16,%r10 sub $1,%eax - jnz aes_loop_initial_\@ + jnz .Laes_loop_initial_\@ MOVADQ (%r10), \TMP1 .irpc index, \i_seq @@ -861,7 +861,7 @@ aes_loop_initial_\@: GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 .endif cmp $64, %r13 - jl _initial_blocks_done\@ + jl .L_initial_blocks_done\@ # no need for precomputed values /* * @@ -908,18 +908,18 @@ aes_loop_initial_\@: mov keysize,%eax shr $2,%eax # 128->4, 192->6, 256->8 sub $4,%eax # 128->0, 192->2, 256->4 - jz aes_loop_pre_done\@ + jz .Laes_loop_pre_done\@ -aes_loop_pre_\@: +.Laes_loop_pre_\@: MOVADQ (%r10),\TMP2 .irpc index, 1234 aesenc \TMP2, %xmm\index .endr add $16,%r10 sub $1,%eax - jnz aes_loop_pre_\@ + jnz .Laes_loop_pre_\@ -aes_loop_pre_done\@: +.Laes_loop_pre_done\@: MOVADQ (%r10), \TMP2 aesenclast \TMP2, \XMM1 aesenclast \TMP2, \XMM2 @@ -963,7 +963,7 @@ aes_loop_pre_done\@: pshufb %xmm14, \XMM3 # perform a 16 byte swap pshufb %xmm14, \XMM4 # perform a 16 byte swap -_initial_blocks_done\@: +.L_initial_blocks_done\@: .endm @@ -1095,18 +1095,18 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation mov keysize,%eax shr $2,%eax # 128->4, 192->6, 256->8 sub $4,%eax # 128->0, 192->2, 256->4 - jz aes_loop_par_enc_done\@ + jz .Laes_loop_par_enc_done\@ -aes_loop_par_enc\@: +.Laes_loop_par_enc\@: MOVADQ (%r10),\TMP3 .irpc index, 1234 aesenc \TMP3, %xmm\index .endr add $16,%r10 sub $1,%eax - jnz aes_loop_par_enc\@ + jnz .Laes_loop_par_enc\@ -aes_loop_par_enc_done\@: +.Laes_loop_par_enc_done\@: MOVADQ (%r10), \TMP3 aesenclast \TMP3, \XMM1 # Round 10 aesenclast \TMP3, \XMM2 @@ -1303,18 +1303,18 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation mov keysize,%eax shr $2,%eax # 128->4, 192->6, 256->8 sub $4,%eax # 128->0, 192->2, 256->4 - jz aes_loop_par_dec_done\@ + jz .Laes_loop_par_dec_done\@ -aes_loop_par_dec\@: +.Laes_loop_par_dec\@: MOVADQ (%r10),\TMP3 .irpc index, 1234 aesenc \TMP3, %xmm\index .endr add $16,%r10 sub $1,%eax - jnz aes_loop_par_dec\@ + jnz .Laes_loop_par_dec\@ -aes_loop_par_dec_done\@: +.Laes_loop_par_dec_done\@: MOVADQ (%r10), \TMP3 aesenclast \TMP3, \XMM1 # last round aesenclast \TMP3, \XMM2 @@ -2717,7 +2717,7 @@ SYM_FUNC_END(aesni_cts_cbc_dec) * BSWAP_MASK == endian swapping mask */ SYM_FUNC_START_LOCAL(_aesni_inc_init) - movaps .Lbswap_mask, BSWAP_MASK + movaps .Lbswap_mask(%rip), BSWAP_MASK movaps IV, CTR pshufb BSWAP_MASK, CTR mov $1, TCTR_LOW diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index 0852ab573fd3..46cddd78857b 100644 --- 
a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -154,30 +154,6 @@ SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 ALL_F: .octa 0xffffffffffffffffffffffffffffffff .octa 0x00000000000000000000000000000000 -.section .rodata -.align 16 -.type aad_shift_arr, @object -.size aad_shift_arr, 272 -aad_shift_arr: - .octa 0xffffffffffffffffffffffffffffffff - .octa 0xffffffffffffffffffffffffffffff0C - .octa 0xffffffffffffffffffffffffffff0D0C - .octa 0xffffffffffffffffffffffffff0E0D0C - .octa 0xffffffffffffffffffffffff0F0E0D0C - .octa 0xffffffffffffffffffffff0C0B0A0908 - .octa 0xffffffffffffffffffff0D0C0B0A0908 - .octa 0xffffffffffffffffff0E0D0C0B0A0908 - .octa 0xffffffffffffffff0F0E0D0C0B0A0908 - .octa 0xffffffffffffff0C0B0A090807060504 - .octa 0xffffffffffff0D0C0B0A090807060504 - .octa 0xffffffffff0E0D0C0B0A090807060504 - .octa 0xffffffff0F0E0D0C0B0A090807060504 - .octa 0xffffff0C0B0A09080706050403020100 - .octa 0xffff0D0C0B0A09080706050403020100 - .octa 0xff0E0D0C0B0A09080706050403020100 - .octa 0x0F0E0D0C0B0A09080706050403020100 - - .text @@ -302,68 +278,68 @@ VARIABLE_OFFSET = 16*8 mov %r13, %r12 shr $4, %r12 and $7, %r12 - jz _initial_num_blocks_is_0\@ + jz .L_initial_num_blocks_is_0\@ cmp $7, %r12 - je _initial_num_blocks_is_7\@ + je .L_initial_num_blocks_is_7\@ cmp $6, %r12 - je _initial_num_blocks_is_6\@ + je .L_initial_num_blocks_is_6\@ cmp $5, %r12 - je _initial_num_blocks_is_5\@ + je .L_initial_num_blocks_is_5\@ cmp $4, %r12 - je _initial_num_blocks_is_4\@ + je .L_initial_num_blocks_is_4\@ cmp $3, %r12 - je _initial_num_blocks_is_3\@ + je .L_initial_num_blocks_is_3\@ cmp $2, %r12 - je _initial_num_blocks_is_2\@ + je .L_initial_num_blocks_is_2\@ - jmp _initial_num_blocks_is_1\@ + jmp .L_initial_num_blocks_is_1\@ -_initial_num_blocks_is_7\@: +.L_initial_num_blocks_is_7\@: \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $16*7, %r13 - jmp _initial_blocks_encrypted\@ + jmp .L_initial_blocks_encrypted\@ -_initial_num_blocks_is_6\@: +.L_initial_num_blocks_is_6\@: \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $16*6, %r13 - jmp _initial_blocks_encrypted\@ + jmp .L_initial_blocks_encrypted\@ -_initial_num_blocks_is_5\@: +.L_initial_num_blocks_is_5\@: \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $16*5, %r13 - jmp _initial_blocks_encrypted\@ + jmp .L_initial_blocks_encrypted\@ -_initial_num_blocks_is_4\@: +.L_initial_num_blocks_is_4\@: \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $16*4, %r13 - jmp _initial_blocks_encrypted\@ + jmp .L_initial_blocks_encrypted\@ -_initial_num_blocks_is_3\@: +.L_initial_num_blocks_is_3\@: \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $16*3, %r13 - jmp _initial_blocks_encrypted\@ + jmp .L_initial_blocks_encrypted\@ -_initial_num_blocks_is_2\@: +.L_initial_num_blocks_is_2\@: \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $16*2, %r13 - jmp _initial_blocks_encrypted\@ + jmp 
.L_initial_blocks_encrypted\@ -_initial_num_blocks_is_1\@: +.L_initial_num_blocks_is_1\@: \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $16*1, %r13 - jmp _initial_blocks_encrypted\@ + jmp .L_initial_blocks_encrypted\@ -_initial_num_blocks_is_0\@: +.L_initial_num_blocks_is_0\@: \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC -_initial_blocks_encrypted\@: +.L_initial_blocks_encrypted\@: test %r13, %r13 - je _zero_cipher_left\@ + je .L_zero_cipher_left\@ sub $128, %r13 - je _eight_cipher_left\@ + je .L_eight_cipher_left\@ @@ -373,9 +349,9 @@ _initial_blocks_encrypted\@: vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 -_encrypt_by_8_new\@: +.L_encrypt_by_8_new\@: cmp $(255-8), %r15d - jg _encrypt_by_8\@ + jg .L_encrypt_by_8\@ @@ -383,30 +359,30 @@ _encrypt_by_8_new\@: \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC add $128, %r11 sub $128, %r13 - jne _encrypt_by_8_new\@ + jne .L_encrypt_by_8_new\@ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - jmp _eight_cipher_left\@ + jmp .L_eight_cipher_left\@ -_encrypt_by_8\@: +.L_encrypt_by_8\@: vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 add $8, %r15b \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 add $128, %r11 sub $128, %r13 - jne _encrypt_by_8_new\@ + jne .L_encrypt_by_8_new\@ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 -_eight_cipher_left\@: +.L_eight_cipher_left\@: \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 -_zero_cipher_left\@: +.L_zero_cipher_left\@: vmovdqu %xmm14, AadHash(arg2) vmovdqu %xmm9, CurCount(arg2) @@ -414,7 +390,7 @@ _zero_cipher_left\@: mov arg5, %r13 and $15, %r13 # r13 = (arg5 mod 16) - je _multiple_of_16_bytes\@ + je .L_multiple_of_16_bytes\@ # handle the last <16 Byte block separately @@ -428,7 +404,7 @@ _zero_cipher_left\@: vmovdqu %xmm9, PBlockEncKey(arg2) cmp $16, arg5 - jge _large_enough_update\@ + jge .L_large_enough_update\@ lea (arg4,%r11,1), %r10 mov %r13, %r12 @@ -440,9 +416,9 @@ _zero_cipher_left\@: # able to shift 16-r13 bytes (r13 is the # number of bytes in plaintext mod 16) - jmp _final_ghash_mul\@ + jmp .L_final_ghash_mul\@ -_large_enough_update\@: +.L_large_enough_update\@: sub $16, %r11 add %r13, %r11 @@ -461,7 +437,7 @@ _large_enough_update\@: # shift right 16-r13 bytes vpshufb %xmm2, %xmm1, %xmm1 -_final_ghash_mul\@: +.L_final_ghash_mul\@: .if \ENC_DEC == DEC vmovdqa %xmm1, %xmm2 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) @@ -490,7 +466,7 @@ _final_ghash_mul\@: # output r13 Bytes vmovq %xmm9, %rax cmp $8, %r13 - jle _less_than_8_bytes_left\@ + jle .L_less_than_8_bytes_left\@ mov %rax, (arg3 , %r11) add $8, %r11 @@ -498,15 +474,15 @@ _final_ghash_mul\@: vmovq %xmm9, %rax sub $8, %r13 -_less_than_8_bytes_left\@: +.L_less_than_8_bytes_left\@: movb %al, (arg3 , %r11) add $1, %r11 shr $8, %rax sub $1, %r13 - jne _less_than_8_bytes_left\@ + jne .L_less_than_8_bytes_left\@ ############################# -_multiple_of_16_bytes\@: +.L_multiple_of_16_bytes\@: .endm @@ -519,12 +495,12 @@ _multiple_of_16_bytes\@: mov PBlockLen(arg2), %r12 test %r12, %r12 - je _partial_done\@ 
+ je .L_partial_done\@ #GHASH computation for the last <16 Byte block \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 -_partial_done\@: +.L_partial_done\@: mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes) shl $3, %r12 # convert into number of bits vmovd %r12d, %xmm15 # len(A) in xmm15 @@ -547,49 +523,49 @@ _partial_done\@: -_return_T\@: +.L_return_T\@: mov \AUTH_TAG, %r10 # r10 = authTag mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len cmp $16, %r11 - je _T_16\@ + je .L_T_16\@ cmp $8, %r11 - jl _T_4\@ + jl .L_T_4\@ -_T_8\@: +.L_T_8\@: vmovq %xmm9, %rax mov %rax, (%r10) add $8, %r10 sub $8, %r11 vpsrldq $8, %xmm9, %xmm9 test %r11, %r11 - je _return_T_done\@ -_T_4\@: + je .L_return_T_done\@ +.L_T_4\@: vmovd %xmm9, %eax mov %eax, (%r10) add $4, %r10 sub $4, %r11 vpsrldq $4, %xmm9, %xmm9 test %r11, %r11 - je _return_T_done\@ -_T_123\@: + je .L_return_T_done\@ +.L_T_123\@: vmovd %xmm9, %eax cmp $2, %r11 - jl _T_1\@ + jl .L_T_1\@ mov %ax, (%r10) cmp $2, %r11 - je _return_T_done\@ + je .L_return_T_done\@ add $2, %r10 sar $16, %eax -_T_1\@: +.L_T_1\@: mov %al, (%r10) - jmp _return_T_done\@ + jmp .L_return_T_done\@ -_T_16\@: +.L_T_16\@: vmovdqu %xmm9, (%r10) -_return_T_done\@: +.L_return_T_done\@: .endm .macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8 @@ -603,8 +579,8 @@ _return_T_done\@: vpxor \T8, \T8, \T8 vpxor \T7, \T7, \T7 cmp $16, %r11 - jl _get_AAD_rest8\@ -_get_AAD_blocks\@: + jl .L_get_AAD_rest8\@ +.L_get_AAD_blocks\@: vmovdqu (%r10), \T7 vpshufb SHUF_MASK(%rip), \T7, \T7 vpxor \T7, \T8, \T8 @@ -613,29 +589,29 @@ _get_AAD_blocks\@: sub $16, %r12 sub $16, %r11 cmp $16, %r11 - jge _get_AAD_blocks\@ + jge .L_get_AAD_blocks\@ vmovdqu \T8, \T7 test %r11, %r11 - je _get_AAD_done\@ + je .L_get_AAD_done\@ vpxor \T7, \T7, \T7 /* read the last <16B of AAD. since we have at least 4B of data right after the AAD (the ICV, and maybe some CT), we can read 4B/8B blocks safely, and then get rid of the extra stuff */ -_get_AAD_rest8\@: +.L_get_AAD_rest8\@: cmp $4, %r11 - jle _get_AAD_rest4\@ + jle .L_get_AAD_rest4\@ movq (%r10), \T1 add $8, %r10 sub $8, %r11 vpslldq $8, \T1, \T1 vpsrldq $8, \T7, \T7 vpxor \T1, \T7, \T7 - jmp _get_AAD_rest8\@ -_get_AAD_rest4\@: + jmp .L_get_AAD_rest8\@ +.L_get_AAD_rest4\@: test %r11, %r11 - jle _get_AAD_rest0\@ + jle .L_get_AAD_rest0\@ mov (%r10), %eax movq %rax, \T1 add $4, %r10 @@ -643,20 +619,22 @@ _get_AAD_rest4\@: vpslldq $12, \T1, \T1 vpsrldq $4, \T7, \T7 vpxor \T1, \T7, \T7 -_get_AAD_rest0\@: +.L_get_AAD_rest0\@: /* finalize: shift out the extra bytes we read, and align left. 
since pslldq can only shift by an immediate, we use - vpshufb and an array of shuffle masks */ - movq %r12, %r11 - salq $4, %r11 - vmovdqu aad_shift_arr(%r11), \T1 - vpshufb \T1, \T7, \T7 -_get_AAD_rest_final\@: + vpshufb and a pair of shuffle masks */ + leaq ALL_F(%rip), %r11 + subq %r12, %r11 + vmovdqu 16(%r11), \T1 + andq $~3, %r11 + vpshufb (%r11), \T7, \T7 + vpand \T1, \T7, \T7 +.L_get_AAD_rest_final\@: vpshufb SHUF_MASK(%rip), \T7, \T7 vpxor \T8, \T7, \T7 \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6 -_get_AAD_done\@: +.L_get_AAD_done\@: vmovdqu \T7, AadHash(arg2) .endm @@ -707,28 +685,28 @@ _get_AAD_done\@: vpxor \XMMDst, \XMMDst, \XMMDst cmp $8, \DLEN - jl _read_lt8_\@ + jl .L_read_lt8_\@ mov (\DPTR), %rax vpinsrq $0, %rax, \XMMDst, \XMMDst sub $8, \DLEN - jz _done_read_partial_block_\@ + jz .L_done_read_partial_block_\@ xor %eax, %eax -_read_next_byte_\@: +.L_read_next_byte_\@: shl $8, %rax mov 7(\DPTR, \DLEN, 1), %al dec \DLEN - jnz _read_next_byte_\@ + jnz .L_read_next_byte_\@ vpinsrq $1, %rax, \XMMDst, \XMMDst - jmp _done_read_partial_block_\@ -_read_lt8_\@: + jmp .L_done_read_partial_block_\@ +.L_read_lt8_\@: xor %eax, %eax -_read_next_byte_lt8_\@: +.L_read_next_byte_lt8_\@: shl $8, %rax mov -1(\DPTR, \DLEN, 1), %al dec \DLEN - jnz _read_next_byte_lt8_\@ + jnz .L_read_next_byte_lt8_\@ vpinsrq $0, %rax, \XMMDst, \XMMDst -_done_read_partial_block_\@: +.L_done_read_partial_block_\@: .endm # PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks @@ -740,21 +718,21 @@ _done_read_partial_block_\@: AAD_HASH ENC_DEC mov PBlockLen(arg2), %r13 test %r13, %r13 - je _partial_block_done_\@ # Leave Macro if no partial blocks + je .L_partial_block_done_\@ # Leave Macro if no partial blocks # Read in input data without over reading cmp $16, \PLAIN_CYPH_LEN - jl _fewer_than_16_bytes_\@ + jl .L_fewer_than_16_bytes_\@ vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm - jmp _data_read_\@ + jmp .L_data_read_\@ -_fewer_than_16_bytes_\@: +.L_fewer_than_16_bytes_\@: lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10 mov \PLAIN_CYPH_LEN, %r12 READ_PARTIAL_BLOCK %r10 %r12 %xmm1 mov PBlockLen(arg2), %r13 -_data_read_\@: # Finished reading in data +.L_data_read_\@: # Finished reading in data vmovdqu PBlockEncKey(arg2), %xmm9 vmovdqu HashKey(arg2), %xmm13 @@ -777,9 +755,9 @@ _data_read_\@: # Finished reading in data sub $16, %r10 # Determine if if partial block is not being filled and # shift mask accordingly - jge _no_extra_mask_1_\@ + jge .L_no_extra_mask_1_\@ sub %r10, %r12 -_no_extra_mask_1_\@: +.L_no_extra_mask_1_\@: vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out bottom r13 bytes of xmm9 @@ -792,17 +770,17 @@ _no_extra_mask_1_\@: vpxor %xmm3, \AAD_HASH, \AAD_HASH test %r10, %r10 - jl _partial_incomplete_1_\@ + jl .L_partial_incomplete_1_\@ # GHASH computation for the last <16 Byte block \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 xor %eax,%eax mov %rax, PBlockLen(arg2) - jmp _dec_done_\@ -_partial_incomplete_1_\@: + jmp .L_dec_done_\@ +.L_partial_incomplete_1_\@: add \PLAIN_CYPH_LEN, PBlockLen(arg2) -_dec_done_\@: +.L_dec_done_\@: vmovdqu \AAD_HASH, AadHash(arg2) .else vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) @@ -813,9 +791,9 @@ _dec_done_\@: sub $16, %r10 # Determine if if partial block is not being filled and # shift mask accordingly - jge _no_extra_mask_2_\@ + jge .L_no_extra_mask_2_\@ sub %r10, %r12 -_no_extra_mask_2_\@: +.L_no_extra_mask_2_\@: vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate 
mask to mask out bottom r13 bytes of xmm9 @@ -827,17 +805,17 @@ _no_extra_mask_2_\@: vpxor %xmm9, \AAD_HASH, \AAD_HASH test %r10, %r10 - jl _partial_incomplete_2_\@ + jl .L_partial_incomplete_2_\@ # GHASH computation for the last <16 Byte block \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 xor %eax,%eax mov %rax, PBlockLen(arg2) - jmp _encode_done_\@ -_partial_incomplete_2_\@: + jmp .L_encode_done_\@ +.L_partial_incomplete_2_\@: add \PLAIN_CYPH_LEN, PBlockLen(arg2) -_encode_done_\@: +.L_encode_done_\@: vmovdqu \AAD_HASH, AadHash(arg2) vmovdqa SHUF_MASK(%rip), %xmm10 @@ -847,32 +825,32 @@ _encode_done_\@: .endif # output encrypted Bytes test %r10, %r10 - jl _partial_fill_\@ + jl .L_partial_fill_\@ mov %r13, %r12 mov $16, %r13 # Set r13 to be the number of bytes to write out sub %r12, %r13 - jmp _count_set_\@ -_partial_fill_\@: + jmp .L_count_set_\@ +.L_partial_fill_\@: mov \PLAIN_CYPH_LEN, %r13 -_count_set_\@: +.L_count_set_\@: vmovdqa %xmm9, %xmm0 vmovq %xmm0, %rax cmp $8, %r13 - jle _less_than_8_bytes_left_\@ + jle .L_less_than_8_bytes_left_\@ mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) add $8, \DATA_OFFSET psrldq $8, %xmm0 vmovq %xmm0, %rax sub $8, %r13 -_less_than_8_bytes_left_\@: +.L_less_than_8_bytes_left_\@: movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) add $1, \DATA_OFFSET shr $8, %rax sub $1, %r13 - jne _less_than_8_bytes_left_\@ -_partial_block_done_\@: + jne .L_less_than_8_bytes_left_\@ +.L_partial_block_done_\@: .endm # PARTIAL_BLOCK ############################################################################### @@ -1073,7 +1051,7 @@ _partial_block_done_\@: vmovdqa \XMM8, \T3 cmp $128, %r13 - jl _initial_blocks_done\@ # no need for precomputed constants + jl .L_initial_blocks_done\@ # no need for precomputed constants ############################################################################### # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i @@ -1215,7 +1193,7 @@ _partial_block_done_\@: ############################################################################### -_initial_blocks_done\@: +.L_initial_blocks_done\@: .endm @@ -2023,7 +2001,7 @@ SYM_FUNC_END(aesni_gcm_finalize_avx_gen2) vmovdqa \XMM8, \T3 cmp $128, %r13 - jl _initial_blocks_done\@ # no need for precomputed constants + jl .L_initial_blocks_done\@ # no need for precomputed constants ############################################################################### # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i @@ -2167,7 +2145,7 @@ SYM_FUNC_END(aesni_gcm_finalize_avx_gen2) ############################################################################### -_initial_blocks_done\@: +.L_initial_blocks_done\@: .endm diff --git a/arch/x86/crypto/aria-aesni-avx-asm_64.S b/arch/x86/crypto/aria-aesni-avx-asm_64.S index 9243f6289d34..7c1abc513f34 100644 --- a/arch/x86/crypto/aria-aesni-avx-asm_64.S +++ b/arch/x86/crypto/aria-aesni-avx-asm_64.S @@ -80,7 +80,7 @@ transpose_4x4(c0, c1, c2, c3, a0, a1); \ transpose_4x4(d0, d1, d2, d3, a0, a1); \ \ - vmovdqu .Lshufb_16x16b, a0; \ + vmovdqu .Lshufb_16x16b(%rip), a0; \ vmovdqu st1, a1; \ vpshufb a0, a2, a2; \ vpshufb a0, a3, a3; \ @@ -132,7 +132,7 @@ transpose_4x4(c0, c1, c2, c3, a0, a1); \ transpose_4x4(d0, d1, d2, d3, a0, a1); \ \ - vmovdqu .Lshufb_16x16b, a0; \ + vmovdqu .Lshufb_16x16b(%rip), a0; \ vmovdqu st1, a1; \ vpshufb a0, a2, a2; \ vpshufb a0, a3, a3; \ @@ -300,11 +300,11 @@ x4, x5, x6, x7, \ t0, t1, t2, t3, \ t4, t5, t6, t7) \ - vmovdqa .Ltf_s2_bitmatrix, t0; \ - vmovdqa .Ltf_inv_bitmatrix, t1; \ - 
vmovdqa .Ltf_id_bitmatrix, t2; \ - vmovdqa .Ltf_aff_bitmatrix, t3; \ - vmovdqa .Ltf_x2_bitmatrix, t4; \ + vmovdqa .Ltf_s2_bitmatrix(%rip), t0; \ + vmovdqa .Ltf_inv_bitmatrix(%rip), t1; \ + vmovdqa .Ltf_id_bitmatrix(%rip), t2; \ + vmovdqa .Ltf_aff_bitmatrix(%rip), t3; \ + vmovdqa .Ltf_x2_bitmatrix(%rip), t4; \ vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \ vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \ vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \ @@ -324,13 +324,13 @@ x4, x5, x6, x7, \ t0, t1, t2, t3, \ t4, t5, t6, t7) \ - vmovdqa .Linv_shift_row, t0; \ - vmovdqa .Lshift_row, t1; \ - vbroadcastss .L0f0f0f0f, t6; \ - vmovdqa .Ltf_lo__inv_aff__and__s2, t2; \ - vmovdqa .Ltf_hi__inv_aff__and__s2, t3; \ - vmovdqa .Ltf_lo__x2__and__fwd_aff, t4; \ - vmovdqa .Ltf_hi__x2__and__fwd_aff, t5; \ + vmovdqa .Linv_shift_row(%rip), t0; \ + vmovdqa .Lshift_row(%rip), t1; \ + vbroadcastss .L0f0f0f0f(%rip), t6; \ + vmovdqa .Ltf_lo__inv_aff__and__s2(%rip), t2; \ + vmovdqa .Ltf_hi__inv_aff__and__s2(%rip), t3; \ + vmovdqa .Ltf_lo__x2__and__fwd_aff(%rip), t4; \ + vmovdqa .Ltf_hi__x2__and__fwd_aff(%rip), t5; \ \ vaesenclast t7, x0, x0; \ vaesenclast t7, x4, x4; \ diff --git a/arch/x86/crypto/aria-aesni-avx2-asm_64.S b/arch/x86/crypto/aria-aesni-avx2-asm_64.S index 82a14b4ad920..c60fa2980630 100644 --- a/arch/x86/crypto/aria-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/aria-aesni-avx2-asm_64.S @@ -96,7 +96,7 @@ transpose_4x4(c0, c1, c2, c3, a0, a1); \ transpose_4x4(d0, d1, d2, d3, a0, a1); \ \ - vbroadcasti128 .Lshufb_16x16b, a0; \ + vbroadcasti128 .Lshufb_16x16b(%rip), a0; \ vmovdqu st1, a1; \ vpshufb a0, a2, a2; \ vpshufb a0, a3, a3; \ @@ -148,7 +148,7 @@ transpose_4x4(c0, c1, c2, c3, a0, a1); \ transpose_4x4(d0, d1, d2, d3, a0, a1); \ \ - vbroadcasti128 .Lshufb_16x16b, a0; \ + vbroadcasti128 .Lshufb_16x16b(%rip), a0; \ vmovdqu st1, a1; \ vpshufb a0, a2, a2; \ vpshufb a0, a3, a3; \ @@ -307,11 +307,11 @@ x4, x5, x6, x7, \ t0, t1, t2, t3, \ t4, t5, t6, t7) \ - vpbroadcastq .Ltf_s2_bitmatrix, t0; \ - vpbroadcastq .Ltf_inv_bitmatrix, t1; \ - vpbroadcastq .Ltf_id_bitmatrix, t2; \ - vpbroadcastq .Ltf_aff_bitmatrix, t3; \ - vpbroadcastq .Ltf_x2_bitmatrix, t4; \ + vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0; \ + vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1; \ + vpbroadcastq .Ltf_id_bitmatrix(%rip), t2; \ + vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3; \ + vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4; \ vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \ vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \ vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \ @@ -332,12 +332,12 @@ t4, t5, t6, t7) \ vpxor t7, t7, t7; \ vpxor t6, t6, t6; \ - vbroadcasti128 .Linv_shift_row, t0; \ - vbroadcasti128 .Lshift_row, t1; \ - vbroadcasti128 .Ltf_lo__inv_aff__and__s2, t2; \ - vbroadcasti128 .Ltf_hi__inv_aff__and__s2, t3; \ - vbroadcasti128 .Ltf_lo__x2__and__fwd_aff, t4; \ - vbroadcasti128 .Ltf_hi__x2__and__fwd_aff, t5; \ + vbroadcasti128 .Linv_shift_row(%rip), t0; \ + vbroadcasti128 .Lshift_row(%rip), t1; \ + vbroadcasti128 .Ltf_lo__inv_aff__and__s2(%rip), t2; \ + vbroadcasti128 .Ltf_hi__inv_aff__and__s2(%rip), t3; \ + vbroadcasti128 .Ltf_lo__x2__and__fwd_aff(%rip), t4; \ + vbroadcasti128 .Ltf_hi__x2__and__fwd_aff(%rip), t5; \ \ vextracti128 $1, x0, t6##_x; \ vaesenclast t7##_x, x0##_x, x0##_x; \ @@ -369,7 +369,7 @@ vaesdeclast t7##_x, t6##_x, t6##_x; \ vinserti128 $1, t6##_x, x6, x6; \ \ - vpbroadcastd .L0f0f0f0f, t6; \ + vpbroadcastd .L0f0f0f0f(%rip), t6; \ \ /* AES inverse shift rows */ \ vpshufb t0, x0, x0; \ diff --git 
a/arch/x86/crypto/aria-gfni-avx512-asm_64.S b/arch/x86/crypto/aria-gfni-avx512-asm_64.S index 3193f0701450..860887e5d02e 100644 --- a/arch/x86/crypto/aria-gfni-avx512-asm_64.S +++ b/arch/x86/crypto/aria-gfni-avx512-asm_64.S @@ -80,7 +80,7 @@ transpose_4x4(c0, c1, c2, c3, a0, a1); \ transpose_4x4(d0, d1, d2, d3, a0, a1); \ \ - vbroadcasti64x2 .Lshufb_16x16b, a0; \ + vbroadcasti64x2 .Lshufb_16x16b(%rip), a0; \ vmovdqu64 st1, a1; \ vpshufb a0, a2, a2; \ vpshufb a0, a3, a3; \ @@ -132,7 +132,7 @@ transpose_4x4(c0, c1, c2, c3, a0, a1); \ transpose_4x4(d0, d1, d2, d3, a0, a1); \ \ - vbroadcasti64x2 .Lshufb_16x16b, a0; \ + vbroadcasti64x2 .Lshufb_16x16b(%rip), a0; \ vmovdqu64 st1, a1; \ vpshufb a0, a2, a2; \ vpshufb a0, a3, a3; \ @@ -308,11 +308,11 @@ x4, x5, x6, x7, \ t0, t1, t2, t3, \ t4, t5, t6, t7) \ - vpbroadcastq .Ltf_s2_bitmatrix, t0; \ - vpbroadcastq .Ltf_inv_bitmatrix, t1; \ - vpbroadcastq .Ltf_id_bitmatrix, t2; \ - vpbroadcastq .Ltf_aff_bitmatrix, t3; \ - vpbroadcastq .Ltf_x2_bitmatrix, t4; \ + vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0; \ + vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1; \ + vpbroadcastq .Ltf_id_bitmatrix(%rip), t2; \ + vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3; \ + vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4; \ vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \ vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \ vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \ @@ -332,11 +332,11 @@ y4, y5, y6, y7, \ t0, t1, t2, t3, \ t4, t5, t6, t7) \ - vpbroadcastq .Ltf_s2_bitmatrix, t0; \ - vpbroadcastq .Ltf_inv_bitmatrix, t1; \ - vpbroadcastq .Ltf_id_bitmatrix, t2; \ - vpbroadcastq .Ltf_aff_bitmatrix, t3; \ - vpbroadcastq .Ltf_x2_bitmatrix, t4; \ + vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0; \ + vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1; \ + vpbroadcastq .Ltf_id_bitmatrix(%rip), t2; \ + vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3; \ + vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4; \ vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \ vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \ vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \ diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index 4a30618281ec..646477a13e11 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -52,10 +52,10 @@ /* \ * S-function with AES subbytes \ */ \ - vmovdqa .Linv_shift_row, t4; \ - vbroadcastss .L0f0f0f0f, t7; \ - vmovdqa .Lpre_tf_lo_s1, t0; \ - vmovdqa .Lpre_tf_hi_s1, t1; \ + vmovdqa .Linv_shift_row(%rip), t4; \ + vbroadcastss .L0f0f0f0f(%rip), t7; \ + vmovdqa .Lpre_tf_lo_s1(%rip), t0; \ + vmovdqa .Lpre_tf_hi_s1(%rip), t1; \ \ /* AES inverse shift rows */ \ vpshufb t4, x0, x0; \ @@ -68,8 +68,8 @@ vpshufb t4, x6, x6; \ \ /* prefilter sboxes 1, 2 and 3 */ \ - vmovdqa .Lpre_tf_lo_s4, t2; \ - vmovdqa .Lpre_tf_hi_s4, t3; \ + vmovdqa .Lpre_tf_lo_s4(%rip), t2; \ + vmovdqa .Lpre_tf_hi_s4(%rip), t3; \ filter_8bit(x0, t0, t1, t7, t6); \ filter_8bit(x7, t0, t1, t7, t6); \ filter_8bit(x1, t0, t1, t7, t6); \ @@ -83,8 +83,8 @@ filter_8bit(x6, t2, t3, t7, t6); \ \ /* AES subbytes + AES shift rows */ \ - vmovdqa .Lpost_tf_lo_s1, t0; \ - vmovdqa .Lpost_tf_hi_s1, t1; \ + vmovdqa .Lpost_tf_lo_s1(%rip), t0; \ + vmovdqa .Lpost_tf_hi_s1(%rip), t1; \ vaesenclast t4, x0, x0; \ vaesenclast t4, x7, x7; \ vaesenclast t4, x1, x1; \ @@ -95,16 +95,16 @@ vaesenclast t4, x6, x6; \ \ /* postfilter sboxes 1 and 4 */ \ - vmovdqa .Lpost_tf_lo_s3, t2; \ - vmovdqa .Lpost_tf_hi_s3, t3; \ + vmovdqa .Lpost_tf_lo_s3(%rip), t2; \ + vmovdqa .Lpost_tf_hi_s3(%rip), t3; \ 
filter_8bit(x0, t0, t1, t7, t6); \ filter_8bit(x7, t0, t1, t7, t6); \ filter_8bit(x3, t0, t1, t7, t6); \ filter_8bit(x6, t0, t1, t7, t6); \ \ /* postfilter sbox 3 */ \ - vmovdqa .Lpost_tf_lo_s2, t4; \ - vmovdqa .Lpost_tf_hi_s2, t5; \ + vmovdqa .Lpost_tf_lo_s2(%rip), t4; \ + vmovdqa .Lpost_tf_hi_s2(%rip), t5; \ filter_8bit(x2, t2, t3, t7, t6); \ filter_8bit(x5, t2, t3, t7, t6); \ \ @@ -443,7 +443,7 @@ SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) transpose_4x4(c0, c1, c2, c3, a0, a1); \ transpose_4x4(d0, d1, d2, d3, a0, a1); \ \ - vmovdqu .Lshufb_16x16b, a0; \ + vmovdqu .Lshufb_16x16b(%rip), a0; \ vmovdqu st1, a1; \ vpshufb a0, a2, a2; \ vpshufb a0, a3, a3; \ @@ -482,7 +482,7 @@ SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) #define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, rio, key) \ vmovq key, x0; \ - vpshufb .Lpack_bswap, x0, x0; \ + vpshufb .Lpack_bswap(%rip), x0, x0; \ \ vpxor 0 * 16(rio), x0, y7; \ vpxor 1 * 16(rio), x0, y6; \ @@ -533,7 +533,7 @@ SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) vmovdqu x0, stack_tmp0; \ \ vmovq key, x0; \ - vpshufb .Lpack_bswap, x0, x0; \ + vpshufb .Lpack_bswap(%rip), x0, x0; \ \ vpxor x0, y7, y7; \ vpxor x0, y6, y6; \ diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index deaf62aa73a6..a0eb94e53b1b 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -64,12 +64,12 @@ /* \ * S-function with AES subbytes \ */ \ - vbroadcasti128 .Linv_shift_row, t4; \ - vpbroadcastd .L0f0f0f0f, t7; \ - vbroadcasti128 .Lpre_tf_lo_s1, t5; \ - vbroadcasti128 .Lpre_tf_hi_s1, t6; \ - vbroadcasti128 .Lpre_tf_lo_s4, t2; \ - vbroadcasti128 .Lpre_tf_hi_s4, t3; \ + vbroadcasti128 .Linv_shift_row(%rip), t4; \ + vpbroadcastd .L0f0f0f0f(%rip), t7; \ + vbroadcasti128 .Lpre_tf_lo_s1(%rip), t5; \ + vbroadcasti128 .Lpre_tf_hi_s1(%rip), t6; \ + vbroadcasti128 .Lpre_tf_lo_s4(%rip), t2; \ + vbroadcasti128 .Lpre_tf_hi_s4(%rip), t3; \ \ /* AES inverse shift rows */ \ vpshufb t4, x0, x0; \ @@ -115,8 +115,8 @@ vinserti128 $1, t2##_x, x6, x6; \ vextracti128 $1, x1, t3##_x; \ vextracti128 $1, x4, t2##_x; \ - vbroadcasti128 .Lpost_tf_lo_s1, t0; \ - vbroadcasti128 .Lpost_tf_hi_s1, t1; \ + vbroadcasti128 .Lpost_tf_lo_s1(%rip), t0; \ + vbroadcasti128 .Lpost_tf_hi_s1(%rip), t1; \ vaesenclast t4##_x, x2##_x, x2##_x; \ vaesenclast t4##_x, t6##_x, t6##_x; \ vinserti128 $1, t6##_x, x2, x2; \ @@ -131,16 +131,16 @@ vinserti128 $1, t2##_x, x4, x4; \ \ /* postfilter sboxes 1 and 4 */ \ - vbroadcasti128 .Lpost_tf_lo_s3, t2; \ - vbroadcasti128 .Lpost_tf_hi_s3, t3; \ + vbroadcasti128 .Lpost_tf_lo_s3(%rip), t2; \ + vbroadcasti128 .Lpost_tf_hi_s3(%rip), t3; \ filter_8bit(x0, t0, t1, t7, t6); \ filter_8bit(x7, t0, t1, t7, t6); \ filter_8bit(x3, t0, t1, t7, t6); \ filter_8bit(x6, t0, t1, t7, t6); \ \ /* postfilter sbox 3 */ \ - vbroadcasti128 .Lpost_tf_lo_s2, t4; \ - vbroadcasti128 .Lpost_tf_hi_s2, t5; \ + vbroadcasti128 .Lpost_tf_lo_s2(%rip), t4; \ + vbroadcasti128 .Lpost_tf_hi_s2(%rip), t5; \ filter_8bit(x2, t2, t3, t7, t6); \ filter_8bit(x5, t2, t3, t7, t6); \ \ @@ -475,7 +475,7 @@ SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) transpose_4x4(c0, c1, c2, c3, a0, a1); \ transpose_4x4(d0, d1, d2, d3, a0, a1); \ \ - vbroadcasti128 .Lshufb_16x16b, a0; \ + vbroadcasti128 .Lshufb_16x16b(%rip), a0; \ vmovdqu st1, a1; \ vpshufb a0, a2, a2; \ vpshufb a0, a3, a3; 
\ @@ -514,7 +514,7 @@ SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) #define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, rio, key) \ vpbroadcastq key, x0; \ - vpshufb .Lpack_bswap, x0, x0; \ + vpshufb .Lpack_bswap(%rip), x0, x0; \ \ vpxor 0 * 32(rio), x0, y7; \ vpxor 1 * 32(rio), x0, y6; \ @@ -565,7 +565,7 @@ SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) vmovdqu x0, stack_tmp0; \ \ vpbroadcastq key, x0; \ - vpshufb .Lpack_bswap, x0, x0; \ + vpshufb .Lpack_bswap(%rip), x0, x0; \ \ vpxor x0, y7, y7; \ vpxor x0, y6, y6; \ diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S index 347c059f5940..816b6bb8bded 100644 --- a/arch/x86/crypto/camellia-x86_64-asm_64.S +++ b/arch/x86/crypto/camellia-x86_64-asm_64.S @@ -77,11 +77,13 @@ #define RXORbl %r9b #define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \ + leaq T0(%rip), tmp1; \ movzbl ab ## bl, tmp2 ## d; \ + xorq (tmp1, tmp2, 8), dst; \ + leaq T1(%rip), tmp2; \ movzbl ab ## bh, tmp1 ## d; \ rorq $16, ab; \ - xorq T0(, tmp2, 8), dst; \ - xorq T1(, tmp1, 8), dst; + xorq (tmp2, tmp1, 8), dst; /********************************************************************** 1-way camellia diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S index 0326a01503c3..b4e460a87f18 100644 --- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S @@ -84,15 +84,19 @@ #define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \ movzbl src ## bh, RID1d; \ + leaq s1(%rip), RID2; \ + movl (RID2,RID1,4), dst ## d; \ movzbl src ## bl, RID2d; \ + leaq s2(%rip), RID1; \ + op1 (RID1,RID2,4), dst ## d; \ shrq $16, src; \ - movl s1(, RID1, 4), dst ## d; \ - op1 s2(, RID2, 4), dst ## d; \ movzbl src ## bh, RID1d; \ + leaq s3(%rip), RID2; \ + op2 (RID2,RID1,4), dst ## d; \ movzbl src ## bl, RID2d; \ interleave_op(il_reg); \ - op2 s3(, RID1, 4), dst ## d; \ - op3 s4(, RID2, 4), dst ## d; + leaq s4(%rip), RID1; \ + op3 (RID1,RID2,4), dst ## d; #define dummy(d) /* do nothing */ @@ -151,15 +155,15 @@ subround(l ## 3, r ## 3, l ## 4, r ## 4, f); #define enc_preload_rkr() \ - vbroadcastss .L16_mask, RKR; \ + vbroadcastss .L16_mask(%rip), RKR; \ /* add 16-bit rotation to key rotations (mod 32) */ \ vpxor kr(CTX), RKR, RKR; #define dec_preload_rkr() \ - vbroadcastss .L16_mask, RKR; \ + vbroadcastss .L16_mask(%rip), RKR; \ /* add 16-bit rotation to key rotations (mod 32) */ \ vpxor kr(CTX), RKR, RKR; \ - vpshufb .Lbswap128_mask, RKR, RKR; + vpshufb .Lbswap128_mask(%rip), RKR, RKR; #define transpose_2x4(x0, x1, t0, t1) \ vpunpckldq x1, x0, t0; \ @@ -235,9 +239,9 @@ SYM_FUNC_START_LOCAL(__cast5_enc_blk16) movq %rdi, CTX; - vmovdqa .Lbswap_mask, RKM; - vmovd .Lfirst_mask, R1ST; - vmovd .L32_mask, R32; + vmovdqa .Lbswap_mask(%rip), RKM; + vmovd .Lfirst_mask(%rip), R1ST; + vmovd .L32_mask(%rip), R32; enc_preload_rkr(); inpack_blocks(RL1, RR1, RTMP, RX, RKM); @@ -271,7 +275,7 @@ SYM_FUNC_START_LOCAL(__cast5_enc_blk16) popq %rbx; popq %r15; - vmovdqa .Lbswap_mask, RKM; + vmovdqa .Lbswap_mask(%rip), RKM; outunpack_blocks(RR1, RL1, RTMP, RX, RKM); outunpack_blocks(RR2, RL2, RTMP, RX, RKM); @@ -308,9 +312,9 @@ SYM_FUNC_START_LOCAL(__cast5_dec_blk16) movq %rdi, CTX; - vmovdqa .Lbswap_mask, RKM; - vmovd .Lfirst_mask, R1ST; - vmovd .L32_mask, R32; + vmovdqa .Lbswap_mask(%rip), RKM; + vmovd .Lfirst_mask(%rip), R1ST; + vmovd .L32_mask(%rip), R32; dec_preload_rkr(); inpack_blocks(RL1, 
RR1, RTMP, RX, RKM); @@ -341,7 +345,7 @@ SYM_FUNC_START_LOCAL(__cast5_dec_blk16) round(RL, RR, 1, 2); round(RR, RL, 0, 1); - vmovdqa .Lbswap_mask, RKM; + vmovdqa .Lbswap_mask(%rip), RKM; popq %rbx; popq %r15; @@ -504,8 +508,8 @@ SYM_FUNC_START(cast5_ctr_16way) vpcmpeqd RKR, RKR, RKR; vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */ - vmovdqa .Lbswap_iv_mask, R1ST; - vmovdqa .Lbswap128_mask, RKM; + vmovdqa .Lbswap_iv_mask(%rip), R1ST; + vmovdqa .Lbswap128_mask(%rip), RKM; /* load IV and byteswap */ vmovq (%rcx), RX; diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S index 82b716fd5dba..9e86d460b409 100644 --- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S @@ -84,15 +84,19 @@ #define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \ movzbl src ## bh, RID1d; \ + leaq s1(%rip), RID2; \ + movl (RID2,RID1,4), dst ## d; \ movzbl src ## bl, RID2d; \ + leaq s2(%rip), RID1; \ + op1 (RID1,RID2,4), dst ## d; \ shrq $16, src; \ - movl s1(, RID1, 4), dst ## d; \ - op1 s2(, RID2, 4), dst ## d; \ movzbl src ## bh, RID1d; \ + leaq s3(%rip), RID2; \ + op2 (RID2,RID1,4), dst ## d; \ movzbl src ## bl, RID2d; \ interleave_op(il_reg); \ - op2 s3(, RID1, 4), dst ## d; \ - op3 s4(, RID2, 4), dst ## d; + leaq s4(%rip), RID1; \ + op3 (RID1,RID2,4), dst ## d; #define dummy(d) /* do nothing */ @@ -175,10 +179,10 @@ qop(RD, RC, 1); #define shuffle(mask) \ - vpshufb mask, RKR, RKR; + vpshufb mask(%rip), RKR, RKR; #define preload_rkr(n, do_mask, mask) \ - vbroadcastss .L16_mask, RKR; \ + vbroadcastss .L16_mask(%rip), RKR; \ /* add 16-bit rotation to key rotations (mod 32) */ \ vpxor (kr+n*16)(CTX), RKR, RKR; \ do_mask(mask); @@ -258,9 +262,9 @@ SYM_FUNC_START_LOCAL(__cast6_enc_blk8) movq %rdi, CTX; - vmovdqa .Lbswap_mask, RKM; - vmovd .Lfirst_mask, R1ST; - vmovd .L32_mask, R32; + vmovdqa .Lbswap_mask(%rip), RKM; + vmovd .Lfirst_mask(%rip), R1ST; + vmovd .L32_mask(%rip), R32; inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); @@ -284,7 +288,7 @@ SYM_FUNC_START_LOCAL(__cast6_enc_blk8) popq %rbx; popq %r15; - vmovdqa .Lbswap_mask, RKM; + vmovdqa .Lbswap_mask(%rip), RKM; outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); @@ -306,9 +310,9 @@ SYM_FUNC_START_LOCAL(__cast6_dec_blk8) movq %rdi, CTX; - vmovdqa .Lbswap_mask, RKM; - vmovd .Lfirst_mask, R1ST; - vmovd .L32_mask, R32; + vmovdqa .Lbswap_mask(%rip), RKM; + vmovd .Lfirst_mask(%rip), R1ST; + vmovd .L32_mask(%rip), R32; inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); @@ -332,7 +336,7 @@ SYM_FUNC_START_LOCAL(__cast6_dec_blk8) popq %rbx; popq %r15; - vmovdqa .Lbswap_mask, RKM; + vmovdqa .Lbswap_mask(%rip), RKM; outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); diff --git a/arch/x86/crypto/crc32-pclmul_asm.S b/arch/x86/crypto/crc32-pclmul_asm.S index ca53e96996ac..5d31137e2c7d 100644 --- a/arch/x86/crypto/crc32-pclmul_asm.S +++ b/arch/x86/crypto/crc32-pclmul_asm.S @@ -90,7 +90,7 @@ SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligne sub $0x40, LEN add $0x40, BUF cmp $0x40, LEN - jb less_64 + jb .Lless_64 #ifdef __x86_64__ movdqa .Lconstant_R2R1(%rip), CONSTANT @@ -98,7 +98,7 @@ SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligne movdqa .Lconstant_R2R1, CONSTANT 
#endif -loop_64:/* 64 bytes Full cache line folding */ +.Lloop_64:/* 64 bytes Full cache line folding */ prefetchnta 0x40(BUF) movdqa %xmm1, %xmm5 movdqa %xmm2, %xmm6 @@ -139,8 +139,8 @@ loop_64:/* 64 bytes Full cache line folding */ sub $0x40, LEN add $0x40, BUF cmp $0x40, LEN - jge loop_64 -less_64:/* Folding cache line into 128bit */ + jge .Lloop_64 +.Lless_64:/* Folding cache line into 128bit */ #ifdef __x86_64__ movdqa .Lconstant_R4R3(%rip), CONSTANT #else @@ -167,8 +167,8 @@ less_64:/* Folding cache line into 128bit */ pxor %xmm4, %xmm1 cmp $0x10, LEN - jb fold_64 -loop_16:/* Folding rest buffer into 128bit */ + jb .Lfold_64 +.Lloop_16:/* Folding rest buffer into 128bit */ movdqa %xmm1, %xmm5 pclmulqdq $0x00, CONSTANT, %xmm1 pclmulqdq $0x11, CONSTANT, %xmm5 @@ -177,9 +177,9 @@ loop_16:/* Folding rest buffer into 128bit */ sub $0x10, LEN add $0x10, BUF cmp $0x10, LEN - jge loop_16 + jge .Lloop_16 -fold_64: +.Lfold_64: /* perform the last 64 bit fold, also adds 32 zeroes * to the input stream */ pclmulqdq $0x01, %xmm1, CONSTANT /* R4 * xmm1.low */ diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index ec35915f0901..81ce0f4db555 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -49,15 +49,15 @@ ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction .macro LABEL prefix n -\prefix\n\(): +.L\prefix\n\(): .endm .macro JMPTBL_ENTRY i -.quad crc_\i +.quad .Lcrc_\i .endm .macro JNC_LESS_THAN j - jnc less_than_\j + jnc .Lless_than_\j .endm # Define threshold where buffers are considered "small" and routed to more @@ -108,30 +108,30 @@ SYM_FUNC_START(crc_pcl) neg %bufp and $7, %bufp # calculate the unalignment amount of # the address - je proc_block # Skip if aligned + je .Lproc_block # Skip if aligned ## If len is less than 8 and we're unaligned, we need to jump ## to special code to avoid reading beyond the end of the buffer cmp $8, len - jae do_align + jae .Ldo_align # less_than_8 expects length in upper 3 bits of len_dw # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] shl $32-3+1, len_dw - jmp less_than_8_post_shl1 + jmp .Lless_than_8_post_shl1 -do_align: +.Ldo_align: #### Calculate CRC of unaligned bytes of the buffer (if any) movq (bufptmp), tmp # load a quadward from the buffer add %bufp, bufptmp # align buffer pointer for quadword # processing sub %bufp, len # update buffer length -align_loop: +.Lalign_loop: crc32b %bl, crc_init_dw # compute crc32 of 1-byte shr $8, tmp # get next byte dec %bufp - jne align_loop + jne .Lalign_loop -proc_block: +.Lproc_block: ################################################################ ## 2) PROCESS BLOCKS: @@ -141,11 +141,11 @@ proc_block: movq len, tmp # save num bytes in tmp cmpq $128*24, len - jae full_block + jae .Lfull_block -continue_block: +.Lcontinue_block: cmpq $SMALL_SIZE, len - jb small + jb .Lsmall ## len < 128*24 movq $2731, %rax # 2731 = ceil(2^16 / 24) @@ -168,13 +168,14 @@ continue_block: xor crc2, crc2 ## branch into array - mov jump_table(,%rax,8), %bufp + leaq jump_table(%rip), %bufp + mov (%bufp,%rax,8), %bufp JMP_NOSPEC bufp ################################################################ ## 2a) PROCESS FULL BLOCKS: ################################################################ -full_block: +.Lfull_block: movl $128,%eax lea 128*8*2(block_0), block_1 lea 128*8*3(block_0), block_2 @@ -189,7 +190,6 @@ full_block: ## 3) CRC Array: ################################################################ 
-crc_array:
i=128
.rept 128-1
.altmacro
@@ -242,28 +242,28 @@ LABEL crc_ 0
ENDBR
mov tmp, len
cmp $128*24, tmp
- jae full_block
+ jae .Lfull_block
cmp $24, tmp
- jae continue_block
+ jae .Lcontinue_block
-less_than_24:
+.Lless_than_24:
shl $32-4, len_dw # less_than_16 expects length
# in upper 4 bits of len_dw
- jnc less_than_16
+ jnc .Lless_than_16
crc32q (bufptmp), crc_init
crc32q 8(bufptmp), crc_init
- jz do_return
+ jz .Ldo_return
add $16, bufptmp
# len is less than 8 if we got here
# less_than_8 expects length in upper 3 bits of len_dw
# less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
shl $2, len_dw
- jmp less_than_8_post_shl1
+ jmp .Lless_than_8_post_shl1
#######################################################################
## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full)
#######################################################################
-small:
+.Lsmall:
shl $32-8, len_dw # Prepare len_dw for less_than_256
j=256
.rept 5 # j = {256, 128, 64, 32, 16}
@@ -279,32 +279,32 @@ LABEL less_than_ %j # less_than_j: Length should be in
crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data
i=i+8
.endr
- jz do_return # Return if remaining length is zero
+ jz .Ldo_return # Return if remaining length is zero
add $j, bufptmp # Advance buf
.endr
-less_than_8: # Length should be stored in
+.Lless_than_8: # Length should be stored in
# upper 3 bits of len_dw
shl $1, len_dw
-less_than_8_post_shl1:
- jnc less_than_4
+.Lless_than_8_post_shl1:
+ jnc .Lless_than_4
crc32l (bufptmp), crc_init_dw # CRC of 4 bytes
- jz do_return # return if remaining data is zero
+ jz .Ldo_return # return if remaining data is zero
add $4, bufptmp
-less_than_4: # Length should be stored in
+.Lless_than_4: # Length should be stored in
# upper 2 bits of len_dw
shl $1, len_dw
- jnc less_than_2
+ jnc .Lless_than_2
crc32w (bufptmp), crc_init_dw # CRC of 2 bytes
- jz do_return # return if remaining data is zero
+ jz .Ldo_return # return if remaining data is zero
add $2, bufptmp
-less_than_2: # Length should be stored in the MSB
+.Lless_than_2: # Length should be stored in the MSB
# of len_dw
shl $1, len_dw
- jnc less_than_1
+ jnc .Lless_than_1
crc32b (bufptmp), crc_init_dw # CRC of 1 byte
-less_than_1: # Length should be zero
-do_return:
+.Lless_than_1: # Length should be zero
+.Ldo_return:
movq crc_init, %rax
popq %rsi
popq %rdi
diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S
index f4c760f4cade..cf21b998e77c 100644
--- a/arch/x86/crypto/des3_ede-asm_64.S
+++ b/arch/x86/crypto/des3_ede-asm_64.S
@@ -129,21 +129,29 @@
movzbl RW0bl, RT2d; \
movzbl RW0bh, RT3d; \
shrq $16, RW0; \
- movq s8(, RT0, 8), RT0; \
- xorq s6(, RT1, 8), to; \
+ leaq s8(%rip), RW1; \
+ movq (RW1, RT0, 8), RT0; \
+ leaq s6(%rip), RW1; \
+ xorq (RW1, RT1, 8), to; \
movzbl RW0bl, RL1d; \
movzbl RW0bh, RT1d; \
shrl $16, RW0d; \
- xorq s4(, RT2, 8), RT0; \
- xorq s2(, RT3, 8), to; \
+ leaq s4(%rip), RW1; \
+ xorq (RW1, RT2, 8), RT0; \
+ leaq s2(%rip), RW1; \
+ xorq (RW1, RT3, 8), to; \
movzbl RW0bl, RT2d; \
movzbl RW0bh, RT3d; \
- xorq s7(, RL1, 8), RT0; \
- xorq s5(, RT1, 8), to; \
- xorq s3(, RT2, 8), RT0; \
+ leaq s7(%rip), RW1; \
+ xorq (RW1, RL1, 8), RT0; \
+ leaq s5(%rip), RW1; \
+ xorq (RW1, RT1, 8), to; \
+ leaq s3(%rip), RW1; \
+ xorq (RW1, RT2, 8), RT0; \
load_next_key(n, RW0); \
xorq RT0, to; \
- xorq s1(, RT3, 8), to; \
+ leaq s1(%rip), RW1; \
+ xorq (RW1, RT3, 8), to; \
#define load_next_key(n, RWx) \
movq (((n) + 1) * 8)(CTX), RWx;
@@ -355,65 +363,89 @@ SYM_FUNC_END(des3_ede_x86_64_crypt_blk)
movzbl RW0bl, RT3d; \
movzbl RW0bh, RT1d; \
shrq $16, RW0; \
- xorq s8(, RT3, 8), to##0; \
- xorq s6(, RT1, 8), to##0; \
+ leaq s8(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##0; \
+ leaq s6(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##0; \
movzbl RW0bl, RT3d; \
movzbl RW0bh, RT1d; \
shrq $16, RW0; \
- xorq s4(, RT3, 8), to##0; \
- xorq s2(, RT1, 8), to##0; \
+ leaq s4(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##0; \
+ leaq s2(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##0; \
movzbl RW0bl, RT3d; \
movzbl RW0bh, RT1d; \
shrl $16, RW0d; \
- xorq s7(, RT3, 8), to##0; \
- xorq s5(, RT1, 8), to##0; \
+ leaq s7(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##0; \
+ leaq s5(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##0; \
movzbl RW0bl, RT3d; \
movzbl RW0bh, RT1d; \
load_next_key(n, RW0); \
- xorq s3(, RT3, 8), to##0; \
- xorq s1(, RT1, 8), to##0; \
+ leaq s3(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##0; \
+ leaq s1(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##0; \
xorq from##1, RW1; \
movzbl RW1bl, RT3d; \
movzbl RW1bh, RT1d; \
shrq $16, RW1; \
- xorq s8(, RT3, 8), to##1; \
- xorq s6(, RT1, 8), to##1; \
+ leaq s8(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##1; \
+ leaq s6(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##1; \
movzbl RW1bl, RT3d; \
movzbl RW1bh, RT1d; \
shrq $16, RW1; \
- xorq s4(, RT3, 8), to##1; \
- xorq s2(, RT1, 8), to##1; \
+ leaq s4(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##1; \
+ leaq s2(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##1; \
movzbl RW1bl, RT3d; \
movzbl RW1bh, RT1d; \
shrl $16, RW1d; \
- xorq s7(, RT3, 8), to##1; \
- xorq s5(, RT1, 8), to##1; \
+ leaq s7(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##1; \
+ leaq s5(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##1; \
movzbl RW1bl, RT3d; \
movzbl RW1bh, RT1d; \
do_movq(RW0, RW1); \
- xorq s3(, RT3, 8), to##1; \
- xorq s1(, RT1, 8), to##1; \
+ leaq s3(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##1; \
+ leaq s1(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##1; \
xorq from##2, RW2; \
movzbl RW2bl, RT3d; \
movzbl RW2bh, RT1d; \
shrq $16, RW2; \
- xorq s8(, RT3, 8), to##2; \
- xorq s6(, RT1, 8), to##2; \
+ leaq s8(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##2; \
+ leaq s6(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##2; \
movzbl RW2bl, RT3d; \
movzbl RW2bh, RT1d; \
shrq $16, RW2; \
- xorq s4(, RT3, 8), to##2; \
- xorq s2(, RT1, 8), to##2; \
+ leaq s4(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##2; \
+ leaq s2(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##2; \
movzbl RW2bl, RT3d; \
movzbl RW2bh, RT1d; \
shrl $16, RW2d; \
- xorq s7(, RT3, 8), to##2; \
- xorq s5(, RT1, 8), to##2; \
+ leaq s7(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##2; \
+ leaq s5(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##2; \
movzbl RW2bl, RT3d; \
movzbl RW2bh, RT1d; \
do_movq(RW0, RW2); \
- xorq s3(, RT3, 8), to##2; \
- xorq s1(, RT1, 8), to##2;
+ leaq s3(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##2; \
+ leaq s1(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##2;
#define __movq(src, dst) \
movq src, dst;
diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S
index 257ed9446f3e..99cb983ded9e 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_asm.S
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -93,7 +93,7 @@ SYM_FUNC_START(clmul_ghash_mul)
FRAME_BEGIN
movups (%rdi), DATA
movups (%rsi), SHASH
- movaps .Lbswap_mask, BSWAP
+ movaps .Lbswap_mask(%rip), BSWAP
pshufb BSWAP, DATA
call __clmul_gf128mul_ble
pshufb BSWAP, DATA
@@ -110,7 +110,7 @@ SYM_FUNC_START(clmul_ghash_update)
FRAME_BEGIN
cmp $16, %rdx
jb .Lupdate_just_ret # check length
- movaps .Lbswap_mask, BSWAP
+ movaps .Lbswap_mask(%rip), BSWAP
movups (%rdi), DATA
movups (%rcx), SHASH
pshufb BSWAP, DATA
diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
index a96b2fd26dab..4b49bdc95265 100644
--- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S
+++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
@@ -485,18 +485,18 @@
xchg WK_BUF, PRECALC_BUF
.align 32
-_loop:
+.L_loop:
/*
* code loops through more than one block
* we use K_BASE value as a signal of a last block,
* it is set below by: cmovae BUFFER_PTR, K_BASE
*/
test BLOCKS_CTR, BLOCKS_CTR
- jnz _begin
+ jnz .L_begin
.align 32
- jmp _end
+ jmp .L_end
.align 32
-_begin:
+.L_begin:
/*
* Do first block
@@ -508,9 +508,6 @@ _begin:
.set j, j+2
.endr
- jmp _loop0
-_loop0:
-
/*
* rounds:
* 10,12,14,16,18
@@ -545,7 +542,7 @@ _loop0:
UPDATE_HASH 16(HASH_PTR), E
test BLOCKS_CTR, BLOCKS_CTR
- jz _loop
+ jz .L_loop
mov TB, B
@@ -562,8 +559,6 @@ _loop0:
.set j, j+2
.endr
- jmp _loop1
-_loop1:
/*
* rounds
* 20+80,22+80,24+80,26+80,28+80
@@ -574,9 +569,6 @@ _loop1:
.set j, j+2
.endr
- jmp _loop2
-_loop2:
-
/*
* rounds
* 40+80,42+80,44+80,46+80,48+80
@@ -592,9 +584,6 @@ _loop2:
/* Move to the next block only if needed*/
ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
- jmp _loop3
-_loop3:
-
/*
* rounds
* 60+80,62+80,64+80,66+80,68+80
@@ -623,10 +612,10 @@ _loop3:
xchg WK_BUF, PRECALC_BUF
- jmp _loop
+ jmp .L_loop
.align 32
- _end:
+.L_end:
.endm
/*
diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S
index 5555b5d5215a..53de72bdd851 100644
--- a/arch/x86/crypto/sha256-avx-asm.S
+++ b/arch/x86/crypto/sha256-avx-asm.S
@@ -360,7 +360,7 @@ SYM_TYPED_FUNC_START(sha256_transform_avx)
and $~15, %rsp # align stack pointer
shl $6, NUM_BLKS # convert to bytes
- jz done_hash
+ jz .Ldone_hash
add INP, NUM_BLKS # pointer to end of data
mov NUM_BLKS, _INP_END(%rsp)
@@ -377,7 +377,7 @@ SYM_TYPED_FUNC_START(sha256_transform_avx)
vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
vmovdqa _SHUF_00BA(%rip), SHUF_00BA
vmovdqa _SHUF_DC00(%rip), SHUF_DC00
-loop0:
+.Lloop0:
lea K256(%rip), TBL
## byte swap first 16 dwords
@@ -391,7 +391,7 @@ loop0:
## schedule 48 input dwords, by doing 3 rounds of 16 each
mov $3, SRND
.align 16
-loop1:
+.Lloop1:
vpaddd (TBL), X0, XFER
vmovdqa XFER, _XFER(%rsp)
FOUR_ROUNDS_AND_SCHED
@@ -410,10 +410,10 @@ loop1:
FOUR_ROUNDS_AND_SCHED
sub $1, SRND
- jne loop1
+ jne .Lloop1
mov $2, SRND
-loop2:
+.Lloop2:
vpaddd (TBL), X0, XFER
vmovdqa XFER, _XFER(%rsp)
DO_ROUND 0
@@ -433,7 +433,7 @@ loop2:
vmovdqa X3, X1
sub $1, SRND
- jne loop2
+ jne .Lloop2
addm (4*0)(CTX),a
addm (4*1)(CTX),b
@@ -447,9 +447,9 @@ loop2:
mov _INP(%rsp), INP
add $64, INP
cmp _INP_END(%rsp), INP
- jne loop0
+ jne .Lloop0
-done_hash:
+.Ldone_hash:
mov %rbp, %rsp
popq %rbp
diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S
index 3eada9416852..9918212faf91 100644
--- a/arch/x86/crypto/sha256-avx2-asm.S
+++ b/arch/x86/crypto/sha256-avx2-asm.S
@@ -538,12 +538,12 @@ SYM_TYPED_FUNC_START(sha256_transform_rorx)
and $-32, %rsp # align rsp to 32 byte boundary
shl $6, NUM_BLKS # convert to bytes
- jz done_hash
+ jz .Ldone_hash
lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
mov NUM_BLKS, _INP_END(%rsp)
cmp NUM_BLKS, INP
- je only_one_block
+ je .Lonly_one_block
## load initial digest
mov (CTX), a
@@ -561,7 +561,7 @@ SYM_TYPED_FUNC_START(sha256_transform_rorx)
mov CTX, _CTX(%rsp)
-loop0:
+.Lloop0:
## Load first 16 dwords from two blocks
VMOVDQ 0*32(INP),XTMP0
VMOVDQ 1*32(INP),XTMP1
@@ -580,7 +580,7 @@ loop0:
vperm2i128 $0x20, XTMP3, XTMP1, X2
vperm2i128 $0x31, XTMP3, XTMP1, X3
-last_block_enter:
+.Llast_block_enter:
add $64, INP
mov INP, _INP(%rsp)
@@ -588,34 +588,40 @@ last_block_enter:
xor SRND, SRND
.align 16
-loop1:
- vpaddd K256+0*32(SRND), X0, XFER
+.Lloop1:
+ leaq K256+0*32(%rip), INP ## reuse INP as scratch reg
+ vpaddd (INP, SRND), X0, XFER
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 0*32
- vpaddd K256+1*32(SRND), X0, XFER
+ leaq K256+1*32(%rip), INP
+ vpaddd (INP, SRND), X0, XFER
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 1*32
- vpaddd K256+2*32(SRND), X0, XFER
+ leaq K256+2*32(%rip), INP
+ vpaddd (INP, SRND), X0, XFER
vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 2*32
- vpaddd K256+3*32(SRND), X0, XFER
+ leaq K256+3*32(%rip), INP
+ vpaddd (INP, SRND), X0, XFER
vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 3*32
add $4*32, SRND
cmp $3*4*32, SRND
- jb loop1
+ jb .Lloop1
-loop2:
+.Lloop2:
## Do last 16 rounds with no scheduling
- vpaddd K256+0*32(SRND), X0, XFER
+ leaq K256+0*32(%rip), INP
+ vpaddd (INP, SRND), X0, XFER
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
DO_4ROUNDS _XFER + 0*32
- vpaddd K256+1*32(SRND), X1, XFER
+ leaq K256+1*32(%rip), INP
+ vpaddd (INP, SRND), X1, XFER
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
DO_4ROUNDS _XFER + 1*32
add $2*32, SRND
@@ -624,7 +630,7 @@ loop2:
vmovdqa X3, X1
cmp $4*4*32, SRND
- jb loop2
+ jb .Lloop2
mov _CTX(%rsp), CTX
mov _INP(%rsp), INP
@@ -639,17 +645,17 @@ loop2:
addm (4*7)(CTX),h
cmp _INP_END(%rsp), INP
- ja done_hash
+ ja .Ldone_hash
#### Do second block using previously scheduled results
xor SRND, SRND
.align 16
-loop3:
+.Lloop3:
DO_4ROUNDS _XFER + 0*32 + 16
DO_4ROUNDS _XFER + 1*32 + 16
add $2*32, SRND
cmp $4*4*32, SRND
- jb loop3
+ jb .Lloop3
mov _CTX(%rsp), CTX
mov _INP(%rsp), INP
@@ -665,10 +671,10 @@ loop3:
addm (4*7)(CTX),h
cmp _INP_END(%rsp), INP
- jb loop0
- ja done_hash
+ jb .Lloop0
+ ja .Ldone_hash
-do_last_block:
+.Ldo_last_block:
VMOVDQ 0*16(INP),XWORD0
VMOVDQ 1*16(INP),XWORD1
VMOVDQ 2*16(INP),XWORD2
@@ -679,9 +685,9 @@ do_last_block:
vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2
vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3
- jmp last_block_enter
+ jmp .Llast_block_enter
-only_one_block:
+.Lonly_one_block:
## load initial digest
mov (4*0)(CTX),a
@@ -698,9 +704,9 @@ only_one_block:
vmovdqa _SHUF_DC00(%rip), SHUF_DC00
mov CTX, _CTX(%rsp)
- jmp do_last_block
+ jmp .Ldo_last_block
-done_hash:
+.Ldone_hash:
mov %rbp, %rsp
pop %rbp
diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S
index 959288eecc68..93264ee44543 100644
--- a/arch/x86/crypto/sha256-ssse3-asm.S
+++ b/arch/x86/crypto/sha256-ssse3-asm.S
@@ -369,7 +369,7 @@ SYM_TYPED_FUNC_START(sha256_transform_ssse3)
and $~15, %rsp
shl $6, NUM_BLKS # convert to bytes
- jz done_hash
+ jz .Ldone_hash
add INP, NUM_BLKS
mov NUM_BLKS, _INP_END(%rsp) # pointer to end of data
@@ -387,7 +387,7 @@ SYM_TYPED_FUNC_START(sha256_transform_ssse3)
movdqa _SHUF_00BA(%rip), SHUF_00BA
movdqa _SHUF_DC00(%rip), SHUF_DC00
-loop0:
+.Lloop0:
lea K256(%rip), TBL
## byte swap first 16 dwords
@@ -401,7 +401,7 @@ loop0:
## schedule 48 input dwords, by doing 3 rounds of 16 each
mov $3, SRND
.align 16
-loop1:
+.Lloop1:
movdqa (TBL), XFER
paddd X0, XFER
movdqa XFER, _XFER(%rsp)
@@ -424,10 +424,10 @@ loop1:
FOUR_ROUNDS_AND_SCHED
sub $1, SRND
- jne loop1
+ jne .Lloop1
mov $2, SRND
-loop2:
+.Lloop2:
paddd (TBL), X0
movdqa X0, _XFER(%rsp)
DO_ROUND 0
@@ -446,7 +446,7 @@ loop2:
movdqa X3, X1
sub $1, SRND
- jne loop2
+ jne .Lloop2
addm (4*0)(CTX),a
addm (4*1)(CTX),b
@@ -460,9 +460,9 @@ loop2:
mov _INP(%rsp), INP
add $64, INP
cmp _INP_END(%rsp), INP
- jne loop0
+ jne .Lloop0
-done_hash:
+.Ldone_hash:
mov %rbp, %rsp
popq %rbp
diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S
index b0984f19fdb4..d902b8ea0721 100644
--- a/arch/x86/crypto/sha512-avx-asm.S
+++ b/arch/x86/crypto/sha512-avx-asm.S
@@ -276,7 +276,7 @@ frame_size = frame_WK + WK_SIZE
########################################################################
SYM_TYPED_FUNC_START(sha512_transform_avx)
test msglen, msglen
- je nowork
+ je .Lnowork
# Save GPRs
push %rbx
@@ -291,7 +291,7 @@ SYM_TYPED_FUNC_START(sha512_transform_avx)
sub $frame_size, %rsp
and $~(0x20 - 1), %rsp
-updateblock:
+.Lupdateblock:
# Load state variables
mov DIGEST(0), a_64
@@ -348,7 +348,7 @@ updateblock:
# Advance to next message block
add $16*8, msg
dec msglen
- jnz updateblock
+ jnz .Lupdateblock
# Restore Stack Pointer
mov %rbp, %rsp
@@ -361,7 +361,7 @@ updateblock:
pop %r12
pop %rbx
-nowork:
+.Lnowork:
RET
SYM_FUNC_END(sha512_transform_avx)
diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
index b1ca99055ef9..f08496cd6870 100644
--- a/arch/x86/crypto/sha512-avx2-asm.S
+++ b/arch/x86/crypto/sha512-avx2-asm.S
@@ -581,7 +581,7 @@ SYM_TYPED_FUNC_START(sha512_transform_rorx)
and $~(0x20 - 1), %rsp
shl $7, NUM_BLKS # convert to bytes
- jz done_hash
+ jz .Ldone_hash
add INP, NUM_BLKS # pointer to end of data
mov NUM_BLKS, frame_INPEND(%rsp)
@@ -600,7 +600,7 @@ SYM_TYPED_FUNC_START(sha512_transform_rorx)
vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
-loop0:
+.Lloop0:
lea K512(%rip), TBL
## byte swap first 16 dwords
@@ -615,7 +615,7 @@ loop0:
movq $4, frame_SRND(%rsp)
.align 16
-loop1:
+.Lloop1:
vpaddq (TBL), Y_0, XFER
vmovdqa XFER, frame_XFER(%rsp)
FOUR_ROUNDS_AND_SCHED
@@ -634,10 +634,10 @@ loop1:
FOUR_ROUNDS_AND_SCHED
subq $1, frame_SRND(%rsp)
- jne loop1
+ jne .Lloop1
movq $2, frame_SRND(%rsp)
-loop2:
+.Lloop2:
vpaddq (TBL), Y_0, XFER
vmovdqa XFER, frame_XFER(%rsp)
DO_4ROUNDS
@@ -650,7 +650,7 @@ loop2:
vmovdqa Y_3, Y_1
subq $1, frame_SRND(%rsp)
- jne loop2
+ jne .Lloop2
mov frame_CTX(%rsp), CTX2
addm 8*0(CTX2), a
@@ -665,9 +665,9 @@ loop2:
mov frame_INP(%rsp), INP
add $128, INP
cmp frame_INPEND(%rsp), INP
- jne loop0
+ jne .Lloop0
-done_hash:
+.Ldone_hash:
# Restore Stack Pointer
mov %rbp, %rsp
diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S
index c06afb5270e5..65be30156816 100644
--- a/arch/x86/crypto/sha512-ssse3-asm.S
+++ b/arch/x86/crypto/sha512-ssse3-asm.S
@@ -278,7 +278,7 @@ frame_size = frame_WK + WK_SIZE
SYM_TYPED_FUNC_START(sha512_transform_ssse3)
test msglen, msglen
- je nowork
+ je .Lnowork
# Save GPRs
push %rbx
@@ -293,7 +293,7 @@ SYM_TYPED_FUNC_START(sha512_transform_ssse3)
sub $frame_size, %rsp
and $~(0x20 - 1), %rsp
-updateblock:
+.Lupdateblock:
# Load state variables
mov DIGEST(0), a_64
@@ -350,7 +350,7 @@ updateblock:
# Advance to next message block
add $16*8, msg
dec msglen
- jnz updateblock
+ jnz .Lupdateblock
# Restore Stack Pointer
mov %rbp, %rsp
@@ -363,7 +363,7 @@ updateblock:
pop %r12
pop %rbx
-nowork:
+.Lnowork:
RET
SYM_FUNC_END(sha512_transform_ssse3)
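
Two mechanical patterns account for nearly all of the hunks above. First, table references whose address is encoded as an absolute displacement, such as `xorq s1(, RT3, 8), to` or `vpaddd K256+0*32(SRND), X0, XFER`, become a `leaq table(%rip), reg` followed by an access indexed off that register, so the instruction carries a PC-relative displacement instead of a 32-bit absolute address. Second, bare jump targets such as `do_return` or `loop0` are renamed to assembler-local `.L` symbols, which GNU as keeps out of the object file's symbol table. A minimal standalone sketch of both patterns follows; the table `my_table` and the function `lookup8` are hypothetical illustrations written for plain GNU as (no SYM_FUNC_START/RET/ENDBR kernel annotations), not code from this tree:

        .section .rodata
        .align 8
my_table:                               # hypothetical 4-entry lookup table
        .quad 0, 1, 2, 3

        .text
        .globl lookup8
lookup8:                                # uint64_t lookup8(uint64_t idx), idx in %rdi
        # old-style absolute form (needs a 32-bit absolute relocation):
        #       movq my_table(, %rdi, 8), %rax
        leaq my_table(%rip), %rcx       # take the table address PC-relatively
        movq (%rcx, %rdi, 8), %rax      # then index off the register
        testq %rax, %rax
        jz .Lzero                       # .L targets never land in the symbol table
        ret
.Lzero:
        xorl %eax, %eax
        ret

Assembled with `as -o sketch.o sketch.S`, `nm sketch.o` lists `lookup8` and `my_table` but not `.Lzero`, and `objdump -dr` shows a PC-relative (R_X86_64_PC32) relocation for the `leaq`; the commented-out absolute form would instead require an R_X86_64_32S relocation, which is what the RIP-relative rewrite avoids.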