summary | refs | log | tree | commit | diff
path: root/arch/x86
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2023-04-26 08:32:52 -0700
committer Linus Torvalds <torvalds@linux-foundation.org> 2023-04-26 08:32:52 -0700
commit 733f7e9c18c5e377025c1bfdce6bc9a7d55649be (patch)
tree 19adc4c70522756ef682181d58b231005fed5a32 /arch/x86
parent 98f99e67a1dc456e9a542584819b2aa265ffc737 (diff)
parent 482c84e906e535072c55395acabd3a58e9443d12 (diff)
Merge tag 'v6.4-p1' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto updates from Herbert Xu:
 "API:
   - Total usage stats now include all that returned errors (instead of just some)
   - Remove maximum hash statesize limit
   - Add cloning support for hmac and unkeyed hashes
   - Demote BUG_ON in crypto_unregister_alg to a WARN_ON

  Algorithms:
   - Use RIP-relative addressing on x86 to prepare for PIE build
   - Add accelerated AES/GCM stitched implementation on powerpc P10
   - Add some test vectors for cmac(camellia)
   - Remove failure case where jent is unavailable outside of FIPS mode in drbg
   - Add permanent and intermittent health error checks in jitter RNG

  Drivers:
   - Add support for 402xx devices in qat
   - Add support for HiSTB TRNG
   - Fix hash concurrency issues in stm32
   - Add OP-TEE firmware support in caam"

* tag 'v6.4-p1' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (139 commits)
  i2c: designware: Add doorbell support for Mendocino
  i2c: designware: Use PCI PSP driver for communication
  powerpc: Move Power10 feature PPC_MODULE_FEATURE_P10
  crypto: p10-aes-gcm - Remove POWER10_CPU dependency
  crypto: testmgr - Add some test vectors for cmac(camellia)
  crypto: cryptd - Add support for cloning hashes
  crypto: cryptd - Convert hash to use modern init_tfm/exit_tfm
  crypto: hmac - Add support for cloning
  crypto: hash - Add crypto_clone_ahash/shash
  crypto: api - Add crypto_clone_tfm
  crypto: api - Add crypto_tfm_get
  crypto: x86/sha - Use local .L symbols for code
  crypto: x86/crc32 - Use local .L symbols for code
  crypto: x86/aesni - Use local .L symbols for code
  crypto: x86/sha256 - Use RIP-relative addressing
  crypto: x86/ghash - Use RIP-relative addressing
  crypto: x86/des3 - Use RIP-relative addressing
  crypto: x86/crc32c - Use RIP-relative addressing
  crypto: x86/cast6 - Use RIP-relative addressing
  crypto: x86/cast5 - Use RIP-relative addressing
  ...
Diffstat (limited to 'arch/x86')
-rw-r--r-- arch/x86/crypto/aegis128-aesni-asm.S 6
-rw-r--r-- arch/x86/crypto/aesni-intel_asm.S 198
-rw-r--r-- arch/x86/crypto/aesni-intel_avx-x86_64.S 254
-rw-r--r-- arch/x86/crypto/aria-aesni-avx-asm_64.S 28
-rw-r--r-- arch/x86/crypto/aria-aesni-avx2-asm_64.S 28
-rw-r--r-- arch/x86/crypto/aria-gfni-avx512-asm_64.S 24
-rw-r--r-- arch/x86/crypto/camellia-aesni-avx-asm_64.S 30
-rw-r--r-- arch/x86/crypto/camellia-aesni-avx2-asm_64.S 30
-rw-r--r-- arch/x86/crypto/camellia-x86_64-asm_64.S 6
-rw-r--r-- arch/x86/crypto/cast5-avx-x86_64-asm_64.S 38
-rw-r--r-- arch/x86/crypto/cast6-avx-x86_64-asm_64.S 32
-rw-r--r-- arch/x86/crypto/crc32-pclmul_asm.S 16
-rw-r--r-- arch/x86/crypto/crc32c-pcl-intel-asm_64.S 70
-rw-r--r-- arch/x86/crypto/des3_ede-asm_64.S 96
-rw-r--r-- arch/x86/crypto/ghash-clmulni-intel_asm.S 4
-rw-r--r-- arch/x86/crypto/sha1_avx2_x86_64_asm.S 25
-rw-r--r-- arch/x86/crypto/sha256-avx-asm.S 16
-rw-r--r-- arch/x86/crypto/sha256-avx2-asm.S 54
-rw-r--r-- arch/x86/crypto/sha256-ssse3-asm.S 16
-rw-r--r-- arch/x86/crypto/sha512-avx-asm.S 8
-rw-r--r-- arch/x86/crypto/sha512-avx2-asm.S 16
-rw-r--r-- arch/x86/crypto/sha512-ssse3-asm.S 8
-rw-r--r-- arch/x86/kvm/svm/sev.c 1
23 files changed, 510 insertions, 494 deletions
diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
index cdf3215ec272..ad7f4c891625 100644
--- a/arch/x86/crypto/aegis128-aesni-asm.S
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -201,8 +201,8 @@ SYM_FUNC_START(crypto_aegis128_aesni_init)
movdqa KEY, STATE4
/* load the constants: */
- movdqa .Laegis128_const_0, STATE2
- movdqa .Laegis128_const_1, STATE1
+ movdqa .Laegis128_const_0(%rip), STATE2
+ movdqa .Laegis128_const_1(%rip), STATE1
pxor STATE2, STATE3
pxor STATE1, STATE4
@@ -682,7 +682,7 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail)
punpcklbw T0, T0
punpcklbw T0, T0
punpcklbw T0, T0
- movdqa .Laegis128_counter, T1
+ movdqa .Laegis128_counter(%rip), T1
pcmpgtb T1, T0
pand T0, MSG
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 837c1e0aa021..3ac7487ecad2 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -288,53 +288,53 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff
# Encrypt/Decrypt first few blocks
and $(3<<4), %r12
- jz _initial_num_blocks_is_0_\@
+ jz .L_initial_num_blocks_is_0_\@
cmp $(2<<4), %r12
- jb _initial_num_blocks_is_1_\@
- je _initial_num_blocks_is_2_\@
-_initial_num_blocks_is_3_\@:
+ jb .L_initial_num_blocks_is_1_\@
+ je .L_initial_num_blocks_is_2_\@
+.L_initial_num_blocks_is_3_\@:
INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
sub $48, %r13
- jmp _initial_blocks_\@
-_initial_num_blocks_is_2_\@:
+ jmp .L_initial_blocks_\@
+.L_initial_num_blocks_is_2_\@:
INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
sub $32, %r13
- jmp _initial_blocks_\@
-_initial_num_blocks_is_1_\@:
+ jmp .L_initial_blocks_\@
+.L_initial_num_blocks_is_1_\@:
INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
sub $16, %r13
- jmp _initial_blocks_\@
-_initial_num_blocks_is_0_\@:
+ jmp .L_initial_blocks_\@
+.L_initial_num_blocks_is_0_\@:
INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
-_initial_blocks_\@:
+.L_initial_blocks_\@:
# Main loop - Encrypt/Decrypt remaining blocks
test %r13, %r13
- je _zero_cipher_left_\@
+ je .L_zero_cipher_left_\@
sub $64, %r13
- je _four_cipher_left_\@
-_crypt_by_4_\@:
+ je .L_four_cipher_left_\@
+.L_crypt_by_4_\@:
GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \
%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
%xmm7, %xmm8, enc
add $64, %r11
sub $64, %r13
- jne _crypt_by_4_\@
-_four_cipher_left_\@:
+ jne .L_crypt_by_4_\@
+.L_four_cipher_left_\@:
GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
-_zero_cipher_left_\@:
+.L_zero_cipher_left_\@:
movdqu %xmm8, AadHash(%arg2)
movdqu %xmm0, CurCount(%arg2)
mov %arg5, %r13
and $15, %r13 # %r13 = arg5 (mod 16)
- je _multiple_of_16_bytes_\@
+ je .L_multiple_of_16_bytes_\@
mov %r13, PBlockLen(%arg2)
@@ -348,14 +348,14 @@ _zero_cipher_left_\@:
movdqu %xmm0, PBlockEncKey(%arg2)
cmp $16, %arg5
- jge _large_enough_update_\@
+ jge .L_large_enough_update_\@
lea (%arg4,%r11,1), %r10
mov %r13, %r12
READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
- jmp _data_read_\@
+ jmp .L_data_read_\@
-_large_enough_update_\@:
+.L_large_enough_update_\@:
sub $16, %r11
add %r13, %r11
@@ -374,7 +374,7 @@ _large_enough_update_\@:
# shift right 16-r13 bytes
pshufb %xmm2, %xmm1
-_data_read_\@:
+.L_data_read_\@:
lea ALL_F+16(%rip), %r12
sub %r13, %r12
@@ -409,19 +409,19 @@ _data_read_\@:
# Output %r13 bytes
movq %xmm0, %rax
cmp $8, %r13
- jle _less_than_8_bytes_left_\@
+ jle .L_less_than_8_bytes_left_\@
mov %rax, (%arg3 , %r11, 1)
add $8, %r11
psrldq $8, %xmm0
movq %xmm0, %rax
sub $8, %r13
-_less_than_8_bytes_left_\@:
+.L_less_than_8_bytes_left_\@:
mov %al, (%arg3, %r11, 1)
add $1, %r11
shr $8, %rax
sub $1, %r13
- jne _less_than_8_bytes_left_\@
-_multiple_of_16_bytes_\@:
+ jne .L_less_than_8_bytes_left_\@
+.L_multiple_of_16_bytes_\@:
.endm
# GCM_COMPLETE Finishes update of tag of last partial block
@@ -434,11 +434,11 @@ _multiple_of_16_bytes_\@:
mov PBlockLen(%arg2), %r12
test %r12, %r12
- je _partial_done\@
+ je .L_partial_done\@
GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
-_partial_done\@:
+.L_partial_done\@:
mov AadLen(%arg2), %r12 # %r13 = aadLen (number of bytes)
shl $3, %r12 # convert into number of bits
movd %r12d, %xmm15 # len(A) in %xmm15
@@ -457,44 +457,44 @@ _partial_done\@:
movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0
ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
pxor %xmm8, %xmm0
-_return_T_\@:
+.L_return_T_\@:
mov \AUTHTAG, %r10 # %r10 = authTag
mov \AUTHTAGLEN, %r11 # %r11 = auth_tag_len
cmp $16, %r11
- je _T_16_\@
+ je .L_T_16_\@
cmp $8, %r11
- jl _T_4_\@
-_T_8_\@:
+ jl .L_T_4_\@
+.L_T_8_\@:
movq %xmm0, %rax
mov %rax, (%r10)
add $8, %r10
sub $8, %r11
psrldq $8, %xmm0
test %r11, %r11
- je _return_T_done_\@
-_T_4_\@:
+ je .L_return_T_done_\@
+.L_T_4_\@:
movd %xmm0, %eax
mov %eax, (%r10)
add $4, %r10
sub $4, %r11
psrldq $4, %xmm0
test %r11, %r11
- je _return_T_done_\@
-_T_123_\@:
+ je .L_return_T_done_\@
+.L_T_123_\@:
movd %xmm0, %eax
cmp $2, %r11
- jl _T_1_\@
+ jl .L_T_1_\@
mov %ax, (%r10)
cmp $2, %r11
- je _return_T_done_\@
+ je .L_return_T_done_\@
add $2, %r10
sar $16, %eax
-_T_1_\@:
+.L_T_1_\@:
mov %al, (%r10)
- jmp _return_T_done_\@
-_T_16_\@:
+ jmp .L_return_T_done_\@
+.L_T_16_\@:
movdqu %xmm0, (%r10)
-_return_T_done_\@:
+.L_return_T_done_\@:
.endm
#ifdef __x86_64__
@@ -563,30 +563,30 @@ _return_T_done_\@:
# Clobbers %rax, DLEN and XMM1
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
cmp $8, \DLEN
- jl _read_lt8_\@
+ jl .L_read_lt8_\@
mov (\DPTR), %rax
movq %rax, \XMMDst
sub $8, \DLEN
- jz _done_read_partial_block_\@
+ jz .L_done_read_partial_block_\@
xor %eax, %eax
-_read_next_byte_\@:
+.L_read_next_byte_\@:
shl $8, %rax
mov 7(\DPTR, \DLEN, 1), %al
dec \DLEN
- jnz _read_next_byte_\@
+ jnz .L_read_next_byte_\@
movq %rax, \XMM1
pslldq $8, \XMM1
por \XMM1, \XMMDst
- jmp _done_read_partial_block_\@
-_read_lt8_\@:
+ jmp .L_done_read_partial_block_\@
+.L_read_lt8_\@:
xor %eax, %eax
-_read_next_byte_lt8_\@:
+.L_read_next_byte_lt8_\@:
shl $8, %rax
mov -1(\DPTR, \DLEN, 1), %al
dec \DLEN
- jnz _read_next_byte_lt8_\@
+ jnz .L_read_next_byte_lt8_\@
movq %rax, \XMMDst
-_done_read_partial_block_\@:
+.L_done_read_partial_block_\@:
.endm
# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
@@ -600,8 +600,8 @@ _done_read_partial_block_\@:
pxor \TMP6, \TMP6
cmp $16, %r11
- jl _get_AAD_rest\@
-_get_AAD_blocks\@:
+ jl .L_get_AAD_rest\@
+.L_get_AAD_blocks\@:
movdqu (%r10), \TMP7
pshufb %xmm14, \TMP7 # byte-reflect the AAD data
pxor \TMP7, \TMP6
@@ -609,14 +609,14 @@ _get_AAD_blocks\@:
add $16, %r10
sub $16, %r11
cmp $16, %r11
- jge _get_AAD_blocks\@
+ jge .L_get_AAD_blocks\@
movdqu \TMP6, \TMP7
/* read the last <16B of AAD */
-_get_AAD_rest\@:
+.L_get_AAD_rest\@:
test %r11, %r11
- je _get_AAD_done\@
+ je .L_get_AAD_done\@
READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
pshufb %xmm14, \TMP7 # byte-reflect the AAD data
@@ -624,7 +624,7 @@ _get_AAD_rest\@:
GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
movdqu \TMP7, \TMP6
-_get_AAD_done\@:
+.L_get_AAD_done\@:
movdqu \TMP6, AadHash(%arg2)
.endm
@@ -637,21 +637,21 @@ _get_AAD_done\@:
AAD_HASH operation
mov PBlockLen(%arg2), %r13
test %r13, %r13
- je _partial_block_done_\@ # Leave Macro if no partial blocks
+ je .L_partial_block_done_\@ # Leave Macro if no partial blocks
# Read in input data without over reading
cmp $16, \PLAIN_CYPH_LEN
- jl _fewer_than_16_bytes_\@
+ jl .L_fewer_than_16_bytes_\@
movups (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
- jmp _data_read_\@
+ jmp .L_data_read_\@
-_fewer_than_16_bytes_\@:
+.L_fewer_than_16_bytes_\@:
lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
mov \PLAIN_CYPH_LEN, %r12
READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
mov PBlockLen(%arg2), %r13
-_data_read_\@: # Finished reading in data
+.L_data_read_\@: # Finished reading in data
movdqu PBlockEncKey(%arg2), %xmm9
movdqu HashKey(%arg2), %xmm13
@@ -674,9 +674,9 @@ _data_read_\@: # Finished reading in data
sub $16, %r10
# Determine if if partial block is not being filled and
# shift mask accordingly
- jge _no_extra_mask_1_\@
+ jge .L_no_extra_mask_1_\@
sub %r10, %r12
-_no_extra_mask_1_\@:
+.L_no_extra_mask_1_\@:
movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
# get the appropriate mask to mask out bottom r13 bytes of xmm9
@@ -689,17 +689,17 @@ _no_extra_mask_1_\@:
pxor %xmm3, \AAD_HASH
test %r10, %r10
- jl _partial_incomplete_1_\@
+ jl .L_partial_incomplete_1_\@
# GHASH computation for the last <16 Byte block
GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
xor %eax, %eax
mov %rax, PBlockLen(%arg2)
- jmp _dec_done_\@
-_partial_incomplete_1_\@:
+ jmp .L_dec_done_\@
+.L_partial_incomplete_1_\@:
add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
-_dec_done_\@:
+.L_dec_done_\@:
movdqu \AAD_HASH, AadHash(%arg2)
.else
pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn)
@@ -710,9 +710,9 @@ _dec_done_\@:
sub $16, %r10
# Determine if if partial block is not being filled and
# shift mask accordingly
- jge _no_extra_mask_2_\@
+ jge .L_no_extra_mask_2_\@
sub %r10, %r12
-_no_extra_mask_2_\@:
+.L_no_extra_mask_2_\@:
movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
# get the appropriate mask to mask out bottom r13 bytes of xmm9
@@ -724,17 +724,17 @@ _no_extra_mask_2_\@:
pxor %xmm9, \AAD_HASH
test %r10, %r10
- jl _partial_incomplete_2_\@
+ jl .L_partial_incomplete_2_\@
# GHASH computation for the last <16 Byte block
GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
xor %eax, %eax
mov %rax, PBlockLen(%arg2)
- jmp _encode_done_\@
-_partial_incomplete_2_\@:
+ jmp .L_encode_done_\@
+.L_partial_incomplete_2_\@:
add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
-_encode_done_\@:
+.L_encode_done_\@:
movdqu \AAD_HASH, AadHash(%arg2)
movdqa SHUF_MASK(%rip), %xmm10
@@ -744,32 +744,32 @@ _encode_done_\@:
.endif
# output encrypted Bytes
test %r10, %r10
- jl _partial_fill_\@
+ jl .L_partial_fill_\@
mov %r13, %r12
mov $16, %r13
# Set r13 to be the number of bytes to write out
sub %r12, %r13
- jmp _count_set_\@
-_partial_fill_\@:
+ jmp .L_count_set_\@
+.L_partial_fill_\@:
mov \PLAIN_CYPH_LEN, %r13
-_count_set_\@:
+.L_count_set_\@:
movdqa %xmm9, %xmm0
movq %xmm0, %rax
cmp $8, %r13
- jle _less_than_8_bytes_left_\@
+ jle .L_less_than_8_bytes_left_\@
mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
add $8, \DATA_OFFSET
psrldq $8, %xmm0
movq %xmm0, %rax
sub $8, %r13
-_less_than_8_bytes_left_\@:
+.L_less_than_8_bytes_left_\@:
movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
add $1, \DATA_OFFSET
shr $8, %rax
sub $1, %r13
- jne _less_than_8_bytes_left_\@
-_partial_block_done_\@:
+ jne .L_less_than_8_bytes_left_\@
+.L_partial_block_done_\@:
.endm # PARTIAL_BLOCK
/*
@@ -813,14 +813,14 @@ _partial_block_done_\@:
shr $2,%eax # 128->4, 192->6, 256->8
add $5,%eax # 128->9, 192->11, 256->13
-aes_loop_initial_\@:
+.Laes_loop_initial_\@:
MOVADQ (%r10),\TMP1
.irpc index, \i_seq
aesenc \TMP1, %xmm\index
.endr
add $16,%r10
sub $1,%eax
- jnz aes_loop_initial_\@
+ jnz .Laes_loop_initial_\@
MOVADQ (%r10), \TMP1
.irpc index, \i_seq
@@ -861,7 +861,7 @@ aes_loop_initial_\@:
GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
cmp $64, %r13
- jl _initial_blocks_done\@
+ jl .L_initial_blocks_done\@
# no need for precomputed values
/*
*
@@ -908,18 +908,18 @@ aes_loop_initial_\@:
mov keysize,%eax
shr $2,%eax # 128->4, 192->6, 256->8
sub $4,%eax # 128->0, 192->2, 256->4
- jz aes_loop_pre_done\@
+ jz .Laes_loop_pre_done\@
-aes_loop_pre_\@:
+.Laes_loop_pre_\@:
MOVADQ (%r10),\TMP2
.irpc index, 1234
aesenc \TMP2, %xmm\index
.endr
add $16,%r10
sub $1,%eax
- jnz aes_loop_pre_\@
+ jnz .Laes_loop_pre_\@
-aes_loop_pre_done\@:
+.Laes_loop_pre_done\@:
MOVADQ (%r10), \TMP2
aesenclast \TMP2, \XMM1
aesenclast \TMP2, \XMM2
@@ -963,7 +963,7 @@ aes_loop_pre_done\@:
pshufb %xmm14, \XMM3 # perform a 16 byte swap
pshufb %xmm14, \XMM4 # perform a 16 byte swap
-_initial_blocks_done\@:
+.L_initial_blocks_done\@:
.endm
@@ -1095,18 +1095,18 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
mov keysize,%eax
shr $2,%eax # 128->4, 192->6, 256->8
sub $4,%eax # 128->0, 192->2, 256->4
- jz aes_loop_par_enc_done\@
+ jz .Laes_loop_par_enc_done\@
-aes_loop_par_enc\@:
+.Laes_loop_par_enc\@:
MOVADQ (%r10),\TMP3
.irpc index, 1234
aesenc \TMP3, %xmm\index
.endr
add $16,%r10
sub $1,%eax
- jnz aes_loop_par_enc\@
+ jnz .Laes_loop_par_enc\@
-aes_loop_par_enc_done\@:
+.Laes_loop_par_enc_done\@:
MOVADQ (%r10), \TMP3
aesenclast \TMP3, \XMM1 # Round 10
aesenclast \TMP3, \XMM2
@@ -1303,18 +1303,18 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
mov keysize,%eax
shr $2,%eax # 128->4, 192->6, 256->8
sub $4,%eax # 128->0, 192->2, 256->4
- jz aes_loop_par_dec_done\@
+ jz .Laes_loop_par_dec_done\@
-aes_loop_par_dec\@:
+.Laes_loop_par_dec\@:
MOVADQ (%r10),\TMP3
.irpc index, 1234
aesenc \TMP3, %xmm\index
.endr
add $16,%r10
sub $1,%eax
- jnz aes_loop_par_dec\@
+ jnz .Laes_loop_par_dec\@
-aes_loop_par_dec_done\@:
+.Laes_loop_par_dec_done\@:
MOVADQ (%r10), \TMP3
aesenclast \TMP3, \XMM1 # last round
aesenclast \TMP3, \XMM2
@@ -2717,7 +2717,7 @@ SYM_FUNC_END(aesni_cts_cbc_dec)
* BSWAP_MASK == endian swapping mask
*/
SYM_FUNC_START_LOCAL(_aesni_inc_init)
- movaps .Lbswap_mask, BSWAP_MASK
+ movaps .Lbswap_mask(%rip), BSWAP_MASK
movaps IV, CTR
pshufb BSWAP_MASK, CTR
mov $1, TCTR_LOW
diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index 0852ab573fd3..46cddd78857b 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -154,30 +154,6 @@ SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F: .octa 0xffffffffffffffffffffffffffffffff
.octa 0x00000000000000000000000000000000
-.section .rodata
-.align 16
-.type aad_shift_arr, @object
-.size aad_shift_arr, 272
-aad_shift_arr:
- .octa 0xffffffffffffffffffffffffffffffff
- .octa 0xffffffffffffffffffffffffffffff0C
- .octa 0xffffffffffffffffffffffffffff0D0C
- .octa 0xffffffffffffffffffffffffff0E0D0C
- .octa 0xffffffffffffffffffffffff0F0E0D0C
- .octa 0xffffffffffffffffffffff0C0B0A0908
- .octa 0xffffffffffffffffffff0D0C0B0A0908
- .octa 0xffffffffffffffffff0E0D0C0B0A0908
- .octa 0xffffffffffffffff0F0E0D0C0B0A0908
- .octa 0xffffffffffffff0C0B0A090807060504
- .octa 0xffffffffffff0D0C0B0A090807060504
- .octa 0xffffffffff0E0D0C0B0A090807060504
- .octa 0xffffffff0F0E0D0C0B0A090807060504
- .octa 0xffffff0C0B0A09080706050403020100
- .octa 0xffff0D0C0B0A09080706050403020100
- .octa 0xff0E0D0C0B0A09080706050403020100
- .octa 0x0F0E0D0C0B0A09080706050403020100
-
-
.text
@@ -302,68 +278,68 @@ VARIABLE_OFFSET = 16*8
mov %r13, %r12
shr $4, %r12
and $7, %r12
- jz _initial_num_blocks_is_0\@
+ jz .L_initial_num_blocks_is_0\@
cmp $7, %r12
- je _initial_num_blocks_is_7\@
+ je .L_initial_num_blocks_is_7\@
cmp $6, %r12
- je _initial_num_blocks_is_6\@
+ je .L_initial_num_blocks_is_6\@
cmp $5, %r12
- je _initial_num_blocks_is_5\@
+ je .L_initial_num_blocks_is_5\@
cmp $4, %r12
- je _initial_num_blocks_is_4\@
+ je .L_initial_num_blocks_is_4\@
cmp $3, %r12
- je _initial_num_blocks_is_3\@
+ je .L_initial_num_blocks_is_3\@
cmp $2, %r12
- je _initial_num_blocks_is_2\@
+ je .L_initial_num_blocks_is_2\@
- jmp _initial_num_blocks_is_1\@
+ jmp .L_initial_num_blocks_is_1\@
-_initial_num_blocks_is_7\@:
+.L_initial_num_blocks_is_7\@:
\INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*7, %r13
- jmp _initial_blocks_encrypted\@
+ jmp .L_initial_blocks_encrypted\@
-_initial_num_blocks_is_6\@:
+.L_initial_num_blocks_is_6\@:
\INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*6, %r13
- jmp _initial_blocks_encrypted\@
+ jmp .L_initial_blocks_encrypted\@
-_initial_num_blocks_is_5\@:
+.L_initial_num_blocks_is_5\@:
\INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*5, %r13
- jmp _initial_blocks_encrypted\@
+ jmp .L_initial_blocks_encrypted\@
-_initial_num_blocks_is_4\@:
+.L_initial_num_blocks_is_4\@:
\INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*4, %r13
- jmp _initial_blocks_encrypted\@
+ jmp .L_initial_blocks_encrypted\@
-_initial_num_blocks_is_3\@:
+.L_initial_num_blocks_is_3\@:
\INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*3, %r13
- jmp _initial_blocks_encrypted\@
+ jmp .L_initial_blocks_encrypted\@
-_initial_num_blocks_is_2\@:
+.L_initial_num_blocks_is_2\@:
\INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*2, %r13
- jmp _initial_blocks_encrypted\@
+ jmp .L_initial_blocks_encrypted\@
-_initial_num_blocks_is_1\@:
+.L_initial_num_blocks_is_1\@:
\INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*1, %r13
- jmp _initial_blocks_encrypted\@
+ jmp .L_initial_blocks_encrypted\@
-_initial_num_blocks_is_0\@:
+.L_initial_num_blocks_is_0\@:
\INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-_initial_blocks_encrypted\@:
+.L_initial_blocks_encrypted\@:
test %r13, %r13
- je _zero_cipher_left\@
+ je .L_zero_cipher_left\@
sub $128, %r13
- je _eight_cipher_left\@
+ je .L_eight_cipher_left\@
@@ -373,9 +349,9 @@ _initial_blocks_encrypted\@:
vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-_encrypt_by_8_new\@:
+.L_encrypt_by_8_new\@:
cmp $(255-8), %r15d
- jg _encrypt_by_8\@
+ jg .L_encrypt_by_8\@
@@ -383,30 +359,30 @@ _encrypt_by_8_new\@:
\GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
add $128, %r11
sub $128, %r13
- jne _encrypt_by_8_new\@
+ jne .L_encrypt_by_8_new\@
vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- jmp _eight_cipher_left\@
+ jmp .L_eight_cipher_left\@
-_encrypt_by_8\@:
+.L_encrypt_by_8\@:
vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
add $8, %r15b
\GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
add $128, %r11
sub $128, %r13
- jne _encrypt_by_8_new\@
+ jne .L_encrypt_by_8_new\@
vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-_eight_cipher_left\@:
+.L_eight_cipher_left\@:
\GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
-_zero_cipher_left\@:
+.L_zero_cipher_left\@:
vmovdqu %xmm14, AadHash(arg2)
vmovdqu %xmm9, CurCount(arg2)
@@ -414,7 +390,7 @@ _zero_cipher_left\@:
mov arg5, %r13
and $15, %r13 # r13 = (arg5 mod 16)
- je _multiple_of_16_bytes\@
+ je .L_multiple_of_16_bytes\@
# handle the last <16 Byte block separately
@@ -428,7 +404,7 @@ _zero_cipher_left\@:
vmovdqu %xmm9, PBlockEncKey(arg2)
cmp $16, arg5
- jge _large_enough_update\@
+ jge .L_large_enough_update\@
lea (arg4,%r11,1), %r10
mov %r13, %r12
@@ -440,9 +416,9 @@ _zero_cipher_left\@:
# able to shift 16-r13 bytes (r13 is the
# number of bytes in plaintext mod 16)
- jmp _final_ghash_mul\@
+ jmp .L_final_ghash_mul\@
-_large_enough_update\@:
+.L_large_enough_update\@:
sub $16, %r11
add %r13, %r11
@@ -461,7 +437,7 @@ _large_enough_update\@:
# shift right 16-r13 bytes
vpshufb %xmm2, %xmm1, %xmm1
-_final_ghash_mul\@:
+.L_final_ghash_mul\@:
.if \ENC_DEC == DEC
vmovdqa %xmm1, %xmm2
vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
@@ -490,7 +466,7 @@ _final_ghash_mul\@:
# output r13 Bytes
vmovq %xmm9, %rax
cmp $8, %r13
- jle _less_than_8_bytes_left\@
+ jle .L_less_than_8_bytes_left\@
mov %rax, (arg3 , %r11)
add $8, %r11
@@ -498,15 +474,15 @@ _final_ghash_mul\@:
vmovq %xmm9, %rax
sub $8, %r13
-_less_than_8_bytes_left\@:
+.L_less_than_8_bytes_left\@:
movb %al, (arg3 , %r11)
add $1, %r11
shr $8, %rax
sub $1, %r13
- jne _less_than_8_bytes_left\@
+ jne .L_less_than_8_bytes_left\@
#############################
-_multiple_of_16_bytes\@:
+.L_multiple_of_16_bytes\@:
.endm
@@ -519,12 +495,12 @@ _multiple_of_16_bytes\@:
mov PBlockLen(arg2), %r12
test %r12, %r12
- je _partial_done\@
+ je .L_partial_done\@
#GHASH computation for the last <16 Byte block
\GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
-_partial_done\@:
+.L_partial_done\@:
mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes)
shl $3, %r12 # convert into number of bits
vmovd %r12d, %xmm15 # len(A) in xmm15
@@ -547,49 +523,49 @@ _partial_done\@:
-_return_T\@:
+.L_return_T\@:
mov \AUTH_TAG, %r10 # r10 = authTag
mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len
cmp $16, %r11
- je _T_16\@
+ je .L_T_16\@
cmp $8, %r11
- jl _T_4\@
+ jl .L_T_4\@
-_T_8\@:
+.L_T_8\@:
vmovq %xmm9, %rax
mov %rax, (%r10)
add $8, %r10
sub $8, %r11
vpsrldq $8, %xmm9, %xmm9
test %r11, %r11
- je _return_T_done\@
-_T_4\@:
+ je .L_return_T_done\@
+.L_T_4\@:
vmovd %xmm9, %eax
mov %eax, (%r10)
add $4, %r10
sub $4, %r11
vpsrldq $4, %xmm9, %xmm9
test %r11, %r11
- je _return_T_done\@
-_T_123\@:
+ je .L_return_T_done\@
+.L_T_123\@:
vmovd %xmm9, %eax
cmp $2, %r11
- jl _T_1\@
+ jl .L_T_1\@
mov %ax, (%r10)
cmp $2, %r11
- je _return_T_done\@
+ je .L_return_T_done\@
add $2, %r10
sar $16, %eax
-_T_1\@:
+.L_T_1\@:
mov %al, (%r10)
- jmp _return_T_done\@
+ jmp .L_return_T_done\@
-_T_16\@:
+.L_T_16\@:
vmovdqu %xmm9, (%r10)
-_return_T_done\@:
+.L_return_T_done\@:
.endm
.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
@@ -603,8 +579,8 @@ _return_T_done\@:
vpxor \T8, \T8, \T8
vpxor \T7, \T7, \T7
cmp $16, %r11
- jl _get_AAD_rest8\@
-_get_AAD_blocks\@:
+ jl .L_get_AAD_rest8\@
+.L_get_AAD_blocks\@:
vmovdqu (%r10), \T7
vpshufb SHUF_MASK(%rip), \T7, \T7
vpxor \T7, \T8, \T8
@@ -613,29 +589,29 @@ _get_AAD_blocks\@:
sub $16, %r12
sub $16, %r11
cmp $16, %r11
- jge _get_AAD_blocks\@
+ jge .L_get_AAD_blocks\@
vmovdqu \T8, \T7
test %r11, %r11
- je _get_AAD_done\@
+ je .L_get_AAD_done\@
vpxor \T7, \T7, \T7
/* read the last <16B of AAD. since we have at least 4B of
data right after the AAD (the ICV, and maybe some CT), we can
read 4B/8B blocks safely, and then get rid of the extra stuff */
-_get_AAD_rest8\@:
+.L_get_AAD_rest8\@:
cmp $4, %r11
- jle _get_AAD_rest4\@
+ jle .L_get_AAD_rest4\@
movq (%r10), \T1
add $8, %r10
sub $8, %r11
vpslldq $8, \T1, \T1
vpsrldq $8, \T7, \T7
vpxor \T1, \T7, \T7
- jmp _get_AAD_rest8\@
-_get_AAD_rest4\@:
+ jmp .L_get_AAD_rest8\@
+.L_get_AAD_rest4\@:
test %r11, %r11
- jle _get_AAD_rest0\@
+ jle .L_get_AAD_rest0\@
mov (%r10), %eax
movq %rax, \T1
add $4, %r10
@@ -643,20 +619,22 @@ _get_AAD_rest4\@:
vpslldq $12, \T1, \T1
vpsrldq $4, \T7, \T7
vpxor \T1, \T7, \T7
-_get_AAD_rest0\@:
+.L_get_AAD_rest0\@:
/* finalize: shift out the extra bytes we read, and align
left. since pslldq can only shift by an immediate, we use
- vpshufb and an array of shuffle masks */
- movq %r12, %r11
- salq $4, %r11
- vmovdqu aad_shift_arr(%r11), \T1
- vpshufb \T1, \T7, \T7
-_get_AAD_rest_final\@:
+ vpshufb and a pair of shuffle masks */
+ leaq ALL_F(%rip), %r11
+ subq %r12, %r11
+ vmovdqu 16(%r11), \T1
+ andq $~3, %r11
+ vpshufb (%r11), \T7, \T7
+ vpand \T1, \T7, \T7
+.L_get_AAD_rest_final\@:
vpshufb SHUF_MASK(%rip), \T7, \T7
vpxor \T8, \T7, \T7
\GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6
-_get_AAD_done\@:
+.L_get_AAD_done\@:
vmovdqu \T7, AadHash(arg2)
.endm
@@ -707,28 +685,28 @@ _get_AAD_done\@:
vpxor \XMMDst, \XMMDst, \XMMDst
cmp $8, \DLEN
- jl _read_lt8_\@
+ jl .L_read_lt8_\@
mov (\DPTR), %rax
vpinsrq $0, %rax, \XMMDst, \XMMDst
sub $8, \DLEN
- jz _done_read_partial_block_\@
+ jz .L_done_read_partial_block_\@
xor %eax, %eax
-_read_next_byte_\@:
+.L_read_next_byte_\@:
shl $8, %rax
mov 7(\DPTR, \DLEN, 1), %al
dec \DLEN
- jnz _read_next_byte_\@
+ jnz .L_read_next_byte_\@
vpinsrq $1, %rax, \XMMDst, \XMMDst
- jmp _done_read_partial_block_\@
-_read_lt8_\@:
+ jmp .L_done_read_partial_block_\@
+.L_read_lt8_\@:
xor %eax, %eax
-_read_next_byte_lt8_\@:
+.L_read_next_byte_lt8_\@:
shl $8, %rax
mov -1(\DPTR, \DLEN, 1), %al
dec \DLEN
- jnz _read_next_byte_lt8_\@
+ jnz .L_read_next_byte_lt8_\@
vpinsrq $0, %rax, \XMMDst, \XMMDst
-_done_read_partial_block_\@:
+.L_done_read_partial_block_\@:
.endm
# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
@@ -740,21 +718,21 @@ _done_read_partial_block_\@:
AAD_HASH ENC_DEC
mov PBlockLen(arg2), %r13
test %r13, %r13
- je _partial_block_done_\@ # Leave Macro if no partial blocks
+ je .L_partial_block_done_\@ # Leave Macro if no partial blocks
# Read in input data without over reading
cmp $16, \PLAIN_CYPH_LEN
- jl _fewer_than_16_bytes_\@
+ jl .L_fewer_than_16_bytes_\@
vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
- jmp _data_read_\@
+ jmp .L_data_read_\@
-_fewer_than_16_bytes_\@:
+.L_fewer_than_16_bytes_\@:
lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
mov \PLAIN_CYPH_LEN, %r12
READ_PARTIAL_BLOCK %r10 %r12 %xmm1
mov PBlockLen(arg2), %r13
-_data_read_\@: # Finished reading in data
+.L_data_read_\@: # Finished reading in data
vmovdqu PBlockEncKey(arg2), %xmm9
vmovdqu HashKey(arg2), %xmm13
@@ -777,9 +755,9 @@ _data_read_\@: # Finished reading in data
sub $16, %r10
# Determine if if partial block is not being filled and
# shift mask accordingly
- jge _no_extra_mask_1_\@
+ jge .L_no_extra_mask_1_\@
sub %r10, %r12
-_no_extra_mask_1_\@:
+.L_no_extra_mask_1_\@:
vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
# get the appropriate mask to mask out bottom r13 bytes of xmm9
@@ -792,17 +770,17 @@ _no_extra_mask_1_\@:
vpxor %xmm3, \AAD_HASH, \AAD_HASH
test %r10, %r10
- jl _partial_incomplete_1_\@
+ jl .L_partial_incomplete_1_\@
# GHASH computation for the last <16 Byte block
\GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
xor %eax,%eax
mov %rax, PBlockLen(arg2)
- jmp _dec_done_\@
-_partial_incomplete_1_\@:
+ jmp .L_dec_done_\@
+.L_partial_incomplete_1_\@:
add \PLAIN_CYPH_LEN, PBlockLen(arg2)
-_dec_done_\@:
+.L_dec_done_\@:
vmovdqu \AAD_HASH, AadHash(arg2)
.else
vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
@@ -813,9 +791,9 @@ _dec_done_\@:
sub $16, %r10
# Determine if if partial block is not being filled and
# shift mask accordingly
- jge _no_extra_mask_2_\@
+ jge .L_no_extra_mask_2_\@
sub %r10, %r12
-_no_extra_mask_2_\@:
+.L_no_extra_mask_2_\@:
vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
# get the appropriate mask to mask out bottom r13 bytes of xmm9
@@ -827,17 +805,17 @@ _no_extra_mask_2_\@:
vpxor %xmm9, \AAD_HASH, \AAD_HASH
test %r10, %r10
- jl _partial_incomplete_2_\@
+ jl .L_partial_incomplete_2_\@
# GHASH computation for the last <16 Byte block
\GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
xor %eax,%eax
mov %rax, PBlockLen(arg2)
- jmp _encode_done_\@
-_partial_incomplete_2_\@:
+ jmp .L_encode_done_\@
+.L_partial_incomplete_2_\@:
add \PLAIN_CYPH_LEN, PBlockLen(arg2)
-_encode_done_\@:
+.L_encode_done_\@:
vmovdqu \AAD_HASH, AadHash(arg2)
vmovdqa SHUF_MASK(%rip), %xmm10
@@ -847,32 +825,32 @@ _encode_done_\@:
.endif
# output encrypted Bytes
test %r10, %r10
- jl _partial_fill_\@
+ jl .L_partial_fill_\@
mov %r13, %r12
mov $16, %r13
# Set r13 to be the number of bytes to write out
sub %r12, %r13
- jmp _count_set_\@
-_partial_fill_\@:
+ jmp .L_count_set_\@
+.L_partial_fill_\@:
mov \PLAIN_CYPH_LEN, %r13
-_count_set_\@:
+.L_count_set_\@:
vmovdqa %xmm9, %xmm0
vmovq %xmm0, %rax
cmp $8, %r13
- jle _less_than_8_bytes_left_\@
+ jle .L_less_than_8_bytes_left_\@
mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
add $8, \DATA_OFFSET
psrldq $8, %xmm0
vmovq %xmm0, %rax
sub $8, %r13
-_less_than_8_bytes_left_\@:
+.L_less_than_8_bytes_left_\@:
movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
add $1, \DATA_OFFSET
shr $8, %rax
sub $1, %r13
- jne _less_than_8_bytes_left_\@
-_partial_block_done_\@:
+ jne .L_less_than_8_bytes_left_\@
+.L_partial_block_done_\@:
.endm # PARTIAL_BLOCK
###############################################################################
@@ -1073,7 +1051,7 @@ _partial_block_done_\@:
vmovdqa \XMM8, \T3
cmp $128, %r13
- jl _initial_blocks_done\@ # no need for precomputed constants
+ jl .L_initial_blocks_done\@ # no need for precomputed constants
###############################################################################
# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
@@ -1215,7 +1193,7 @@ _partial_block_done_\@:
###############################################################################
-_initial_blocks_done\@:
+.L_initial_blocks_done\@:
.endm
@@ -2023,7 +2001,7 @@ SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
vmovdqa \XMM8, \T3
cmp $128, %r13
- jl _initial_blocks_done\@ # no need for precomputed constants
+ jl .L_initial_blocks_done\@ # no need for precomputed constants
###############################################################################
# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
@@ -2167,7 +2145,7 @@ SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
###############################################################################
-_initial_blocks_done\@:
+.L_initial_blocks_done\@:
.endm
diff --git a/arch/x86/crypto/aria-aesni-avx-asm_64.S b/arch/x86/crypto/aria-aesni-avx-asm_64.S
index 9243f6289d34..7c1abc513f34 100644
--- a/arch/x86/crypto/aria-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/aria-aesni-avx-asm_64.S
@@ -80,7 +80,7 @@
transpose_4x4(c0, c1, c2, c3, a0, a1); \
transpose_4x4(d0, d1, d2, d3, a0, a1); \
\
- vmovdqu .Lshufb_16x16b, a0; \
+ vmovdqu .Lshufb_16x16b(%rip), a0; \
vmovdqu st1, a1; \
vpshufb a0, a2, a2; \
vpshufb a0, a3, a3; \
@@ -132,7 +132,7 @@
transpose_4x4(c0, c1, c2, c3, a0, a1); \
transpose_4x4(d0, d1, d2, d3, a0, a1); \
\
- vmovdqu .Lshufb_16x16b, a0; \
+ vmovdqu .Lshufb_16x16b(%rip), a0; \
vmovdqu st1, a1; \
vpshufb a0, a2, a2; \
vpshufb a0, a3, a3; \
@@ -300,11 +300,11 @@
x4, x5, x6, x7, \
t0, t1, t2, t3, \
t4, t5, t6, t7) \
- vmovdqa .Ltf_s2_bitmatrix, t0; \
- vmovdqa .Ltf_inv_bitmatrix, t1; \
- vmovdqa .Ltf_id_bitmatrix, t2; \
- vmovdqa .Ltf_aff_bitmatrix, t3; \
- vmovdqa .Ltf_x2_bitmatrix, t4; \
+ vmovdqa .Ltf_s2_bitmatrix(%rip), t0; \
+ vmovdqa .Ltf_inv_bitmatrix(%rip), t1; \
+ vmovdqa .Ltf_id_bitmatrix(%rip), t2; \
+ vmovdqa .Ltf_aff_bitmatrix(%rip), t3; \
+ vmovdqa .Ltf_x2_bitmatrix(%rip), t4; \
vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
@@ -324,13 +324,13 @@
x4, x5, x6, x7, \
t0, t1, t2, t3, \
t4, t5, t6, t7) \
- vmovdqa .Linv_shift_row, t0; \
- vmovdqa .Lshift_row, t1; \
- vbroadcastss .L0f0f0f0f, t6; \
- vmovdqa .Ltf_lo__inv_aff__and__s2, t2; \
- vmovdqa .Ltf_hi__inv_aff__and__s2, t3; \
- vmovdqa .Ltf_lo__x2__and__fwd_aff, t4; \
- vmovdqa .Ltf_hi__x2__and__fwd_aff, t5; \
+ vmovdqa .Linv_shift_row(%rip), t0; \
+ vmovdqa .Lshift_row(%rip), t1; \
+ vbroadcastss .L0f0f0f0f(%rip), t6; \
+ vmovdqa .Ltf_lo__inv_aff__and__s2(%rip), t2; \
+ vmovdqa .Ltf_hi__inv_aff__and__s2(%rip), t3; \
+ vmovdqa .Ltf_lo__x2__and__fwd_aff(%rip), t4; \
+ vmovdqa .Ltf_hi__x2__and__fwd_aff(%rip), t5; \
\
vaesenclast t7, x0, x0; \
vaesenclast t7, x4, x4; \
diff --git a/arch/x86/crypto/aria-aesni-avx2-asm_64.S b/arch/x86/crypto/aria-aesni-avx2-asm_64.S
index 82a14b4ad920..c60fa2980630 100644
--- a/arch/x86/crypto/aria-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/aria-aesni-avx2-asm_64.S
@@ -96,7 +96,7 @@
transpose_4x4(c0, c1, c2, c3, a0, a1); \
transpose_4x4(d0, d1, d2, d3, a0, a1); \
\
- vbroadcasti128 .Lshufb_16x16b, a0; \
+ vbroadcasti128 .Lshufb_16x16b(%rip), a0; \
vmovdqu st1, a1; \
vpshufb a0, a2, a2; \
vpshufb a0, a3, a3; \
@@ -148,7 +148,7 @@
transpose_4x4(c0, c1, c2, c3, a0, a1); \
transpose_4x4(d0, d1, d2, d3, a0, a1); \
\
- vbroadcasti128 .Lshufb_16x16b, a0; \
+ vbroadcasti128 .Lshufb_16x16b(%rip), a0; \
vmovdqu st1, a1; \
vpshufb a0, a2, a2; \
vpshufb a0, a3, a3; \
@@ -307,11 +307,11 @@
x4, x5, x6, x7, \
t0, t1, t2, t3, \
t4, t5, t6, t7) \
- vpbroadcastq .Ltf_s2_bitmatrix, t0; \
- vpbroadcastq .Ltf_inv_bitmatrix, t1; \
- vpbroadcastq .Ltf_id_bitmatrix, t2; \
- vpbroadcastq .Ltf_aff_bitmatrix, t3; \
- vpbroadcastq .Ltf_x2_bitmatrix, t4; \
+ vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0; \
+ vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1; \
+ vpbroadcastq .Ltf_id_bitmatrix(%rip), t2; \
+ vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3; \
+ vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4; \
vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
@@ -332,12 +332,12 @@
t4, t5, t6, t7) \
vpxor t7, t7, t7; \
vpxor t6, t6, t6; \
- vbroadcasti128 .Linv_shift_row, t0; \
- vbroadcasti128 .Lshift_row, t1; \
- vbroadcasti128 .Ltf_lo__inv_aff__and__s2, t2; \
- vbroadcasti128 .Ltf_hi__inv_aff__and__s2, t3; \
- vbroadcasti128 .Ltf_lo__x2__and__fwd_aff, t4; \
- vbroadcasti128 .Ltf_hi__x2__and__fwd_aff, t5; \
+ vbroadcasti128 .Linv_shift_row(%rip), t0; \
+ vbroadcasti128 .Lshift_row(%rip), t1; \
+ vbroadcasti128 .Ltf_lo__inv_aff__and__s2(%rip), t2; \
+ vbroadcasti128 .Ltf_hi__inv_aff__and__s2(%rip), t3; \
+ vbroadcasti128 .Ltf_lo__x2__and__fwd_aff(%rip), t4; \
+ vbroadcasti128 .Ltf_hi__x2__and__fwd_aff(%rip), t5; \
\
vextracti128 $1, x0, t6##_x; \
vaesenclast t7##_x, x0##_x, x0##_x; \
@@ -369,7 +369,7 @@
vaesdeclast t7##_x, t6##_x, t6##_x; \
vinserti128 $1, t6##_x, x6, x6; \
\
- vpbroadcastd .L0f0f0f0f, t6; \
+ vpbroadcastd .L0f0f0f0f(%rip), t6; \
\
/* AES inverse shift rows */ \
vpshufb t0, x0, x0; \
diff --git a/arch/x86/crypto/aria-gfni-avx512-asm_64.S b/arch/x86/crypto/aria-gfni-avx512-asm_64.S
index 3193f0701450..860887e5d02e 100644
--- a/arch/x86/crypto/aria-gfni-avx512-asm_64.S
+++ b/arch/x86/crypto/aria-gfni-avx512-asm_64.S
@@ -80,7 +80,7 @@
transpose_4x4(c0, c1, c2, c3, a0, a1); \
transpose_4x4(d0, d1, d2, d3, a0, a1); \
\
- vbroadcasti64x2 .Lshufb_16x16b, a0; \
+ vbroadcasti64x2 .Lshufb_16x16b(%rip), a0; \
vmovdqu64 st1, a1; \
vpshufb a0, a2, a2; \
vpshufb a0, a3, a3; \
@@ -132,7 +132,7 @@
transpose_4x4(c0, c1, c2, c3, a0, a1); \
transpose_4x4(d0, d1, d2, d3, a0, a1); \
\
- vbroadcasti64x2 .Lshufb_16x16b, a0; \
+ vbroadcasti64x2 .Lshufb_16x16b(%rip), a0; \
vmovdqu64 st1, a1; \
vpshufb a0, a2, a2; \
vpshufb a0, a3, a3; \
@@ -308,11 +308,11 @@
x4, x5, x6, x7, \
t0, t1, t2, t3, \
t4, t5, t6, t7) \
- vpbroadcastq .Ltf_s2_bitmatrix, t0; \
- vpbroadcastq .Ltf_inv_bitmatrix, t1; \
- vpbroadcastq .Ltf_id_bitmatrix, t2; \
- vpbroadcastq .Ltf_aff_bitmatrix, t3; \
- vpbroadcastq .Ltf_x2_bitmatrix, t4; \
+ vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0; \
+ vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1; \
+ vpbroadcastq .Ltf_id_bitmatrix(%rip), t2; \
+ vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3; \
+ vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4; \
vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
@@ -332,11 +332,11 @@
y4, y5, y6, y7, \
t0, t1, t2, t3, \
t4, t5, t6, t7) \
- vpbroadcastq .Ltf_s2_bitmatrix, t0; \
- vpbroadcastq .Ltf_inv_bitmatrix, t1; \
- vpbroadcastq .Ltf_id_bitmatrix, t2; \
- vpbroadcastq .Ltf_aff_bitmatrix, t3; \
- vpbroadcastq .Ltf_x2_bitmatrix, t4; \
+ vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0; \
+ vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1; \
+ vpbroadcastq .Ltf_id_bitmatrix(%rip), t2; \
+ vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3; \
+ vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4; \
vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
index 4a30618281ec..646477a13e11 100644
--- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -52,10 +52,10 @@
/* \
* S-function with AES subbytes \
*/ \
- vmovdqa .Linv_shift_row, t4; \
- vbroadcastss .L0f0f0f0f, t7; \
- vmovdqa .Lpre_tf_lo_s1, t0; \
- vmovdqa .Lpre_tf_hi_s1, t1; \
+ vmovdqa .Linv_shift_row(%rip), t4; \
+ vbroadcastss .L0f0f0f0f(%rip), t7; \
+ vmovdqa .Lpre_tf_lo_s1(%rip), t0; \
+ vmovdqa .Lpre_tf_hi_s1(%rip), t1; \
\
/* AES inverse shift rows */ \
vpshufb t4, x0, x0; \
@@ -68,8 +68,8 @@
vpshufb t4, x6, x6; \
\
/* prefilter sboxes 1, 2 and 3 */ \
- vmovdqa .Lpre_tf_lo_s4, t2; \
- vmovdqa .Lpre_tf_hi_s4, t3; \
+ vmovdqa .Lpre_tf_lo_s4(%rip), t2; \
+ vmovdqa .Lpre_tf_hi_s4(%rip), t3; \
filter_8bit(x0, t0, t1, t7, t6); \
filter_8bit(x7, t0, t1, t7, t6); \
filter_8bit(x1, t0, t1, t7, t6); \
@@ -83,8 +83,8 @@
filter_8bit(x6, t2, t3, t7, t6); \
\
/* AES subbytes + AES shift rows */ \
- vmovdqa .Lpost_tf_lo_s1, t0; \
- vmovdqa .Lpost_tf_hi_s1, t1; \
+ vmovdqa .Lpost_tf_lo_s1(%rip), t0; \
+ vmovdqa .Lpost_tf_hi_s1(%rip), t1; \
vaesenclast t4, x0, x0; \
vaesenclast t4, x7, x7; \
vaesenclast t4, x1, x1; \
@@ -95,16 +95,16 @@
vaesenclast t4, x6, x6; \
\
/* postfilter sboxes 1 and 4 */ \
- vmovdqa .Lpost_tf_lo_s3, t2; \
- vmovdqa .Lpost_tf_hi_s3, t3; \
+ vmovdqa .Lpost_tf_lo_s3(%rip), t2; \
+ vmovdqa .Lpost_tf_hi_s3(%rip), t3; \
filter_8bit(x0, t0, t1, t7, t6); \
filter_8bit(x7, t0, t1, t7, t6); \
filter_8bit(x3, t0, t1, t7, t6); \
filter_8bit(x6, t0, t1, t7, t6); \
\
/* postfilter sbox 3 */ \
- vmovdqa .Lpost_tf_lo_s2, t4; \
- vmovdqa .Lpost_tf_hi_s2, t5; \
+ vmovdqa .Lpost_tf_lo_s2(%rip), t4; \
+ vmovdqa .Lpost_tf_hi_s2(%rip), t5; \
filter_8bit(x2, t2, t3, t7, t6); \
filter_8bit(x5, t2, t3, t7, t6); \
\
@@ -443,7 +443,7 @@ SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
transpose_4x4(c0, c1, c2, c3, a0, a1); \
transpose_4x4(d0, d1, d2, d3, a0, a1); \
\
- vmovdqu .Lshufb_16x16b, a0; \
+ vmovdqu .Lshufb_16x16b(%rip), a0; \
vmovdqu st1, a1; \
vpshufb a0, a2, a2; \
vpshufb a0, a3, a3; \
@@ -482,7 +482,7 @@ SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
y6, y7, rio, key) \
vmovq key, x0; \
- vpshufb .Lpack_bswap, x0, x0; \
+ vpshufb .Lpack_bswap(%rip), x0, x0; \
\
vpxor 0 * 16(rio), x0, y7; \
vpxor 1 * 16(rio), x0, y6; \
@@ -533,7 +533,7 @@ SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
vmovdqu x0, stack_tmp0; \
\
vmovq key, x0; \
- vpshufb .Lpack_bswap, x0, x0; \
+ vpshufb .Lpack_bswap(%rip), x0, x0; \
\
vpxor x0, y7, y7; \
vpxor x0, y6, y6; \
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
index deaf62aa73a6..a0eb94e53b1b 100644
--- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
@@ -64,12 +64,12 @@
/* \
* S-function with AES subbytes \
*/ \
- vbroadcasti128 .Linv_shift_row, t4; \
- vpbroadcastd .L0f0f0f0f, t7; \
- vbroadcasti128 .Lpre_tf_lo_s1, t5; \
- vbroadcasti128 .Lpre_tf_hi_s1, t6; \
- vbroadcasti128 .Lpre_tf_lo_s4, t2; \
- vbroadcasti128 .Lpre_tf_hi_s4, t3; \
+ vbroadcasti128 .Linv_shift_row(%rip), t4; \
+ vpbroadcastd .L0f0f0f0f(%rip), t7; \
+ vbroadcasti128 .Lpre_tf_lo_s1(%rip), t5; \
+ vbroadcasti128 .Lpre_tf_hi_s1(%rip), t6; \
+ vbroadcasti128 .Lpre_tf_lo_s4(%rip), t2; \
+ vbroadcasti128 .Lpre_tf_hi_s4(%rip), t3; \
\
/* AES inverse shift rows */ \
vpshufb t4, x0, x0; \
@@ -115,8 +115,8 @@
vinserti128 $1, t2##_x, x6, x6; \
vextracti128 $1, x1, t3##_x; \
vextracti128 $1, x4, t2##_x; \
- vbroadcasti128 .Lpost_tf_lo_s1, t0; \
- vbroadcasti128 .Lpost_tf_hi_s1, t1; \
+ vbroadcasti128 .Lpost_tf_lo_s1(%rip), t0; \
+ vbroadcasti128 .Lpost_tf_hi_s1(%rip), t1; \
vaesenclast t4##_x, x2##_x, x2##_x; \
vaesenclast t4##_x, t6##_x, t6##_x; \
vinserti128 $1, t6##_x, x2, x2; \
@@ -131,16 +131,16 @@
vinserti128 $1, t2##_x, x4, x4; \
\
/* postfilter sboxes 1 and 4 */ \
- vbroadcasti128 .Lpost_tf_lo_s3, t2; \
- vbroadcasti128 .Lpost_tf_hi_s3, t3; \
+ vbroadcasti128 .Lpost_tf_lo_s3(%rip), t2; \
+ vbroadcasti128 .Lpost_tf_hi_s3(%rip), t3; \
filter_8bit(x0, t0, t1, t7, t6); \
filter_8bit(x7, t0, t1, t7, t6); \
filter_8bit(x3, t0, t1, t7, t6); \
filter_8bit(x6, t0, t1, t7, t6); \
\
/* postfilter sbox 3 */ \
- vbroadcasti128 .Lpost_tf_lo_s2, t4; \
- vbroadcasti128 .Lpost_tf_hi_s2, t5; \
+ vbroadcasti128 .Lpost_tf_lo_s2(%rip), t4; \
+ vbroadcasti128 .Lpost_tf_hi_s2(%rip), t5; \
filter_8bit(x2, t2, t3, t7, t6); \
filter_8bit(x5, t2, t3, t7, t6); \
\
@@ -475,7 +475,7 @@ SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
transpose_4x4(c0, c1, c2, c3, a0, a1); \
transpose_4x4(d0, d1, d2, d3, a0, a1); \
\
- vbroadcasti128 .Lshufb_16x16b, a0; \
+ vbroadcasti128 .Lshufb_16x16b(%rip), a0; \
vmovdqu st1, a1; \
vpshufb a0, a2, a2; \
vpshufb a0, a3, a3; \
@@ -514,7 +514,7 @@ SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
y6, y7, rio, key) \
vpbroadcastq key, x0; \
- vpshufb .Lpack_bswap, x0, x0; \
+ vpshufb .Lpack_bswap(%rip), x0, x0; \
\
vpxor 0 * 32(rio), x0, y7; \
vpxor 1 * 32(rio), x0, y6; \
@@ -565,7 +565,7 @@ SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
vmovdqu x0, stack_tmp0; \
\
vpbroadcastq key, x0; \
- vpshufb .Lpack_bswap, x0, x0; \
+ vpshufb .Lpack_bswap(%rip), x0, x0; \
\
vpxor x0, y7, y7; \
vpxor x0, y6, y6; \
diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S
index 347c059f5940..816b6bb8bded 100644
--- a/arch/x86/crypto/camellia-x86_64-asm_64.S
+++ b/arch/x86/crypto/camellia-x86_64-asm_64.S
@@ -77,11 +77,13 @@
#define RXORbl %r9b
#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
+ leaq T0(%rip), tmp1; \
movzbl ab ## bl, tmp2 ## d; \
+ xorq (tmp1, tmp2, 8), dst; \
+ leaq T1(%rip), tmp2; \
movzbl ab ## bh, tmp1 ## d; \
rorq $16, ab; \
- xorq T0(, tmp2, 8), dst; \
- xorq T1(, tmp1, 8), dst;
+ xorq (tmp2, tmp1, 8), dst;
/**********************************************************************
1-way camellia
diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
index 0326a01503c3..b4e460a87f18 100644
--- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -84,15 +84,19 @@
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
movzbl src ## bh, RID1d; \
+ leaq s1(%rip), RID2; \
+ movl (RID2,RID1,4), dst ## d; \
movzbl src ## bl, RID2d; \
+ leaq s2(%rip), RID1; \
+ op1 (RID1,RID2,4), dst ## d; \
shrq $16, src; \
- movl s1(, RID1, 4), dst ## d; \
- op1 s2(, RID2, 4), dst ## d; \
movzbl src ## bh, RID1d; \
+ leaq s3(%rip), RID2; \
+ op2 (RID2,RID1,4), dst ## d; \
movzbl src ## bl, RID2d; \
interleave_op(il_reg); \
- op2 s3(, RID1, 4), dst ## d; \
- op3 s4(, RID2, 4), dst ## d;
+ leaq s4(%rip), RID1; \
+ op3 (RID1,RID2,4), dst ## d;
#define dummy(d) /* do nothing */
@@ -151,15 +155,15 @@
subround(l ## 3, r ## 3, l ## 4, r ## 4, f);
#define enc_preload_rkr() \
- vbroadcastss .L16_mask, RKR; \
+ vbroadcastss .L16_mask(%rip), RKR; \
/* add 16-bit rotation to key rotations (mod 32) */ \
vpxor kr(CTX), RKR, RKR;
#define dec_preload_rkr() \
- vbroadcastss .L16_mask, RKR; \
+ vbroadcastss .L16_mask(%rip), RKR; \
/* add 16-bit rotation to key rotations (mod 32) */ \
vpxor kr(CTX), RKR, RKR; \
- vpshufb .Lbswap128_mask, RKR, RKR;
+ vpshufb .Lbswap128_mask(%rip), RKR, RKR;
#define transpose_2x4(x0, x1, t0, t1) \
vpunpckldq x1, x0, t0; \
@@ -235,9 +239,9 @@ SYM_FUNC_START_LOCAL(__cast5_enc_blk16)
movq %rdi, CTX;
- vmovdqa .Lbswap_mask, RKM;
- vmovd .Lfirst_mask, R1ST;
- vmovd .L32_mask, R32;
+ vmovdqa .Lbswap_mask(%rip), RKM;
+ vmovd .Lfirst_mask(%rip), R1ST;
+ vmovd .L32_mask(%rip), R32;
enc_preload_rkr();
inpack_blocks(RL1, RR1, RTMP, RX, RKM);
@@ -271,7 +275,7 @@ SYM_FUNC_START_LOCAL(__cast5_enc_blk16)
popq %rbx;
popq %r15;
- vmovdqa .Lbswap_mask, RKM;
+ vmovdqa .Lbswap_mask(%rip), RKM;
outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
@@ -308,9 +312,9 @@ SYM_FUNC_START_LOCAL(__cast5_dec_blk16)
movq %rdi, CTX;
- vmovdqa .Lbswap_mask, RKM;
- vmovd .Lfirst_mask, R1ST;
- vmovd .L32_mask, R32;
+ vmovdqa .Lbswap_mask(%rip), RKM;
+ vmovd .Lfirst_mask(%rip), R1ST;
+ vmovd .L32_mask(%rip), R32;
dec_preload_rkr();
inpack_blocks(RL1, RR1, RTMP, RX, RKM);
@@ -341,7 +345,7 @@ SYM_FUNC_START_LOCAL(__cast5_dec_blk16)
round(RL, RR, 1, 2);
round(RR, RL, 0, 1);
- vmovdqa .Lbswap_mask, RKM;
+ vmovdqa .Lbswap_mask(%rip), RKM;
popq %rbx;
popq %r15;
@@ -504,8 +508,8 @@ SYM_FUNC_START(cast5_ctr_16way)
vpcmpeqd RKR, RKR, RKR;
vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */
- vmovdqa .Lbswap_iv_mask, R1ST;
- vmovdqa .Lbswap128_mask, RKM;
+ vmovdqa .Lbswap_iv_mask(%rip), R1ST;
+ vmovdqa .Lbswap128_mask(%rip), RKM;
/* load IV and byteswap */
vmovq (%rcx), RX;
diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
index 82b716fd5dba..9e86d460b409 100644
--- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -84,15 +84,19 @@
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
movzbl src ## bh, RID1d; \
+ leaq s1(%rip), RID2; \
+ movl (RID2,RID1,4), dst ## d; \
movzbl src ## bl, RID2d; \
+ leaq s2(%rip), RID1; \
+ op1 (RID1,RID2,4), dst ## d; \
shrq $16, src; \
- movl s1(, RID1, 4), dst ## d; \
- op1 s2(, RID2, 4), dst ## d; \
movzbl src ## bh, RID1d; \
+ leaq s3(%rip), RID2; \
+ op2 (RID2,RID1,4), dst ## d; \
movzbl src ## bl, RID2d; \
interleave_op(il_reg); \
- op2 s3(, RID1, 4), dst ## d; \
- op3 s4(, RID2, 4), dst ## d;
+ leaq s4(%rip), RID1; \
+ op3 (RID1,RID2,4), dst ## d;
#define dummy(d) /* do nothing */
@@ -175,10 +179,10 @@
qop(RD, RC, 1);
#define shuffle(mask) \
- vpshufb mask, RKR, RKR;
+ vpshufb mask(%rip), RKR, RKR;
#define preload_rkr(n, do_mask, mask) \
- vbroadcastss .L16_mask, RKR; \
+ vbroadcastss .L16_mask(%rip), RKR; \
/* add 16-bit rotation to key rotations (mod 32) */ \
vpxor (kr+n*16)(CTX), RKR, RKR; \
do_mask(mask);
@@ -258,9 +262,9 @@ SYM_FUNC_START_LOCAL(__cast6_enc_blk8)
movq %rdi, CTX;
- vmovdqa .Lbswap_mask, RKM;
- vmovd .Lfirst_mask, R1ST;
- vmovd .L32_mask, R32;
+ vmovdqa .Lbswap_mask(%rip), RKM;
+ vmovd .Lfirst_mask(%rip), R1ST;
+ vmovd .L32_mask(%rip), R32;
inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
@@ -284,7 +288,7 @@ SYM_FUNC_START_LOCAL(__cast6_enc_blk8)
popq %rbx;
popq %r15;
- vmovdqa .Lbswap_mask, RKM;
+ vmovdqa .Lbswap_mask(%rip), RKM;
outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
@@ -306,9 +310,9 @@ SYM_FUNC_START_LOCAL(__cast6_dec_blk8)
movq %rdi, CTX;
- vmovdqa .Lbswap_mask, RKM;
- vmovd .Lfirst_mask, R1ST;
- vmovd .L32_mask, R32;
+ vmovdqa .Lbswap_mask(%rip), RKM;
+ vmovd .Lfirst_mask(%rip), R1ST;
+ vmovd .L32_mask(%rip), R32;
inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
@@ -332,7 +336,7 @@ SYM_FUNC_START_LOCAL(__cast6_dec_blk8)
popq %rbx;
popq %r15;
- vmovdqa .Lbswap_mask, RKM;
+ vmovdqa .Lbswap_mask(%rip), RKM;
outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
diff --git a/arch/x86/crypto/crc32-pclmul_asm.S b/arch/x86/crypto/crc32-pclmul_asm.S
index ca53e96996ac..5d31137e2c7d 100644
--- a/arch/x86/crypto/crc32-pclmul_asm.S
+++ b/arch/x86/crypto/crc32-pclmul_asm.S
@@ -90,7 +90,7 @@ SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligne
sub $0x40, LEN
add $0x40, BUF
cmp $0x40, LEN
- jb less_64
+ jb .Lless_64
#ifdef __x86_64__
movdqa .Lconstant_R2R1(%rip), CONSTANT
@@ -98,7 +98,7 @@ SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligne
movdqa .Lconstant_R2R1, CONSTANT
#endif
-loop_64:/* 64 bytes Full cache line folding */
+.Lloop_64:/* 64 bytes Full cache line folding */
prefetchnta 0x40(BUF)
movdqa %xmm1, %xmm5
movdqa %xmm2, %xmm6
@@ -139,8 +139,8 @@ loop_64:/* 64 bytes Full cache line folding */
sub $0x40, LEN
add $0x40, BUF
cmp $0x40, LEN
- jge loop_64
-less_64:/* Folding cache line into 128bit */
+ jge .Lloop_64
+.Lless_64:/* Folding cache line into 128bit */
#ifdef __x86_64__
movdqa .Lconstant_R4R3(%rip), CONSTANT
#else
@@ -167,8 +167,8 @@ less_64:/* Folding cache line into 128bit */
pxor %xmm4, %xmm1
cmp $0x10, LEN
- jb fold_64
-loop_16:/* Folding rest buffer into 128bit */
+ jb .Lfold_64
+.Lloop_16:/* Folding rest buffer into 128bit */
movdqa %xmm1, %xmm5
pclmulqdq $0x00, CONSTANT, %xmm1
pclmulqdq $0x11, CONSTANT, %xmm5
@@ -177,9 +177,9 @@ loop_16:/* Folding rest buffer into 128bit */
sub $0x10, LEN
add $0x10, BUF
cmp $0x10, LEN
- jge loop_16
+ jge .Lloop_16
-fold_64:
+.Lfold_64:
/* perform the last 64 bit fold, also adds 32 zeroes
* to the input stream */
pclmulqdq $0x01, %xmm1, CONSTANT /* R4 * xmm1.low */
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index ec35915f0901..81ce0f4db555 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -49,15 +49,15 @@
## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
.macro LABEL prefix n
-\prefix\n\():
+.L\prefix\n\():
.endm
.macro JMPTBL_ENTRY i
-.quad crc_\i
+.quad .Lcrc_\i
.endm
.macro JNC_LESS_THAN j
- jnc less_than_\j
+ jnc .Lless_than_\j
.endm
# Define threshold where buffers are considered "small" and routed to more
@@ -108,30 +108,30 @@ SYM_FUNC_START(crc_pcl)
neg %bufp
and $7, %bufp # calculate the unalignment amount of
# the address
- je proc_block # Skip if aligned
+ je .Lproc_block # Skip if aligned
## If len is less than 8 and we're unaligned, we need to jump
## to special code to avoid reading beyond the end of the buffer
cmp $8, len
- jae do_align
+ jae .Ldo_align
# less_than_8 expects length in upper 3 bits of len_dw
# less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
shl $32-3+1, len_dw
- jmp less_than_8_post_shl1
+ jmp .Lless_than_8_post_shl1
-do_align:
+.Ldo_align:
#### Calculate CRC of unaligned bytes of the buffer (if any)
movq (bufptmp), tmp # load a quadword from the buffer
add %bufp, bufptmp # align buffer pointer for quadword
# processing
sub %bufp, len # update buffer length
-align_loop:
+.Lalign_loop:
crc32b %bl, crc_init_dw # compute crc32 of 1-byte
shr $8, tmp # get next byte
dec %bufp
- jne align_loop
+ jne .Lalign_loop
-proc_block:
+.Lproc_block:
################################################################
## 2) PROCESS BLOCKS:
@@ -141,11 +141,11 @@ proc_block:
movq len, tmp # save num bytes in tmp
cmpq $128*24, len
- jae full_block
+ jae .Lfull_block
-continue_block:
+.Lcontinue_block:
cmpq $SMALL_SIZE, len
- jb small
+ jb .Lsmall
## len < 128*24
movq $2731, %rax # 2731 = ceil(2^16 / 24)
@@ -168,13 +168,14 @@ continue_block:
xor crc2, crc2
## branch into array
- mov jump_table(,%rax,8), %bufp
+ leaq jump_table(%rip), %bufp
+ mov (%bufp,%rax,8), %bufp
JMP_NOSPEC bufp
################################################################
## 2a) PROCESS FULL BLOCKS:
################################################################
-full_block:
+.Lfull_block:
movl $128,%eax
lea 128*8*2(block_0), block_1
lea 128*8*3(block_0), block_2
@@ -189,7 +190,6 @@ full_block:
## 3) CRC Array:
################################################################
-crc_array:
i=128
.rept 128-1
.altmacro
@@ -242,28 +242,28 @@ LABEL crc_ 0
ENDBR
mov tmp, len
cmp $128*24, tmp
- jae full_block
+ jae .Lfull_block
cmp $24, tmp
- jae continue_block
+ jae .Lcontinue_block
-less_than_24:
+.Lless_than_24:
shl $32-4, len_dw # less_than_16 expects length
# in upper 4 bits of len_dw
- jnc less_than_16
+ jnc .Lless_than_16
crc32q (bufptmp), crc_init
crc32q 8(bufptmp), crc_init
- jz do_return
+ jz .Ldo_return
add $16, bufptmp
# len is less than 8 if we got here
# less_than_8 expects length in upper 3 bits of len_dw
# less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
shl $2, len_dw
- jmp less_than_8_post_shl1
+ jmp .Lless_than_8_post_shl1
#######################################################################
## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full)
#######################################################################
-small:
+.Lsmall:
shl $32-8, len_dw # Prepare len_dw for less_than_256
j=256
.rept 5 # j = {256, 128, 64, 32, 16}
@@ -279,32 +279,32 @@ LABEL less_than_ %j # less_than_j: Length should be in
crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data
i=i+8
.endr
- jz do_return # Return if remaining length is zero
+ jz .Ldo_return # Return if remaining length is zero
add $j, bufptmp # Advance buf
.endr
-less_than_8: # Length should be stored in
+.Lless_than_8: # Length should be stored in
# upper 3 bits of len_dw
shl $1, len_dw
-less_than_8_post_shl1:
- jnc less_than_4
+.Lless_than_8_post_shl1:
+ jnc .Lless_than_4
crc32l (bufptmp), crc_init_dw # CRC of 4 bytes
- jz do_return # return if remaining data is zero
+ jz .Ldo_return # return if remaining data is zero
add $4, bufptmp
-less_than_4: # Length should be stored in
+.Lless_than_4: # Length should be stored in
# upper 2 bits of len_dw
shl $1, len_dw
- jnc less_than_2
+ jnc .Lless_than_2
crc32w (bufptmp), crc_init_dw # CRC of 2 bytes
- jz do_return # return if remaining data is zero
+ jz .Ldo_return # return if remaining data is zero
add $2, bufptmp
-less_than_2: # Length should be stored in the MSB
+.Lless_than_2: # Length should be stored in the MSB
# of len_dw
shl $1, len_dw
- jnc less_than_1
+ jnc .Lless_than_1
crc32b (bufptmp), crc_init_dw # CRC of 1 byte
-less_than_1: # Length should be zero
-do_return:
+.Lless_than_1: # Length should be zero
+.Ldo_return:
movq crc_init, %rax
popq %rsi
popq %rdi
diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S
index f4c760f4cade..cf21b998e77c 100644
--- a/arch/x86/crypto/des3_ede-asm_64.S
+++ b/arch/x86/crypto/des3_ede-asm_64.S
@@ -129,21 +129,29 @@
movzbl RW0bl, RT2d; \
movzbl RW0bh, RT3d; \
shrq $16, RW0; \
- movq s8(, RT0, 8), RT0; \
- xorq s6(, RT1, 8), to; \
+ leaq s8(%rip), RW1; \
+ movq (RW1, RT0, 8), RT0; \
+ leaq s6(%rip), RW1; \
+ xorq (RW1, RT1, 8), to; \
movzbl RW0bl, RL1d; \
movzbl RW0bh, RT1d; \
shrl $16, RW0d; \
- xorq s4(, RT2, 8), RT0; \
- xorq s2(, RT3, 8), to; \
+ leaq s4(%rip), RW1; \
+ xorq (RW1, RT2, 8), RT0; \
+ leaq s2(%rip), RW1; \
+ xorq (RW1, RT3, 8), to; \
movzbl RW0bl, RT2d; \
movzbl RW0bh, RT3d; \
- xorq s7(, RL1, 8), RT0; \
- xorq s5(, RT1, 8), to; \
- xorq s3(, RT2, 8), RT0; \
+ leaq s7(%rip), RW1; \
+ xorq (RW1, RL1, 8), RT0; \
+ leaq s5(%rip), RW1; \
+ xorq (RW1, RT1, 8), to; \
+ leaq s3(%rip), RW1; \
+ xorq (RW1, RT2, 8), RT0; \
load_next_key(n, RW0); \
xorq RT0, to; \
- xorq s1(, RT3, 8), to; \
+ leaq s1(%rip), RW1; \
+ xorq (RW1, RT3, 8), to; \
#define load_next_key(n, RWx) \
movq (((n) + 1) * 8)(CTX), RWx;
@@ -355,65 +363,89 @@ SYM_FUNC_END(des3_ede_x86_64_crypt_blk)
movzbl RW0bl, RT3d; \
movzbl RW0bh, RT1d; \
shrq $16, RW0; \
- xorq s8(, RT3, 8), to##0; \
- xorq s6(, RT1, 8), to##0; \
+ leaq s8(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##0; \
+ leaq s6(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##0; \
movzbl RW0bl, RT3d; \
movzbl RW0bh, RT1d; \
shrq $16, RW0; \
- xorq s4(, RT3, 8), to##0; \
- xorq s2(, RT1, 8), to##0; \
+ leaq s4(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##0; \
+ leaq s2(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##0; \
movzbl RW0bl, RT3d; \
movzbl RW0bh, RT1d; \
shrl $16, RW0d; \
- xorq s7(, RT3, 8), to##0; \
- xorq s5(, RT1, 8), to##0; \
+ leaq s7(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##0; \
+ leaq s5(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##0; \
movzbl RW0bl, RT3d; \
movzbl RW0bh, RT1d; \
load_next_key(n, RW0); \
- xorq s3(, RT3, 8), to##0; \
- xorq s1(, RT1, 8), to##0; \
+ leaq s3(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##0; \
+ leaq s1(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##0; \
xorq from##1, RW1; \
movzbl RW1bl, RT3d; \
movzbl RW1bh, RT1d; \
shrq $16, RW1; \
- xorq s8(, RT3, 8), to##1; \
- xorq s6(, RT1, 8), to##1; \
+ leaq s8(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##1; \
+ leaq s6(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##1; \
movzbl RW1bl, RT3d; \
movzbl RW1bh, RT1d; \
shrq $16, RW1; \
- xorq s4(, RT3, 8), to##1; \
- xorq s2(, RT1, 8), to##1; \
+ leaq s4(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##1; \
+ leaq s2(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##1; \
movzbl RW1bl, RT3d; \
movzbl RW1bh, RT1d; \
shrl $16, RW1d; \
- xorq s7(, RT3, 8), to##1; \
- xorq s5(, RT1, 8), to##1; \
+ leaq s7(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##1; \
+ leaq s5(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##1; \
movzbl RW1bl, RT3d; \
movzbl RW1bh, RT1d; \
do_movq(RW0, RW1); \
- xorq s3(, RT3, 8), to##1; \
- xorq s1(, RT1, 8), to##1; \
+ leaq s3(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##1; \
+ leaq s1(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##1; \
xorq from##2, RW2; \
movzbl RW2bl, RT3d; \
movzbl RW2bh, RT1d; \
shrq $16, RW2; \
- xorq s8(, RT3, 8), to##2; \
- xorq s6(, RT1, 8), to##2; \
+ leaq s8(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##2; \
+ leaq s6(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##2; \
movzbl RW2bl, RT3d; \
movzbl RW2bh, RT1d; \
shrq $16, RW2; \
- xorq s4(, RT3, 8), to##2; \
- xorq s2(, RT1, 8), to##2; \
+ leaq s4(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##2; \
+ leaq s2(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##2; \
movzbl RW2bl, RT3d; \
movzbl RW2bh, RT1d; \
shrl $16, RW2d; \
- xorq s7(, RT3, 8), to##2; \
- xorq s5(, RT1, 8), to##2; \
+ leaq s7(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##2; \
+ leaq s5(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##2; \
movzbl RW2bl, RT3d; \
movzbl RW2bh, RT1d; \
do_movq(RW0, RW2); \
- xorq s3(, RT3, 8), to##2; \
- xorq s1(, RT1, 8), to##2;
+ leaq s3(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##2; \
+ leaq s1(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##2;
#define __movq(src, dst) \
movq src, dst;
diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S
index 257ed9446f3e..99cb983ded9e 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_asm.S
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -93,7 +93,7 @@ SYM_FUNC_START(clmul_ghash_mul)
FRAME_BEGIN
movups (%rdi), DATA
movups (%rsi), SHASH
- movaps .Lbswap_mask, BSWAP
+ movaps .Lbswap_mask(%rip), BSWAP
pshufb BSWAP, DATA
call __clmul_gf128mul_ble
pshufb BSWAP, DATA
@@ -110,7 +110,7 @@ SYM_FUNC_START(clmul_ghash_update)
FRAME_BEGIN
cmp $16, %rdx
jb .Lupdate_just_ret # check length
- movaps .Lbswap_mask, BSWAP
+ movaps .Lbswap_mask(%rip), BSWAP
movups (%rdi), DATA
movups (%rcx), SHASH
pshufb BSWAP, DATA
diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
index a96b2fd26dab..4b49bdc95265 100644
--- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S
+++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
@@ -485,18 +485,18 @@
xchg WK_BUF, PRECALC_BUF
.align 32
-_loop:
+.L_loop:
/*
* code loops through more than one block
* we use K_BASE value as a signal of a last block,
* it is set below by: cmovae BUFFER_PTR, K_BASE
*/
test BLOCKS_CTR, BLOCKS_CTR
- jnz _begin
+ jnz .L_begin
.align 32
- jmp _end
+ jmp .L_end
.align 32
-_begin:
+.L_begin:
/*
* Do first block
@@ -508,9 +508,6 @@ _begin:
.set j, j+2
.endr
- jmp _loop0
-_loop0:
-
/*
* rounds:
* 10,12,14,16,18
@@ -545,7 +542,7 @@ _loop0:
UPDATE_HASH 16(HASH_PTR), E
test BLOCKS_CTR, BLOCKS_CTR
- jz _loop
+ jz .L_loop
mov TB, B
@@ -562,8 +559,6 @@ _loop0:
.set j, j+2
.endr
- jmp _loop1
-_loop1:
/*
* rounds
* 20+80,22+80,24+80,26+80,28+80
@@ -574,9 +569,6 @@ _loop1:
.set j, j+2
.endr
- jmp _loop2
-_loop2:
-
/*
* rounds
* 40+80,42+80,44+80,46+80,48+80
@@ -592,9 +584,6 @@ _loop2:
/* Move to the next block only if needed*/
ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
- jmp _loop3
-_loop3:
-
/*
* rounds
* 60+80,62+80,64+80,66+80,68+80
@@ -623,10 +612,10 @@ _loop3:
xchg WK_BUF, PRECALC_BUF
- jmp _loop
+ jmp .L_loop
.align 32
- _end:
+.L_end:
.endm
/*
diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S
index 5555b5d5215a..53de72bdd851 100644
--- a/arch/x86/crypto/sha256-avx-asm.S
+++ b/arch/x86/crypto/sha256-avx-asm.S
@@ -360,7 +360,7 @@ SYM_TYPED_FUNC_START(sha256_transform_avx)
and $~15, %rsp # align stack pointer
shl $6, NUM_BLKS # convert to bytes
- jz done_hash
+ jz .Ldone_hash
add INP, NUM_BLKS # pointer to end of data
mov NUM_BLKS, _INP_END(%rsp)
@@ -377,7 +377,7 @@ SYM_TYPED_FUNC_START(sha256_transform_avx)
vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
vmovdqa _SHUF_00BA(%rip), SHUF_00BA
vmovdqa _SHUF_DC00(%rip), SHUF_DC00
-loop0:
+.Lloop0:
lea K256(%rip), TBL
## byte swap first 16 dwords
@@ -391,7 +391,7 @@ loop0:
## schedule 48 input dwords, by doing 3 rounds of 16 each
mov $3, SRND
.align 16
-loop1:
+.Lloop1:
vpaddd (TBL), X0, XFER
vmovdqa XFER, _XFER(%rsp)
FOUR_ROUNDS_AND_SCHED
@@ -410,10 +410,10 @@ loop1:
FOUR_ROUNDS_AND_SCHED
sub $1, SRND
- jne loop1
+ jne .Lloop1
mov $2, SRND
-loop2:
+.Lloop2:
vpaddd (TBL), X0, XFER
vmovdqa XFER, _XFER(%rsp)
DO_ROUND 0
@@ -433,7 +433,7 @@ loop2:
vmovdqa X3, X1
sub $1, SRND
- jne loop2
+ jne .Lloop2
addm (4*0)(CTX),a
addm (4*1)(CTX),b
@@ -447,9 +447,9 @@ loop2:
mov _INP(%rsp), INP
add $64, INP
cmp _INP_END(%rsp), INP
- jne loop0
+ jne .Lloop0
-done_hash:
+.Ldone_hash:
mov %rbp, %rsp
popq %rbp
diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S
index 3eada9416852..9918212faf91 100644
--- a/arch/x86/crypto/sha256-avx2-asm.S
+++ b/arch/x86/crypto/sha256-avx2-asm.S
@@ -538,12 +538,12 @@ SYM_TYPED_FUNC_START(sha256_transform_rorx)
and $-32, %rsp # align rsp to 32 byte boundary
shl $6, NUM_BLKS # convert to bytes
- jz done_hash
+ jz .Ldone_hash
lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
mov NUM_BLKS, _INP_END(%rsp)
cmp NUM_BLKS, INP
- je only_one_block
+ je .Lonly_one_block
## load initial digest
mov (CTX), a
@@ -561,7 +561,7 @@ SYM_TYPED_FUNC_START(sha256_transform_rorx)
mov CTX, _CTX(%rsp)
-loop0:
+.Lloop0:
## Load first 16 dwords from two blocks
VMOVDQ 0*32(INP),XTMP0
VMOVDQ 1*32(INP),XTMP1
@@ -580,7 +580,7 @@ loop0:
vperm2i128 $0x20, XTMP3, XTMP1, X2
vperm2i128 $0x31, XTMP3, XTMP1, X3
-last_block_enter:
+.Llast_block_enter:
add $64, INP
mov INP, _INP(%rsp)
@@ -588,34 +588,40 @@ last_block_enter:
xor SRND, SRND
.align 16
-loop1:
- vpaddd K256+0*32(SRND), X0, XFER
+.Lloop1:
+ leaq K256+0*32(%rip), INP ## reuse INP as scratch reg
+ vpaddd (INP, SRND), X0, XFER
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 0*32
- vpaddd K256+1*32(SRND), X0, XFER
+ leaq K256+1*32(%rip), INP
+ vpaddd (INP, SRND), X0, XFER
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 1*32
- vpaddd K256+2*32(SRND), X0, XFER
+ leaq K256+2*32(%rip), INP
+ vpaddd (INP, SRND), X0, XFER
vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 2*32
- vpaddd K256+3*32(SRND), X0, XFER
+ leaq K256+3*32(%rip), INP
+ vpaddd (INP, SRND), X0, XFER
vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 3*32
add $4*32, SRND
cmp $3*4*32, SRND
- jb loop1
+ jb .Lloop1
-loop2:
+.Lloop2:
## Do last 16 rounds with no scheduling
- vpaddd K256+0*32(SRND), X0, XFER
+ leaq K256+0*32(%rip), INP
+ vpaddd (INP, SRND), X0, XFER
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
DO_4ROUNDS _XFER + 0*32
- vpaddd K256+1*32(SRND), X1, XFER
+ leaq K256+1*32(%rip), INP
+ vpaddd (INP, SRND), X1, XFER
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
DO_4ROUNDS _XFER + 1*32
add $2*32, SRND
@@ -624,7 +630,7 @@ loop2:
vmovdqa X3, X1
cmp $4*4*32, SRND
- jb loop2
+ jb .Lloop2
mov _CTX(%rsp), CTX
mov _INP(%rsp), INP
@@ -639,17 +645,17 @@ loop2:
addm (4*7)(CTX),h
cmp _INP_END(%rsp), INP
- ja done_hash
+ ja .Ldone_hash
#### Do second block using previously scheduled results
xor SRND, SRND
.align 16
-loop3:
+.Lloop3:
DO_4ROUNDS _XFER + 0*32 + 16
DO_4ROUNDS _XFER + 1*32 + 16
add $2*32, SRND
cmp $4*4*32, SRND
- jb loop3
+ jb .Lloop3
mov _CTX(%rsp), CTX
mov _INP(%rsp), INP
@@ -665,10 +671,10 @@ loop3:
addm (4*7)(CTX),h
cmp _INP_END(%rsp), INP
- jb loop0
- ja done_hash
+ jb .Lloop0
+ ja .Ldone_hash
-do_last_block:
+.Ldo_last_block:
VMOVDQ 0*16(INP),XWORD0
VMOVDQ 1*16(INP),XWORD1
VMOVDQ 2*16(INP),XWORD2
@@ -679,9 +685,9 @@ do_last_block:
vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2
vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3
- jmp last_block_enter
+ jmp .Llast_block_enter
-only_one_block:
+.Lonly_one_block:
## load initial digest
mov (4*0)(CTX),a
@@ -698,9 +704,9 @@ only_one_block:
vmovdqa _SHUF_DC00(%rip), SHUF_DC00
mov CTX, _CTX(%rsp)
- jmp do_last_block
+ jmp .Ldo_last_block
-done_hash:
+.Ldone_hash:
mov %rbp, %rsp
pop %rbp
diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S
index 959288eecc68..93264ee44543 100644
--- a/arch/x86/crypto/sha256-ssse3-asm.S
+++ b/arch/x86/crypto/sha256-ssse3-asm.S
@@ -369,7 +369,7 @@ SYM_TYPED_FUNC_START(sha256_transform_ssse3)
and $~15, %rsp
shl $6, NUM_BLKS # convert to bytes
- jz done_hash
+ jz .Ldone_hash
add INP, NUM_BLKS
mov NUM_BLKS, _INP_END(%rsp) # pointer to end of data
@@ -387,7 +387,7 @@ SYM_TYPED_FUNC_START(sha256_transform_ssse3)
movdqa _SHUF_00BA(%rip), SHUF_00BA
movdqa _SHUF_DC00(%rip), SHUF_DC00
-loop0:
+.Lloop0:
lea K256(%rip), TBL
## byte swap first 16 dwords
@@ -401,7 +401,7 @@ loop0:
## schedule 48 input dwords, by doing 3 rounds of 16 each
mov $3, SRND
.align 16
-loop1:
+.Lloop1:
movdqa (TBL), XFER
paddd X0, XFER
movdqa XFER, _XFER(%rsp)
@@ -424,10 +424,10 @@ loop1:
FOUR_ROUNDS_AND_SCHED
sub $1, SRND
- jne loop1
+ jne .Lloop1
mov $2, SRND
-loop2:
+.Lloop2:
paddd (TBL), X0
movdqa X0, _XFER(%rsp)
DO_ROUND 0
@@ -446,7 +446,7 @@ loop2:
movdqa X3, X1
sub $1, SRND
- jne loop2
+ jne .Lloop2
addm (4*0)(CTX),a
addm (4*1)(CTX),b
@@ -460,9 +460,9 @@ loop2:
mov _INP(%rsp), INP
add $64, INP
cmp _INP_END(%rsp), INP
- jne loop0
+ jne .Lloop0
-done_hash:
+.Ldone_hash:
mov %rbp, %rsp
popq %rbp
diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S
index b0984f19fdb4..d902b8ea0721 100644
--- a/arch/x86/crypto/sha512-avx-asm.S
+++ b/arch/x86/crypto/sha512-avx-asm.S
@@ -276,7 +276,7 @@ frame_size = frame_WK + WK_SIZE
########################################################################
SYM_TYPED_FUNC_START(sha512_transform_avx)
test msglen, msglen
- je nowork
+ je .Lnowork
# Save GPRs
push %rbx
@@ -291,7 +291,7 @@ SYM_TYPED_FUNC_START(sha512_transform_avx)
sub $frame_size, %rsp
and $~(0x20 - 1), %rsp
-updateblock:
+.Lupdateblock:
# Load state variables
mov DIGEST(0), a_64
@@ -348,7 +348,7 @@ updateblock:
# Advance to next message block
add $16*8, msg
dec msglen
- jnz updateblock
+ jnz .Lupdateblock
# Restore Stack Pointer
mov %rbp, %rsp
@@ -361,7 +361,7 @@ updateblock:
pop %r12
pop %rbx
-nowork:
+.Lnowork:
RET
SYM_FUNC_END(sha512_transform_avx)
diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
index b1ca99055ef9..f08496cd6870 100644
--- a/arch/x86/crypto/sha512-avx2-asm.S
+++ b/arch/x86/crypto/sha512-avx2-asm.S
@@ -581,7 +581,7 @@ SYM_TYPED_FUNC_START(sha512_transform_rorx)
and $~(0x20 - 1), %rsp
shl $7, NUM_BLKS # convert to bytes
- jz done_hash
+ jz .Ldone_hash
add INP, NUM_BLKS # pointer to end of data
mov NUM_BLKS, frame_INPEND(%rsp)
@@ -600,7 +600,7 @@ SYM_TYPED_FUNC_START(sha512_transform_rorx)
vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
-loop0:
+.Lloop0:
lea K512(%rip), TBL
## byte swap first 16 dwords
@@ -615,7 +615,7 @@ loop0:
movq $4, frame_SRND(%rsp)
.align 16
-loop1:
+.Lloop1:
vpaddq (TBL), Y_0, XFER
vmovdqa XFER, frame_XFER(%rsp)
FOUR_ROUNDS_AND_SCHED
@@ -634,10 +634,10 @@ loop1:
FOUR_ROUNDS_AND_SCHED
subq $1, frame_SRND(%rsp)
- jne loop1
+ jne .Lloop1
movq $2, frame_SRND(%rsp)
-loop2:
+.Lloop2:
vpaddq (TBL), Y_0, XFER
vmovdqa XFER, frame_XFER(%rsp)
DO_4ROUNDS
@@ -650,7 +650,7 @@ loop2:
vmovdqa Y_3, Y_1
subq $1, frame_SRND(%rsp)
- jne loop2
+ jne .Lloop2
mov frame_CTX(%rsp), CTX2
addm 8*0(CTX2), a
@@ -665,9 +665,9 @@ loop2:
mov frame_INP(%rsp), INP
add $128, INP
cmp frame_INPEND(%rsp), INP
- jne loop0
+ jne .Lloop0
-done_hash:
+.Ldone_hash:
# Restore Stack Pointer
mov %rbp, %rsp
diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S
index c06afb5270e5..65be30156816 100644
--- a/arch/x86/crypto/sha512-ssse3-asm.S
+++ b/arch/x86/crypto/sha512-ssse3-asm.S
@@ -278,7 +278,7 @@ frame_size = frame_WK + WK_SIZE
SYM_TYPED_FUNC_START(sha512_transform_ssse3)
test msglen, msglen
- je nowork
+ je .Lnowork
# Save GPRs
push %rbx
@@ -293,7 +293,7 @@ SYM_TYPED_FUNC_START(sha512_transform_ssse3)
sub $frame_size, %rsp
and $~(0x20 - 1), %rsp
-updateblock:
+.Lupdateblock:
# Load state variables
mov DIGEST(0), a_64
@@ -350,7 +350,7 @@ updateblock:
# Advance to next message block
add $16*8, msg
dec msglen
- jnz updateblock
+ jnz .Lupdateblock
# Restore Stack Pointer
mov %rbp, %rsp
@@ -363,7 +363,7 @@ updateblock:
pop %r12
pop %rbx
-nowork:
+.Lnowork:
RET
SYM_FUNC_END(sha512_transform_ssse3)
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 52398d49bc2f..69ae5e1b3120 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -12,6 +12,7 @@
#include <linux/kvm_host.h>
#include <linux/kernel.h>
#include <linux/highmem.h>
+#include <linux/psp.h>
#include <linux/psp-sev.h>
#include <linux/pagemap.h>
#include <linux/swap.h>