diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-02-21 17:23:56 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-02-21 17:23:56 -0800 |
commit | 31caf8b2a847214be856f843e251fc2ed2cd1075 (patch) | |
tree | 245257387cfcae352fae27b1ca824daab55472ba /arch/arm64 | |
parent | a2b095e0efa7229a1a88602283ba1a8a32004851 (diff) | |
parent | 0de9dc80625b0ca1cb9730c5ed1c5a8cab538369 (diff) |
Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto update from Herbert Xu:
"API:
- Restrict crypto_cipher to internal API users only.
Algorithms:
- Add x86 aesni acceleration for cts.
- Improve x86 aesni acceleration for xts.
- Remove x86 acceleration of some uncommon algorithms.
- Remove RIPE-MD, Tiger and Salsa20.
- Remove tnepres.
- Add ARM acceleration for BLAKE2s and BLAKE2b.
Drivers:
- Add Keem Bay OCS HCU driver.
- Add Marvell OcteonTX2 CPT PF driver.
- Remove PicoXcell driver.
- Remove mediatek driver"
* 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (154 commits)
hwrng: timeriomem - Use device-managed registration API
crypto: hisilicon/qm - fix printing format issue
crypto: hisilicon/qm - do not reset hardware when CE happens
crypto: hisilicon/qm - update irqflag
crypto: hisilicon/qm - fix the value of 'QM_SQC_VFT_BASE_MASK_V2'
crypto: hisilicon/qm - fix request missing error
crypto: hisilicon/qm - removing driver after reset
crypto: octeontx2 - fix -Wpointer-bool-conversion warning
crypto: hisilicon/hpre - enable Elliptic curve cryptography
crypto: hisilicon - PASID fixed on Kunpeng 930
crypto: hisilicon/qm - fix use of 'dma_map_single'
crypto: hisilicon/hpre - tiny fix
crypto: hisilicon/hpre - adapt the number of clusters
crypto: cpt - remove casting dma_alloc_coherent
crypto: keembay-ocs-aes - Fix 'q' assignment during CCM B0 generation
crypto: xor - Fix typo of optimization
hwrng: optee - Use device-managed registration API
crypto: arm64/crc-t10dif - move NEON yield to C code
crypto: arm64/aes-ce-mac - simplify NEON yield
crypto: arm64/aes-neonbs - remove NEON yield calls
...
Diffstat (limited to 'arch/arm64')
-rw-r--r-- | arch/arm64/crypto/aes-glue.c | 71 | ||||
-rw-r--r-- | arch/arm64/crypto/aes-modes.S | 217 | ||||
-rw-r--r-- | arch/arm64/crypto/aes-neonbs-core.S | 8 | ||||
-rw-r--r-- | arch/arm64/crypto/crct10dif-ce-core.S | 43 | ||||
-rw-r--r-- | arch/arm64/crypto/crct10dif-ce-glue.c | 30 | ||||
-rw-r--r-- | arch/arm64/crypto/sha1-ce-core.S | 47 | ||||
-rw-r--r-- | arch/arm64/crypto/sha1-ce-glue.c | 23 | ||||
-rw-r--r-- | arch/arm64/crypto/sha2-ce-core.S | 38 | ||||
-rw-r--r-- | arch/arm64/crypto/sha2-ce-glue.c | 24 | ||||
-rw-r--r-- | arch/arm64/crypto/sha3-ce-core.S | 81 | ||||
-rw-r--r-- | arch/arm64/crypto/sha3-ce-glue.c | 18 | ||||
-rw-r--r-- | arch/arm64/crypto/sha512-ce-core.S | 29 | ||||
-rw-r--r-- | arch/arm64/crypto/sha512-ce-glue.c | 55 |
13 files changed, 345 insertions, 339 deletions
diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index 34b8a89197be..17e735931a0c 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -24,6 +24,7 @@ #ifdef USE_V8_CRYPTO_EXTENSIONS #define MODE "ce" #define PRIO 300 +#define STRIDE 5 #define aes_expandkey ce_aes_expandkey #define aes_ecb_encrypt ce_aes_ecb_encrypt #define aes_ecb_decrypt ce_aes_ecb_decrypt @@ -41,6 +42,7 @@ MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions"); #else #define MODE "neon" #define PRIO 200 +#define STRIDE 4 #define aes_ecb_encrypt neon_aes_ecb_encrypt #define aes_ecb_decrypt neon_aes_ecb_decrypt #define aes_cbc_encrypt neon_aes_cbc_encrypt @@ -55,7 +57,7 @@ MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions"); #define aes_mac_update neon_aes_mac_update MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 NEON"); #endif -#if defined(USE_V8_CRYPTO_EXTENSIONS) || !defined(CONFIG_CRYPTO_AES_ARM64_BS) +#if defined(USE_V8_CRYPTO_EXTENSIONS) || !IS_ENABLED(CONFIG_CRYPTO_AES_ARM64_BS) MODULE_ALIAS_CRYPTO("ecb(aes)"); MODULE_ALIAS_CRYPTO("cbc(aes)"); MODULE_ALIAS_CRYPTO("ctr(aes)"); @@ -87,7 +89,7 @@ asmlinkage void aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds, int bytes, u8 const iv[]); asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[], - int rounds, int blocks, u8 ctr[]); + int rounds, int bytes, u8 ctr[], u8 finalbuf[]); asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds, int bytes, u32 const rk2[], u8 iv[], @@ -103,9 +105,9 @@ asmlinkage void aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds, int blocks, u8 iv[], u32 const rk2[]); -asmlinkage void aes_mac_update(u8 const in[], u32 const rk[], int rounds, - int blocks, u8 dg[], int enc_before, - int enc_after); +asmlinkage int aes_mac_update(u8 const in[], u32 const rk[], int rounds, + int blocks, u8 dg[], int enc_before, + int enc_after); struct crypto_aes_xts_ctx { struct crypto_aes_ctx key1; @@ -448,34 +450,36 @@ static int ctr_encrypt(struct skcipher_request *req) struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); int err, rounds = 6 + ctx->key_length / 4; struct skcipher_walk walk; - int blocks; err = skcipher_walk_virt(&walk, req, false); - while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) { - kernel_neon_begin(); - aes_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr, - ctx->key_enc, rounds, blocks, walk.iv); - kernel_neon_end(); - err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE); - } - if (walk.nbytes) { - u8 __aligned(8) tail[AES_BLOCK_SIZE]; + while (walk.nbytes > 0) { + const u8 *src = walk.src.virt.addr; unsigned int nbytes = walk.nbytes; - u8 *tdst = walk.dst.virt.addr; - u8 *tsrc = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + u8 buf[AES_BLOCK_SIZE]; + unsigned int tail; - /* - * Tell aes_ctr_encrypt() to process a tail block. - */ - blocks = -1; + if (unlikely(nbytes < AES_BLOCK_SIZE)) + src = memcpy(buf, src, nbytes); + else if (nbytes < walk.total) + nbytes &= ~(AES_BLOCK_SIZE - 1); kernel_neon_begin(); - aes_ctr_encrypt(tail, NULL, ctx->key_enc, rounds, - blocks, walk.iv); + aes_ctr_encrypt(dst, src, ctx->key_enc, rounds, nbytes, + walk.iv, buf); kernel_neon_end(); - crypto_xor_cpy(tdst, tsrc, tail, nbytes); - err = skcipher_walk_done(&walk, 0); + + tail = nbytes % (STRIDE * AES_BLOCK_SIZE); + if (tail > 0 && tail < AES_BLOCK_SIZE) + /* + * The final partial block could not be returned using + * an overlapping store, so it was passed via buf[] + * instead. + */ + memcpy(dst + nbytes - tail, buf, tail); + + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } return err; @@ -650,7 +654,7 @@ static int __maybe_unused xts_decrypt(struct skcipher_request *req) } static struct skcipher_alg aes_algs[] = { { -#if defined(USE_V8_CRYPTO_EXTENSIONS) || !defined(CONFIG_CRYPTO_AES_ARM64_BS) +#if defined(USE_V8_CRYPTO_EXTENSIONS) || !IS_ENABLED(CONFIG_CRYPTO_AES_ARM64_BS) .base = { .cra_name = "__ecb(aes)", .cra_driver_name = "__ecb-aes-" MODE, @@ -852,10 +856,17 @@ static void mac_do_update(struct crypto_aes_ctx *ctx, u8 const in[], int blocks, int rounds = 6 + ctx->key_length / 4; if (crypto_simd_usable()) { - kernel_neon_begin(); - aes_mac_update(in, ctx->key_enc, rounds, blocks, dg, enc_before, - enc_after); - kernel_neon_end(); + int rem; + + do { + kernel_neon_begin(); + rem = aes_mac_update(in, ctx->key_enc, rounds, blocks, + dg, enc_before, enc_after); + kernel_neon_end(); + in += (blocks - rem) * AES_BLOCK_SIZE; + blocks = rem; + enc_before = 0; + } while (blocks); } else { if (enc_before) aes_encrypt(ctx, dg, dg); diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S index cf618d8f6cec..bbdb54702aa7 100644 --- a/arch/arm64/crypto/aes-modes.S +++ b/arch/arm64/crypto/aes-modes.S @@ -321,42 +321,76 @@ AES_FUNC_END(aes_cbc_cts_decrypt) /* * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, - * int blocks, u8 ctr[]) + * int bytes, u8 ctr[], u8 finalbuf[]) */ AES_FUNC_START(aes_ctr_encrypt) stp x29, x30, [sp, #-16]! mov x29, sp - enc_prepare w3, x2, x6 + enc_prepare w3, x2, x12 ld1 {vctr.16b}, [x5] - umov x6, vctr.d[1] /* keep swabbed ctr in reg */ - rev x6, x6 - cmn w6, w4 /* 32 bit overflow? */ - bcs .Lctrloop + umov x12, vctr.d[1] /* keep swabbed ctr in reg */ + rev x12, x12 + .LctrloopNx: - subs w4, w4, #MAX_STRIDE - bmi .Lctr1x - add w7, w6, #1 + add w7, w4, #15 + sub w4, w4, #MAX_STRIDE << 4 + lsr w7, w7, #4 + mov w8, #MAX_STRIDE + cmp w7, w8 + csel w7, w7, w8, lt + adds x12, x12, x7 + mov v0.16b, vctr.16b - add w8, w6, #2 mov v1.16b, vctr.16b - add w9, w6, #3 mov v2.16b, vctr.16b - add w9, w6, #3 - rev w7, w7 mov v3.16b, vctr.16b - rev w8, w8 ST5( mov v4.16b, vctr.16b ) - mov v1.s[3], w7 - rev w9, w9 -ST5( add w10, w6, #4 ) - mov v2.s[3], w8 -ST5( rev w10, w10 ) - mov v3.s[3], w9 -ST5( mov v4.s[3], w10 ) - ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */ + bcs 0f + + .subsection 1 + /* apply carry to outgoing counter */ +0: umov x8, vctr.d[0] + rev x8, x8 + add x8, x8, #1 + rev x8, x8 + ins vctr.d[0], x8 + + /* apply carry to N counter blocks for N := x12 */ + adr x16, 1f + sub x16, x16, x12, lsl #3 + br x16 + hint 34 // bti c + mov v0.d[0], vctr.d[0] + hint 34 // bti c + mov v1.d[0], vctr.d[0] + hint 34 // bti c + mov v2.d[0], vctr.d[0] + hint 34 // bti c + mov v3.d[0], vctr.d[0] +ST5( hint 34 ) +ST5( mov v4.d[0], vctr.d[0] ) +1: b 2f + .previous + +2: rev x7, x12 + ins vctr.d[1], x7 + sub x7, x12, #MAX_STRIDE - 1 + sub x8, x12, #MAX_STRIDE - 2 + sub x9, x12, #MAX_STRIDE - 3 + rev x7, x7 + rev x8, x8 + mov v1.d[1], x7 + rev x9, x9 +ST5( sub x10, x12, #MAX_STRIDE - 4 ) + mov v2.d[1], x8 +ST5( rev x10, x10 ) + mov v3.d[1], x9 +ST5( mov v4.d[1], x10 ) + tbnz w4, #31, .Lctrtail + ld1 {v5.16b-v7.16b}, [x1], #48 ST4( bl aes_encrypt_block4x ) ST5( bl aes_encrypt_block5x ) eor v0.16b, v5.16b, v0.16b @@ -368,47 +402,72 @@ ST5( ld1 {v5.16b-v6.16b}, [x1], #32 ) ST5( eor v4.16b, v6.16b, v4.16b ) st1 {v0.16b-v3.16b}, [x0], #64 ST5( st1 {v4.16b}, [x0], #16 ) - add x6, x6, #MAX_STRIDE - rev x7, x6 - ins vctr.d[1], x7 cbz w4, .Lctrout b .LctrloopNx -.Lctr1x: - adds w4, w4, #MAX_STRIDE - beq .Lctrout -.Lctrloop: - mov v0.16b, vctr.16b - encrypt_block v0, w3, x2, x8, w7 - - adds x6, x6, #1 /* increment BE ctr */ - rev x7, x6 - ins vctr.d[1], x7 - bcs .Lctrcarry /* overflow? */ - -.Lctrcarrydone: - subs w4, w4, #1 - bmi .Lctrtailblock /* blocks <0 means tail block */ - ld1 {v3.16b}, [x1], #16 - eor v3.16b, v0.16b, v3.16b - st1 {v3.16b}, [x0], #16 - bne .Lctrloop .Lctrout: st1 {vctr.16b}, [x5] /* return next CTR value */ ldp x29, x30, [sp], #16 ret -.Lctrtailblock: - st1 {v0.16b}, [x0] +.Lctrtail: + /* XOR up to MAX_STRIDE * 16 - 1 bytes of in/output with v0 ... v3/v4 */ + mov x16, #16 + ands x13, x4, #0xf + csel x13, x13, x16, ne + +ST5( cmp w4, #64 - (MAX_STRIDE << 4) ) +ST5( csel x14, x16, xzr, gt ) + cmp w4, #48 - (MAX_STRIDE << 4) + csel x15, x16, xzr, gt + cmp w4, #32 - (MAX_STRIDE << 4) + csel x16, x16, xzr, gt + cmp w4, #16 - (MAX_STRIDE << 4) + ble .Lctrtail1x + + adr_l x12, .Lcts_permute_table + add x12, x12, x13 + +ST5( ld1 {v5.16b}, [x1], x14 ) + ld1 {v6.16b}, [x1], x15 + ld1 {v7.16b}, [x1], x16 + +ST4( bl aes_encrypt_block4x ) +ST5( bl aes_encrypt_block5x ) + + ld1 {v8.16b}, [x1], x13 + ld1 {v9.16b}, [x1] + ld1 {v10.16b}, [x12] + +ST4( eor v6.16b, v6.16b, v0.16b ) +ST4( eor v7.16b, v7.16b, v1.16b ) +ST4( tbl v3.16b, {v3.16b}, v10.16b ) +ST4( eor v8.16b, v8.16b, v2.16b ) +ST4( eor v9.16b, v9.16b, v3.16b ) + +ST5( eor v5.16b, v5.16b, v0.16b ) +ST5( eor v6.16b, v6.16b, v1.16b ) +ST5( tbl v4.16b, {v4.16b}, v10.16b ) +ST5( eor v7.16b, v7.16b, v2.16b ) +ST5( eor v8.16b, v8.16b, v3.16b ) +ST5( eor v9.16b, v9.16b, v4.16b ) + +ST5( st1 {v5.16b}, [x0], x14 ) + st1 {v6.16b}, [x0], x15 + st1 {v7.16b}, [x0], x16 + add x13, x13, x0 + st1 {v9.16b}, [x13] // overlapping stores + st1 {v8.16b}, [x0] b .Lctrout -.Lctrcarry: - umov x7, vctr.d[0] /* load upper word of ctr */ - rev x7, x7 /* ... to handle the carry */ - add x7, x7, #1 - rev x7, x7 - ins vctr.d[0], x7 - b .Lctrcarrydone +.Lctrtail1x: + csel x0, x0, x6, eq // use finalbuf if less than a full block + ld1 {v5.16b}, [x1] +ST5( mov v3.16b, v4.16b ) + encrypt_block v3, w3, x2, x8, w7 + eor v5.16b, v5.16b, v3.16b + st1 {v5.16b}, [x0] + b .Lctrout AES_FUNC_END(aes_ctr_encrypt) @@ -619,61 +678,47 @@ AES_FUNC_END(aes_xts_decrypt) * int blocks, u8 dg[], int enc_before, int enc_after) */ AES_FUNC_START(aes_mac_update) - frame_push 6 - - mov x19, x0 - mov x20, x1 - mov x21, x2 - mov x22, x3 - mov x23, x4 - mov x24, x6 - - ld1 {v0.16b}, [x23] /* get dg */ + ld1 {v0.16b}, [x4] /* get dg */ enc_prepare w2, x1, x7 cbz w5, .Lmacloop4x encrypt_block v0, w2, x1, x7, w8 .Lmacloop4x: - subs w22, w22, #4 + subs w3, w3, #4 bmi .Lmac1x - ld1 {v1.16b-v4.16b}, [x19], #64 /* get next pt block */ + ld1 {v1.16b-v4.16b}, [x0], #64 /* get next pt block */ eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */ - encrypt_block v0, w21, x20, x7, w8 + encrypt_block v0, w2, x1, x7, w8 eor v0.16b, v0.16b, v2.16b - encrypt_block v0, w21, x20, x7, w8 + encrypt_block v0, w2, x1, x7, w8 eor v0.16b, v0.16b, v3.16b - encrypt_block v0, w21, x20, x7, w8 + encrypt_block v0, w2, x1, x7, w8 eor v0.16b, v0.16b, v4.16b - cmp w22, wzr - csinv x5, x24, xzr, eq + cmp w3, wzr + csinv x5, x6, xzr, eq cbz w5, .Lmacout - encrypt_block v0, w21, x20, x7, w8 - st1 {v0.16b}, [x23] /* return dg */ - cond_yield_neon .Lmacrestart + encrypt_block v0, w2, x1, x7, w8 + st1 {v0.16b}, [x4] /* return dg */ + cond_yield .Lmacout, x7 b .Lmacloop4x .Lmac1x: - add w22, w22, #4 + add w3, w3, #4 .Lmacloop: - cbz w22, .Lmacout - ld1 {v1.16b}, [x19], #16 /* get next pt block */ + cbz w3, .Lmacout + ld1 {v1.16b}, [x0], #16 /* get next pt block */ eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */ - subs w22, w22, #1 - csinv x5, x24, xzr, eq + subs w3, w3, #1 + csinv x5, x6, xzr, eq cbz w5, .Lmacout .Lmacenc: - encrypt_block v0, w21, x20, x7, w8 + encrypt_block v0, w2, x1, x7, w8 b .Lmacloop .Lmacout: - st1 {v0.16b}, [x23] /* return dg */ - frame_pop + st1 {v0.16b}, [x4] /* return dg */ + mov w0, w3 ret - -.Lmacrestart: - ld1 {v0.16b}, [x23] /* get dg */ - enc_prepare w21, x20, x0 - b .Lmacloop4x AES_FUNC_END(aes_mac_update) diff --git a/arch/arm64/crypto/aes-neonbs-core.S b/arch/arm64/crypto/aes-neonbs-core.S index 63a52ad9a75c..a3405b8c344b 100644 --- a/arch/arm64/crypto/aes-neonbs-core.S +++ b/arch/arm64/crypto/aes-neonbs-core.S @@ -613,7 +613,6 @@ SYM_FUNC_END(aesbs_decrypt8) st1 {\o7\().16b}, [x19], #16 cbz x23, 1f - cond_yield_neon b 99b 1: frame_pop @@ -715,7 +714,6 @@ SYM_FUNC_START(aesbs_cbc_decrypt) 1: st1 {v24.16b}, [x24] // store IV cbz x23, 2f - cond_yield_neon b 99b 2: frame_pop @@ -801,7 +799,7 @@ SYM_FUNC_END(__xts_crypt8) mov x23, x4 mov x24, x5 -0: movi v30.2s, #0x1 + movi v30.2s, #0x1 movi v25.2s, #0x87 uzp1 v30.4s, v30.4s, v25.4s ld1 {v25.16b}, [x24] @@ -846,7 +844,6 @@ SYM_FUNC_END(__xts_crypt8) cbz x23, 1f st1 {v25.16b}, [x24] - cond_yield_neon 0b b 99b 1: st1 {v25.16b}, [x24] @@ -889,7 +886,7 @@ SYM_FUNC_START(aesbs_ctr_encrypt) cset x26, ne add x23, x23, x26 // do one extra block if final -98: ldp x7, x8, [x24] + ldp x7, x8, [x24] ld1 {v0.16b}, [x24] CPU_LE( rev x7, x7 ) CPU_LE( rev x8, x8 ) @@ -967,7 +964,6 @@ CPU_LE( rev x8, x8 ) st1 {v0.16b}, [x24] cbz x23, .Lctr_done - cond_yield_neon 98b b 99b .Lctr_done: diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S index 111d9c9abddd..dce6dcebfca1 100644 --- a/arch/arm64/crypto/crct10dif-ce-core.S +++ b/arch/arm64/crypto/crct10dif-ce-core.S @@ -68,10 +68,10 @@ .text .arch armv8-a+crypto - init_crc .req w19 - buf .req x20 - len .req x21 - fold_consts_ptr .req x22 + init_crc .req w0 + buf .req x1 + len .req x2 + fold_consts_ptr .req x3 fold_consts .req v10 @@ -257,12 +257,6 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 ) .endm .macro crc_t10dif_pmull, p - frame_push 4, 128 - - mov init_crc, w0 - mov buf, x1 - mov len, x2 - __pmull_init_\p // For sizes less than 256 bytes, we can't fold 128 bytes at a time. @@ -317,26 +311,7 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) fold_32_bytes \p, v6, v7 subs len, len, #128 - b.lt .Lfold_128_bytes_loop_done_\@ - - if_will_cond_yield_neon - stp q0, q1, [sp, #.Lframe_local_offset] - stp q2, q3, [sp, #.Lframe_local_offset + 32] - stp q4, q5, [sp, #.Lframe_local_offset + 64] - stp q6, q7, [sp, #.Lframe_local_offset + 96] - do_cond_yield_neon - ldp q0, q1, [sp, #.Lframe_local_offset] - ldp q2, q3, [sp, #.Lframe_local_offset + 32] - ldp q4, q5, [sp, #.Lframe_local_offset + 64] - ldp q6, q7, [sp, #.Lframe_local_offset + 96] - ld1 {fold_consts.2d}, [fold_consts_ptr] - __pmull_init_\p - __pmull_pre_\p fold_consts - endif_yield_neon - - b .Lfold_128_bytes_loop_\@ - -.Lfold_128_bytes_loop_done_\@: + b.ge .Lfold_128_bytes_loop_\@ // Now fold the 112 bytes in v0-v6 into the 16 bytes in v7. @@ -453,7 +428,9 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0. umov w0, v0.h[0] - frame_pop + .ifc \p, p8 + ldp x29, x30, [sp], #16 + .endif ret .Lless_than_256_bytes_\@: @@ -489,7 +466,9 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) // Assumes len >= 16. // SYM_FUNC_START(crc_t10dif_pmull_p8) - crc_t10dif_pmull p8 + stp x29, x30, [sp, #-16]! + mov x29, sp + crc_t10dif_pmull p8 SYM_FUNC_END(crc_t10dif_pmull_p8) .align 5 diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c index ccc3f6067742..09eb1456aed4 100644 --- a/arch/arm64/crypto/crct10dif-ce-glue.c +++ b/arch/arm64/crypto/crct10dif-ce-glue.c @@ -37,9 +37,18 @@ static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data, u16 *crc = shash_desc_ctx(desc); if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) { - kernel_neon_begin(); - *crc = crc_t10dif_pmull_p8(*crc, data, length); - kernel_neon_end(); + do { + unsigned int chunk = length; + + if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE) + chunk = SZ_4K; + + kernel_neon_begin(); + *crc = crc_t10dif_pmull_p8(*crc, data, chunk); + kernel_neon_end(); + data += chunk; + length -= chunk; + } while (length); } else { *crc = crc_t10dif_generic(*crc, data, length); } @@ -53,9 +62,18 @@ static int crct10dif_update_pmull_p64(struct shash_desc *desc, const u8 *data, u16 *crc = shash_desc_ctx(desc); if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) { - kernel_neon_begin(); - *crc = crc_t10dif_pmull_p64(*crc, data, length); - kernel_neon_end(); + do { + unsigned int chunk = length; + + if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE) + chunk = SZ_4K; + + kernel_neon_begin(); + *crc = crc_t10dif_pmull_p64(*crc, data, chunk); + kernel_neon_end(); + data += chunk; + length -= chunk; + } while (length); } else { *crc = crc_t10dif_generic(*crc, data, length); } diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S index 92d0d2753e81..8c02bbc2684e 100644 --- a/arch/arm64/crypto/sha1-ce-core.S +++ b/arch/arm64/crypto/sha1-ce-core.S @@ -62,40 +62,34 @@ .endm /* - * void sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src, - * int blocks) + * int sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src, + * int blocks) */ SYM_FUNC_START(sha1_ce_transform) - frame_push 3 - - mov x19, x0 - mov x20, x1 - mov x21, x2 - /* load round constants */ -0: loadrc k0.4s, 0x5a827999, w6 + loadrc k0.4s, 0x5a827999, w6 loadrc k1.4s, 0x6ed9eba1, w6 loadrc k2.4s, 0x8f1bbcdc, w6 loadrc k3.4s, 0xca62c1d6, w6 /* load state */ - ld1 {dgav.4s}, [x19] - ldr dgb, [x19, #16] + ld1 {dgav.4s}, [x0] + ldr dgb, [x0, #16] /* load sha1_ce_state::finalize */ ldr_l w4, sha1_ce_offsetof_finalize, x4 - ldr w4, [x19, x4] + ldr w4, [x0, x4] /* load input */ -1: ld1 {v8.4s-v11.4s}, [x20], #64 - sub w21, w21, #1 +0: ld1 {v8.4s-v11.4s}, [x1], #64 + sub w2, w2, #1 CPU_LE( rev32 v8.16b, v8.16b ) CPU_LE( rev32 v9.16b, v9.16b ) CPU_LE( rev32 v10.16b, v10.16b ) CPU_LE( rev32 v11.16b, v11.16b ) -2: add t0.4s, v8.4s, k0.4s +1: add t0.4s, v8.4s, k0.4s mov dg0v.16b, dgav.16b add_update c, ev, k0, 8, 9, 10, 11, dgb @@ -126,25 +120,18 @@ CPU_LE( rev32 v11.16b, v11.16b ) add dgbv.2s, dgbv.2s, dg1v.2s add dgav.4s, dgav.4s, dg0v.4s - cbz w21, 3f - - if_will_cond_yield_neon - st1 {dgav.4s}, [x19] - str dgb, [x19, #16] - do_cond_yield_neon + cbz w2, 2f + cond_yield 3f, x5 b 0b - endif_yield_neon - - b 1b /* * Final block: add padding and total bit count. * Skip if the input size was not a round multiple of the block size, * the padding is handled by the C code in that case. */ -3: cbz x4, 4f +2: cbz x4, 3f ldr_l w4, sha1_ce_offsetof_count, x4 - ldr x4, [x19, x4] + ldr x4, [x0, x4] movi v9.2d, #0 mov x8, #0x80000000 movi v10.2d, #0 @@ -153,11 +140,11 @@ CPU_LE( rev32 v11.16b, v11.16b ) mov x4, #0 mov v11.d[0], xzr mov v11.d[1], x7 - b 2b + b 1b /* store new state */ -4: st1 {dgav.4s}, [x19] - str dgb, [x19, #16] - frame_pop +3: st1 {dgav.4s}, [x0] + str dgb, [x0, #16] + mov w0, w2 ret SYM_FUNC_END(sha1_ce_transform) diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c index c93121bcfdeb..71fa4f1122d7 100644 --- a/arch/arm64/crypto/sha1-ce-glue.c +++ b/arch/arm64/crypto/sha1-ce-glue.c @@ -19,6 +19,7 @@ MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("sha1"); struct sha1_ce_state { struct sha1_state sst; @@ -28,14 +29,22 @@ struct sha1_ce_state { extern const u32 sha1_ce_offsetof_count; extern const u32 sha1_ce_offsetof_finalize; -asmlinkage void sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src, - int blocks); +asmlinkage int sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src, + int blocks); static void __sha1_ce_transform(struct sha1_state *sst, u8 const *src, int blocks) { - sha1_ce_transform(container_of(sst, struct sha1_ce_state, sst), src, - blocks); + while (blocks) { + int rem; + + kernel_neon_begin(); + rem = sha1_ce_transform(container_of(sst, struct sha1_ce_state, + sst), src, blocks); + kernel_neon_end(); + src += (blocks - rem) * SHA1_BLOCK_SIZE; + blocks = rem; + } } const u32 sha1_ce_offsetof_count = offsetof(struct sha1_ce_state, sst.count); @@ -50,9 +59,7 @@ static int sha1_ce_update(struct shash_desc *desc, const u8 *data, return crypto_sha1_update(desc, data, len); sctx->finalize = 0; - kernel_neon_begin(); sha1_base_do_update(desc, data, len, __sha1_ce_transform); - kernel_neon_end(); return 0; } @@ -72,11 +79,9 @@ static int sha1_ce_finup(struct shash_desc *desc, const u8 *data, */ sctx->finalize = finalize; - kernel_neon_begin(); sha1_base_do_update(desc, data, len, __sha1_ce_transform); if (!finalize) sha1_base_do_finalize(desc, __sha1_ce_transform); - kernel_neon_end(); return sha1_base_finish(desc, out); } @@ -88,9 +93,7 @@ static int sha1_ce_final(struct shash_desc *desc, u8 *out) return crypto_sha1_finup(desc, NULL, 0, out); sctx->finalize = 0; - kernel_neon_begin(); sha1_base_do_finalize(desc, __sha1_ce_transform); - kernel_neon_end(); return sha1_base_finish(desc, out); } diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S index 3f9d0f326987..6cdea7d56059 100644 --- a/arch/arm64/crypto/sha2-ce-core.S +++ b/arch/arm64/crypto/sha2-ce-core.S @@ -76,36 +76,30 @@ */ .text SYM_FUNC_START(sha2_ce_transform) - frame_push 3 - - mov x19, x0 - mov x20, x1 - mov x21, x2 - /* load round constants */ -0: adr_l x8, .Lsha2_rcon + adr_l x8, .Lsha2_rcon ld1 { v0.4s- v3.4s}, [x8], #64 ld1 { v4.4s- v7.4s}, [x8], #64 ld1 { v8.4s-v11.4s}, [x8], #64 ld1 {v12.4s-v15.4s}, [x8] /* load state */ - ld1 {dgav.4s, dgbv.4s}, [x19] + ld1 {dgav.4s, dgbv.4s}, [x0] /* load sha256_ce_state::finalize */ ldr_l w4, sha256_ce_offsetof_finalize, x4 - ldr w4, [x19, x4] + ldr w4, [x0, x4] /* load input */ -1: ld1 {v16.4s-v19.4s}, [x20], #64 - sub w21, w21, #1 +0: ld1 {v16.4s-v19.4s}, [x1], #64 + sub w2, w2, #1 CPU_LE( rev32 v16.16b, v16.16b ) CPU_LE( rev32 v17.16b, v17.16b ) CPU_LE( rev32 v18.16b, v18.16b ) CPU_LE( rev32 v19.16b, v19.16b ) -2: add t0.4s, v16.4s, v0.4s +1: add t0.4s, v16.4s, v0.4s mov dg0v.16b, dgav.16b mov dg1v.16b, dgbv.16b @@ -134,24 +128,18 @@ CPU_LE( rev32 v19.16b, v19.16b ) add dgbv.4s, dgbv.4s, dg1v.4s /* handled all input blocks? */ - cbz w21, 3f - - if_will_cond_yield_neon - st1 {dgav.4s, dgbv.4s}, [x19] - do_cond_yield_neon + cbz w2, 2f + cond_yield 3f, x5 b 0b - endif_yield_neon - - b 1b /* * Final block: add padding and total bit count. * Skip if the input size was not a round multiple of the block size, * the padding is handled by the C code in that case. */ -3: cbz x4, 4f +2: cbz x4, 3f ldr_l w4, sha256_ce_offsetof_count, x4 - ldr x4, [x19, x4] + ldr x4, [x0, x4] movi v17.2d, #0 mov x8, #0x80000000 movi v18.2d, #0 @@ -160,10 +148,10 @@ CPU_LE( rev32 v19.16b, v19.16b ) mov x4, #0 mov v19.d[0], xzr mov v19.d[1], x7 - b 2b + b 1b /* store new state */ -4: st1 {dgav.4s, dgbv.4s}, [x19] - frame_pop +3: st1 {dgav.4s, dgbv.4s}, [x0] + mov w0, w2 ret SYM_FUNC_END(sha2_ce_transform) diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c index 31ba3da5e61b..c57a6119fefc 100644 --- a/arch/arm64/crypto/sha2-ce-glue.c +++ b/arch/arm64/crypto/sha2-ce-glue.c @@ -19,6 +19,8 @@ MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("sha224"); +MODULE_ALIAS_CRYPTO("sha256"); struct sha256_ce_state { struct sha256_state sst; @@ -28,14 +30,22 @@ struct sha256_ce_state { extern const u32 sha256_ce_offsetof_count; extern const u32 sha256_ce_offsetof_finalize; -asmlinkage void sha2_ce_transform(struct sha256_ce_state *sst, u8 const *src, - int blocks); +asmlinkage int sha2_ce_transform(struct sha256_ce_state *sst, u8 const *src, + int blocks); static void __sha2_ce_transform(struct sha256_state *sst, u8 const *src, int blocks) { - sha2_ce_transform(container_of(sst, struct sha256_ce_state, sst), src, - blocks); + while (blocks) { + int rem; + + kernel_neon_begin(); + rem = sha2_ce_transform(container_of(sst, struct sha256_ce_state, + sst), src, blocks); + kernel_neon_end(); + src += (blocks - rem) * SHA256_BLOCK_SIZE; + blocks = rem; + } } const u32 sha256_ce_offsetof_count = offsetof(struct sha256_ce_state, @@ -61,9 +71,7 @@ static int sha256_ce_update(struct shash_desc *desc, const u8 *data, __sha256_block_data_order); sctx->finalize = 0; - kernel_neon_begin(); sha256_base_do_update(desc, data, len, __sha2_ce_transform); - kernel_neon_end(); return 0; } @@ -88,11 +96,9 @@ static int sha256_ce_finup(struct shash_desc *desc, const u8 *data, */ sctx->finalize = finalize; - kernel_neon_begin(); sha256_base_do_update(desc, data, len, __sha2_ce_transform); if (!finalize) sha256_base_do_finalize(desc, __sha2_ce_transform); - kernel_neon_end(); return sha256_base_finish(desc, out); } @@ -106,9 +112,7 @@ static int sha256_ce_final(struct shash_desc *desc, u8 *out) } sctx->finalize = 0; - kernel_neon_begin(); sha256_base_do_finalize(desc, __sha2_ce_transform); - kernel_neon_end(); return sha256_base_finish(desc, out); } diff --git a/arch/arm64/crypto/sha3-ce-core.S b/arch/arm64/crypto/sha3-ce-core.S index 1cfb768df350..6f5208414fe3 100644 --- a/arch/arm64/crypto/sha3-ce-core.S +++ b/arch/arm64/crypto/sha3-ce-core.S @@ -37,20 +37,13 @@ .endm /* - * sha3_ce_transform(u64 *st, const u8 *data, int blocks, int dg_size) + * int sha3_ce_transform(u64 *st, const u8 *data, int blocks, int dg_size) */ .text SYM_FUNC_START(sha3_ce_transform) - frame_push 4 - - mov x19, x0 - mov x20, x1 - mov x21, x2 - mov x22, x3 - -0: /* load state */ - add x8, x19, #32 - ld1 { v0.1d- v3.1d}, [x19] + /* load state */ + add x8, x0, #32 + ld1 { v0.1d- v3.1d}, [x0] ld1 { v4.1d- v7.1d}, [x8], #32 ld1 { v8.1d-v11.1d}, [x8], #32 ld1 {v12.1d-v15.1d}, [x8], #32 @@ -58,13 +51,13 @@ SYM_FUNC_START(sha3_ce_transform) ld1 {v20.1d-v23.1d}, [x8], #32 ld1 {v24.1d}, [x8] -1: sub w21, w21, #1 +0: sub w2, w2, #1 mov w8, #24 adr_l x9, .Lsha3_rcon /* load input */ - ld1 {v25.8b-v28.8b}, [x20], #32 - ld1 {v29.8b-v31.8b}, [x20], #24 + ld1 {v25.8b-v28.8b}, [x1], #32 + ld1 {v29.8b-v31.8b}, [x1], #24 eor v0.8b, v0.8b, v25.8b eor v1.8b, v1.8b, v26.8b eor v2.8b, v2.8b, v27.8b @@ -73,10 +66,10 @@ SYM_FUNC_START(sha3_ce_transform) eor v5.8b, v5.8b, v30.8b eor v6.8b, v6.8b, v31.8b - tbnz x22, #6, 3f // SHA3-512 + tbnz x3, #6, 2f // SHA3-512 - ld1 {v25.8b-v28.8b}, [x20], #32 - ld1 {v29.8b-v30.8b}, [x20], #16 + ld1 {v25.8b-v28.8b}, [x1], #32 + ld1 {v29.8b-v30.8b}, [x1], #16 eor v7.8b, v7.8b, v25.8b eor v8.8b, v8.8b, v26.8b eor v9.8b, v9.8b, v27.8b @@ -84,34 +77,34 @@ SYM_FUNC_START(sha3_ce_transform) eor v11.8b, v11.8b, v29.8b eor v12.8b, v12.8b, v30.8b - tbnz x22, #4, 2f // SHA3-384 or SHA3-224 + tbnz x3, #4, 1f // SHA3-384 or SHA3-224 // SHA3-256 - ld1 {v25.8b-v28.8b}, [x20], #32 + ld1 {v25.8b-v28.8b}, [x1], #32 eor v13.8b, v13.8b, v25.8b eor v14.8b, v14.8b, v26.8b eor v15.8b, v15.8b, v27.8b eor v16.8b, v16.8b, v28.8b - b 4f + b 3f -2: tbz x22, #2, 4f // bit 2 cleared? SHA-384 +1: tbz x3, #2, 3f // bit 2 cleared? SHA-384 // SHA3-224 - ld1 {v25.8b-v28.8b}, [x20], #32 - ld1 {v29.8b}, [x20], #8 + ld1 {v25.8b-v28.8b}, [x1], #32 + ld1 {v29.8b}, [x1], #8 eor v13.8b, v13.8b, v25.8b eor v14.8b, v14.8b, v26.8b eor v15.8b, v15.8b, v27.8b eor v16.8b, v16.8b, v28.8b eor v17.8b, v17.8b, v29.8b - b 4f + b 3f // SHA3-512 -3: ld1 {v25.8b-v26.8b}, [x20], #16 +2: ld1 {v25.8b-v26.8b}, [x1], #16 eor v7.8b, v7.8b, v25.8b eor v8.8b, v8.8b, v26.8b -4: sub w8, w8, #1 +3: sub w8, w8, #1 eor3 v29.16b, v4.16b, v9.16b, v14.16b eor3 v26.16b, v1.16b, v6.16b, v11.16b @@ -190,33 +183,19 @@ SYM_FUNC_START(sha3_ce_transform) eor v0.16b, v0.16b, v31.16b - cbnz w8, 4b - cbz w21, 5f - - if_will_cond_yield_neon - add x8, x19, #32 - st1 { v0.1d- v3.1d}, [x19] - st1 { v4.1d- v7.1d}, [x8], #32 - st1 { v8.1d-v11.1d}, [x8], #32 - st1 {v12.1d-v15.1d}, [x8], #32 - st1 {v16.1d-v19.1d}, [x8], #32 - st1 {v20.1d-v23.1d}, [x8], #32 - st1 {v24.1d}, [x8] - do_cond_yield_neon - b 0b - endif_yield_neon - - b 1b + cbnz w8, 3b + cond_yield 3f, x8 + cbnz w2, 0b /* save state */ -5: st1 { v0.1d- v3.1d}, [x19], #32 - st1 { v4.1d- v7.1d}, [x19], #32 - st1 { v8.1d-v11.1d}, [x19], #32 - st1 {v12.1d-v15.1d}, [x19], #32 - st1 {v16.1d-v19.1d}, [x19], #32 - st1 {v20.1d-v23.1d}, [x19], #32 - st1 {v24.1d}, [x19] - frame_pop +3: st1 { v0.1d- v3.1d}, [x0], #32 + st1 { v4.1d- v7.1d}, [x0], #32 + st1 { v8.1d-v11.1d}, [x0], #32 + st1 {v12.1d-v15.1d}, [x0], #32 + st1 {v16.1d-v19.1d}, [x0], #32 + st1 {v20.1d-v23.1d}, [x0], #32 + st1 {v24.1d}, [x0] + mov w0, w2 ret SYM_FUNC_END(sha3_ce_transform) diff --git a/arch/arm64/crypto/sha3-ce-glue.c b/arch/arm64/crypto/sha3-ce-glue.c index e5a2936f0886..8c65cecf560a 100644 --- a/arch/arm64/crypto/sha3-ce-glue.c +++ b/arch/arm64/crypto/sha3-ce-glue.c @@ -23,9 +23,13 @@ MODULE_DESCRIPTION("SHA3 secure hash using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("sha3-224"); +MODULE_ALIAS_CRYPTO("sha3-256"); +MODULE_ALIAS_CRYPTO("sha3-384"); +MODULE_ALIAS_CRYPTO("sha3-512"); -asmlinkage void sha3_ce_transform(u64 *st, const u8 *data, int blocks, - int md_len); +asmlinkage int sha3_ce_transform(u64 *st, const u8 *data, int blocks, + int md_len); static int sha3_update(struct shash_desc *desc, const u8 *data, unsigned int len) @@ -55,11 +59,15 @@ static int sha3_update(struct shash_desc *desc, const u8 *data, blocks = len / sctx->rsiz; len %= sctx->rsiz; - if (blocks) { + while (blocks) { + int rem; + kernel_neon_begin(); - sha3_ce_transform(sctx->st, data, blocks, digest_size); + rem = sha3_ce_transform(sctx->st, data, blocks, + digest_size); kernel_neon_end(); - data += blocks * sctx->rsiz; + data += (blocks - rem) * sctx->rsiz; + blocks = rem; } } diff --git a/arch/arm64/crypto/sha512-ce-core.S b/arch/arm64/crypto/sha512-ce-core.S index cde606c0323e..d6e7f6c95fa6 100644 --- a/arch/arm64/crypto/sha512-ce-core.S +++ b/arch/arm64/crypto/sha512-ce-core.S @@ -107,23 +107,17 @@ */ .text SYM_FUNC_START(sha512_ce_transform) - frame_push 3 - - mov x19, x0 - mov x20, x1 - mov x21, x2 - /* load state */ -0: ld1 {v8.2d-v11.2d}, [x19] + ld1 {v8.2d-v11.2d}, [x0] /* load first 4 round constants */ adr_l x3, .Lsha512_rcon ld1 {v20.2d-v23.2d}, [x3], #64 /* load input */ -1: ld1 {v12.2d-v15.2d}, [x20], #64 - ld1 {v16.2d-v19.2d}, [x20], #64 - sub w21, w21, #1 +0: ld1 {v12.2d-v15.2d}, [x1], #64 + ld1 {v16.2d-v19.2d}, [x1], #64 + sub w2, w2, #1 CPU_LE( rev64 v12.16b, v12.16b ) CPU_LE( rev64 v13.16b, v13.16b ) @@ -201,19 +195,12 @@ CPU_LE( rev64 v19.16b, v19.16b ) add v10.2d, v10.2d, v2.2d add v11.2d, v11.2d, v3.2d + cond_yield 3f, x4 /* handled all input blocks? */ - cbz w21, 3f - - if_will_cond_yield_neon - st1 {v8.2d-v11.2d}, [x19] - do_cond_yield_neon - b 0b - endif_yield_neon - - b 1b + cbnz w2, 0b /* store new state */ -3: st1 {v8.2d-v11.2d}, [x19] - frame_pop +3: st1 {v8.2d-v11.2d}, [x0] + mov w0, w2 ret SYM_FUNC_END(sha512_ce_transform) diff --git a/arch/arm64/crypto/sha512-ce-glue.c b/arch/arm64/crypto/sha512-ce-glue.c index faa83f6cf376..e62a094a9d52 100644 --- a/arch/arm64/crypto/sha512-ce-glue.c +++ b/arch/arm64/crypto/sha512-ce-glue.c @@ -23,12 +23,28 @@ MODULE_DESCRIPTION("SHA-384/SHA-512 secure hash using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("sha384"); +MODULE_ALIAS_CRYPTO("sha512"); -asmlinkage void sha512_ce_transform(struct sha512_state *sst, u8 const *src, - int blocks); +asmlinkage int sha512_ce_transform(struct sha512_state *sst, u8 const *src, + int blocks); asmlinkage void sha512_block_data_order(u64 *digest, u8 const *src, int blocks); +static void __sha512_ce_transform(struct sha512_state *sst, u8 const *src, + int blocks) +{ + while (blocks) { + int rem; + + kernel_neon_begin(); + rem = sha512_ce_transform(sst, src, blocks); + kernel_neon_end(); + src += (blocks - rem) * SHA512_BLOCK_SIZE; + blocks = rem; + } +} + static void __sha512_block_data_order(struct sha512_state *sst, u8 const *src, int blocks) { @@ -38,45 +54,30 @@ static void __sha512_block_data_order(struct sha512_state *sst, u8 const *src, static int sha512_ce_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - if (!crypto_simd_usable()) - return sha512_base_do_update(desc, data, len, - __sha512_block_data_order); - - kernel_neon_begin(); - sha512_base_do_update(desc, data, len, sha512_ce_transform); - kernel_neon_end(); + sha512_block_fn *fn = crypto_simd_usable() ? __sha512_ce_transform + : __sha512_block_data_order; + sha512_base_do_update(desc, data, len, fn); return 0; } static int sha512_ce_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - if (!crypto_simd_usable()) { - if (len) - sha512_base_do_update(desc, data, len, - __sha512_block_data_order); - sha512_base_do_finalize(desc, __sha512_block_data_order); - return sha512_base_finish(desc, out); - } + sha512_block_fn *fn = crypto_simd_usable() ? __sha512_ce_transform + : __sha512_block_data_order; - kernel_neon_begin(); - sha512_base_do_update(desc, data, len, sha512_ce_transform); - sha512_base_do_finalize(desc, sha512_ce_transform); - kernel_neon_end(); + sha512_base_do_update(desc, data, len, fn); + sha512_base_do_finalize(desc, fn); return sha512_base_finish(desc, out); } static int sha512_ce_final(struct shash_desc *desc, u8 *out) { - if (!crypto_simd_usable()) { - sha512_base_do_finalize(desc, __sha512_block_data_order); - return sha512_base_finish(desc, out); - } + sha512_block_fn *fn = crypto_simd_usable() ? __sha512_ce_transform + : __sha512_block_data_order; - kernel_neon_begin(); - sha512_base_do_finalize(desc, sha512_ce_transform); - kernel_neon_end(); + sha512_base_do_finalize(desc, fn); return sha512_base_finish(desc, out); } |