summaryrefslogtreecommitdiff
path: root/arch/arm64/crypto
diff options
context:
space:
mode:
Diffstat (limited to 'arch/arm64/crypto')
-rw-r--r--arch/arm64/crypto/Kconfig6
-rw-r--r--arch/arm64/crypto/Makefile3
-rw-r--r--arch/arm64/crypto/sm4-ce-core.S660
-rw-r--r--arch/arm64/crypto/sm4-ce-glue.c372
4 files changed, 1041 insertions, 0 deletions
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index d62dd54d1800..4fe7037d2347 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -53,6 +53,12 @@ config CRYPTO_SM4_ARM64_CE
select CRYPTO_ALGAPI
select CRYPTO_SM4
+config CRYPTO_SM4_ARM64_CE_BLK
+ tristate "SM4 in ECB/CBC/CFB/CTR modes using ARMv8 Crypto Extensions"
+ depends on KERNEL_MODE_NEON
+ select CRYPTO_SKCIPHER
+ select CRYPTO_LIB_SM4
+
config CRYPTO_SM4_ARM64_NEON_BLK
tristate "SM4 in ECB/CBC/CFB/CTR modes using NEON instructions"
depends on KERNEL_MODE_NEON
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index 41aee6103e78..bea8995133b1 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -23,6 +23,9 @@ sm3-ce-y := sm3-ce-glue.o sm3-ce-core.o
obj-$(CONFIG_CRYPTO_SM4_ARM64_CE) += sm4-ce-cipher.o
sm4-ce-cipher-y := sm4-ce-cipher-glue.o sm4-ce-cipher-core.o
+obj-$(CONFIG_CRYPTO_SM4_ARM64_CE_BLK) += sm4-ce.o
+sm4-ce-y := sm4-ce-glue.o sm4-ce-core.o
+
obj-$(CONFIG_CRYPTO_SM4_ARM64_NEON_BLK) += sm4-neon.o
sm4-neon-y := sm4-neon-glue.o sm4-neon-core.o
diff --git a/arch/arm64/crypto/sm4-ce-core.S b/arch/arm64/crypto/sm4-ce-core.S
new file mode 100644
index 000000000000..934e0f093279
--- /dev/null
+++ b/arch/arm64/crypto/sm4-ce-core.S
@@ -0,0 +1,660 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4 Cipher Algorithm for ARMv8 with Crypto Extensions
+ * as specified in
+ * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
+ *
+ * Copyright (C) 2022, Alibaba Group.
+ * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+.arch armv8-a+crypto
+
+.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 16, 20, 24, 25, 26, 27, 28, 29, 30, 31
+ .set .Lv\b\().4s, \b
+.endr
+
+.macro sm4e, vd, vn
+ .inst 0xcec08400 | (.L\vn << 5) | .L\vd
+.endm
+
+.macro sm4ekey, vd, vn, vm
+ .inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd
+.endm
+
+/* Register macros */
+
+#define RTMP0 v16
+#define RTMP1 v17
+#define RTMP2 v18
+#define RTMP3 v19
+
+#define RIV v20
+
+/* Helper macros. */
+
+#define PREPARE \
+ ld1 {v24.16b-v27.16b}, [x0], #64; \
+ ld1 {v28.16b-v31.16b}, [x0];
+
+#define SM4_CRYPT_BLK(b0) \
+ rev32 b0.16b, b0.16b; \
+ sm4e b0.4s, v24.4s; \
+ sm4e b0.4s, v25.4s; \
+ sm4e b0.4s, v26.4s; \
+ sm4e b0.4s, v27.4s; \
+ sm4e b0.4s, v28.4s; \
+ sm4e b0.4s, v29.4s; \
+ sm4e b0.4s, v30.4s; \
+ sm4e b0.4s, v31.4s; \
+ rev64 b0.4s, b0.4s; \
+ ext b0.16b, b0.16b, b0.16b, #8; \
+ rev32 b0.16b, b0.16b;
+
+#define SM4_CRYPT_BLK4(b0, b1, b2, b3) \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ rev32 b3.16b, b3.16b; \
+ sm4e b0.4s, v24.4s; \
+ sm4e b1.4s, v24.4s; \
+ sm4e b2.4s, v24.4s; \
+ sm4e b3.4s, v24.4s; \
+ sm4e b0.4s, v25.4s; \
+ sm4e b1.4s, v25.4s; \
+ sm4e b2.4s, v25.4s; \
+ sm4e b3.4s, v25.4s; \
+ sm4e b0.4s, v26.4s; \
+ sm4e b1.4s, v26.4s; \
+ sm4e b2.4s, v26.4s; \
+ sm4e b3.4s, v26.4s; \
+ sm4e b0.4s, v27.4s; \
+ sm4e b1.4s, v27.4s; \
+ sm4e b2.4s, v27.4s; \
+ sm4e b3.4s, v27.4s; \
+ sm4e b0.4s, v28.4s; \
+ sm4e b1.4s, v28.4s; \
+ sm4e b2.4s, v28.4s; \
+ sm4e b3.4s, v28.4s; \
+ sm4e b0.4s, v29.4s; \
+ sm4e b1.4s, v29.4s; \
+ sm4e b2.4s, v29.4s; \
+ sm4e b3.4s, v29.4s; \
+ sm4e b0.4s, v30.4s; \
+ sm4e b1.4s, v30.4s; \
+ sm4e b2.4s, v30.4s; \
+ sm4e b3.4s, v30.4s; \
+ sm4e b0.4s, v31.4s; \
+ sm4e b1.4s, v31.4s; \
+ sm4e b2.4s, v31.4s; \
+ sm4e b3.4s, v31.4s; \
+ rev64 b0.4s, b0.4s; \
+ rev64 b1.4s, b1.4s; \
+ rev64 b2.4s, b2.4s; \
+ rev64 b3.4s, b3.4s; \
+ ext b0.16b, b0.16b, b0.16b, #8; \
+ ext b1.16b, b1.16b, b1.16b, #8; \
+ ext b2.16b, b2.16b, b2.16b, #8; \
+ ext b3.16b, b3.16b, b3.16b, #8; \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ rev32 b3.16b, b3.16b;
+
+#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ rev32 b3.16b, b3.16b; \
+ rev32 b4.16b, b4.16b; \
+ rev32 b5.16b, b5.16b; \
+ rev32 b6.16b, b6.16b; \
+ rev32 b7.16b, b7.16b; \
+ sm4e b0.4s, v24.4s; \
+ sm4e b1.4s, v24.4s; \
+ sm4e b2.4s, v24.4s; \
+ sm4e b3.4s, v24.4s; \
+ sm4e b4.4s, v24.4s; \
+ sm4e b5.4s, v24.4s; \
+ sm4e b6.4s, v24.4s; \
+ sm4e b7.4s, v24.4s; \
+ sm4e b0.4s, v25.4s; \
+ sm4e b1.4s, v25.4s; \
+ sm4e b2.4s, v25.4s; \
+ sm4e b3.4s, v25.4s; \
+ sm4e b4.4s, v25.4s; \
+ sm4e b5.4s, v25.4s; \
+ sm4e b6.4s, v25.4s; \
+ sm4e b7.4s, v25.4s; \
+ sm4e b0.4s, v26.4s; \
+ sm4e b1.4s, v26.4s; \
+ sm4e b2.4s, v26.4s; \
+ sm4e b3.4s, v26.4s; \
+ sm4e b4.4s, v26.4s; \
+ sm4e b5.4s, v26.4s; \
+ sm4e b6.4s, v26.4s; \
+ sm4e b7.4s, v26.4s; \
+ sm4e b0.4s, v27.4s; \
+ sm4e b1.4s, v27.4s; \
+ sm4e b2.4s, v27.4s; \
+ sm4e b3.4s, v27.4s; \
+ sm4e b4.4s, v27.4s; \
+ sm4e b5.4s, v27.4s; \
+ sm4e b6.4s, v27.4s; \
+ sm4e b7.4s, v27.4s; \
+ sm4e b0.4s, v28.4s; \
+ sm4e b1.4s, v28.4s; \
+ sm4e b2.4s, v28.4s; \
+ sm4e b3.4s, v28.4s; \
+ sm4e b4.4s, v28.4s; \
+ sm4e b5.4s, v28.4s; \
+ sm4e b6.4s, v28.4s; \
+ sm4e b7.4s, v28.4s; \
+ sm4e b0.4s, v29.4s; \
+ sm4e b1.4s, v29.4s; \
+ sm4e b2.4s, v29.4s; \
+ sm4e b3.4s, v29.4s; \
+ sm4e b4.4s, v29.4s; \
+ sm4e b5.4s, v29.4s; \
+ sm4e b6.4s, v29.4s; \
+ sm4e b7.4s, v29.4s; \
+ sm4e b0.4s, v30.4s; \
+ sm4e b1.4s, v30.4s; \
+ sm4e b2.4s, v30.4s; \
+ sm4e b3.4s, v30.4s; \
+ sm4e b4.4s, v30.4s; \
+ sm4e b5.4s, v30.4s; \
+ sm4e b6.4s, v30.4s; \
+ sm4e b7.4s, v30.4s; \
+ sm4e b0.4s, v31.4s; \
+ sm4e b1.4s, v31.4s; \
+ sm4e b2.4s, v31.4s; \
+ sm4e b3.4s, v31.4s; \
+ sm4e b4.4s, v31.4s; \
+ sm4e b5.4s, v31.4s; \
+ sm4e b6.4s, v31.4s; \
+ sm4e b7.4s, v31.4s; \
+ rev64 b0.4s, b0.4s; \
+ rev64 b1.4s, b1.4s; \
+ rev64 b2.4s, b2.4s; \
+ rev64 b3.4s, b3.4s; \
+ rev64 b4.4s, b4.4s; \
+ rev64 b5.4s, b5.4s; \
+ rev64 b6.4s, b6.4s; \
+ rev64 b7.4s, b7.4s; \
+ ext b0.16b, b0.16b, b0.16b, #8; \
+ ext b1.16b, b1.16b, b1.16b, #8; \
+ ext b2.16b, b2.16b, b2.16b, #8; \
+ ext b3.16b, b3.16b, b3.16b, #8; \
+ ext b4.16b, b4.16b, b4.16b, #8; \
+ ext b5.16b, b5.16b, b5.16b, #8; \
+ ext b6.16b, b6.16b, b6.16b, #8; \
+ ext b7.16b, b7.16b, b7.16b, #8; \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ rev32 b3.16b, b3.16b; \
+ rev32 b4.16b, b4.16b; \
+ rev32 b5.16b, b5.16b; \
+ rev32 b6.16b, b6.16b; \
+ rev32 b7.16b, b7.16b;
+
+
+.align 3
+SYM_FUNC_START(sm4_ce_expand_key)
+ /* input:
+ * x0: 128-bit key
+ * x1: rkey_enc
+ * x2: rkey_dec
+ * x3: fk array
+ * x4: ck array
+ */
+ ld1 {v0.16b}, [x0];
+ rev32 v0.16b, v0.16b;
+ ld1 {v1.16b}, [x3];
+ /* load ck */
+ ld1 {v24.16b-v27.16b}, [x4], #64;
+ ld1 {v28.16b-v31.16b}, [x4];
+
+ /* input ^ fk */
+ eor v0.16b, v0.16b, v1.16b;
+
+ sm4ekey v0.4s, v0.4s, v24.4s;
+ sm4ekey v1.4s, v0.4s, v25.4s;
+ sm4ekey v2.4s, v1.4s, v26.4s;
+ sm4ekey v3.4s, v2.4s, v27.4s;
+ sm4ekey v4.4s, v3.4s, v28.4s;
+ sm4ekey v5.4s, v4.4s, v29.4s;
+ sm4ekey v6.4s, v5.4s, v30.4s;
+ sm4ekey v7.4s, v6.4s, v31.4s;
+
+ st1 {v0.16b-v3.16b}, [x1], #64;
+ st1 {v4.16b-v7.16b}, [x1];
+ rev64 v7.4s, v7.4s;
+ rev64 v6.4s, v6.4s;
+ rev64 v5.4s, v5.4s;
+ rev64 v4.4s, v4.4s;
+ rev64 v3.4s, v3.4s;
+ rev64 v2.4s, v2.4s;
+ rev64 v1.4s, v1.4s;
+ rev64 v0.4s, v0.4s;
+ ext v7.16b, v7.16b, v7.16b, #8;
+ ext v6.16b, v6.16b, v6.16b, #8;
+ ext v5.16b, v5.16b, v5.16b, #8;
+ ext v4.16b, v4.16b, v4.16b, #8;
+ ext v3.16b, v3.16b, v3.16b, #8;
+ ext v2.16b, v2.16b, v2.16b, #8;
+ ext v1.16b, v1.16b, v1.16b, #8;
+ ext v0.16b, v0.16b, v0.16b, #8;
+ st1 {v7.16b}, [x2], #16;
+ st1 {v6.16b}, [x2], #16;
+ st1 {v5.16b}, [x2], #16;
+ st1 {v4.16b}, [x2], #16;
+ st1 {v3.16b}, [x2], #16;
+ st1 {v2.16b}, [x2], #16;
+ st1 {v1.16b}, [x2], #16;
+ st1 {v0.16b}, [x2];
+
+ ret;
+SYM_FUNC_END(sm4_ce_expand_key)
+
+.align 3
+SYM_FUNC_START(sm4_ce_crypt_block)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ */
+ PREPARE;
+
+ ld1 {v0.16b}, [x2];
+ SM4_CRYPT_BLK(v0);
+ st1 {v0.16b}, [x1];
+
+ ret;
+SYM_FUNC_END(sm4_ce_crypt_block)
+
+.align 3
+SYM_FUNC_START(sm4_ce_crypt)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * w3: nblocks
+ */
+ PREPARE;
+
+.Lcrypt_loop_blk:
+ sub w3, w3, #8;
+ tbnz w3, #31, .Lcrypt_tail8;
+
+ ld1 {v0.16b-v3.16b}, [x2], #64;
+ ld1 {v4.16b-v7.16b}, [x2], #64;
+
+ SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+ st1 {v0.16b-v3.16b}, [x1], #64;
+ st1 {v4.16b-v7.16b}, [x1], #64;
+
+ cbz w3, .Lcrypt_end;
+ b .Lcrypt_loop_blk;
+
+.Lcrypt_tail8:
+ add w3, w3, #8;
+ cmp w3, #4;
+ blt .Lcrypt_tail4;
+
+ sub w3, w3, #4;
+
+ ld1 {v0.16b-v3.16b}, [x2], #64;
+ SM4_CRYPT_BLK4(v0, v1, v2, v3);
+ st1 {v0.16b-v3.16b}, [x1], #64;
+
+ cbz w3, .Lcrypt_end;
+
+.Lcrypt_tail4:
+ sub w3, w3, #1;
+
+ ld1 {v0.16b}, [x2], #16;
+ SM4_CRYPT_BLK(v0);
+ st1 {v0.16b}, [x1], #16;
+
+ cbnz w3, .Lcrypt_tail4;
+
+.Lcrypt_end:
+ ret;
+SYM_FUNC_END(sm4_ce_crypt)
+
+.align 3
+SYM_FUNC_START(sm4_ce_cbc_enc)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: iv (big endian, 128 bit)
+ * w4: nblocks
+ */
+ PREPARE;
+
+ ld1 {RIV.16b}, [x3];
+
+.Lcbc_enc_loop:
+ sub w4, w4, #1;
+
+ ld1 {RTMP0.16b}, [x2], #16;
+ eor RIV.16b, RIV.16b, RTMP0.16b;
+
+ SM4_CRYPT_BLK(RIV);
+
+ st1 {RIV.16b}, [x1], #16;
+
+ cbnz w4, .Lcbc_enc_loop;
+
+ /* store new IV */
+ st1 {RIV.16b}, [x3];
+
+ ret;
+SYM_FUNC_END(sm4_ce_cbc_enc)
+
+.align 3
+SYM_FUNC_START(sm4_ce_cbc_dec)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: iv (big endian, 128 bit)
+ * w4: nblocks
+ */
+ PREPARE;
+
+ ld1 {RIV.16b}, [x3];
+
+.Lcbc_loop_blk:
+ sub w4, w4, #8;
+ tbnz w4, #31, .Lcbc_tail8;
+
+ ld1 {v0.16b-v3.16b}, [x2], #64;
+ ld1 {v4.16b-v7.16b}, [x2];
+
+ SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+ sub x2, x2, #64;
+ eor v0.16b, v0.16b, RIV.16b;
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v1.16b, v1.16b, RTMP0.16b;
+ eor v2.16b, v2.16b, RTMP1.16b;
+ eor v3.16b, v3.16b, RTMP2.16b;
+ st1 {v0.16b-v3.16b}, [x1], #64;
+
+ eor v4.16b, v4.16b, RTMP3.16b;
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v5.16b, v5.16b, RTMP0.16b;
+ eor v6.16b, v6.16b, RTMP1.16b;
+ eor v7.16b, v7.16b, RTMP2.16b;
+
+ mov RIV.16b, RTMP3.16b;
+ st1 {v4.16b-v7.16b}, [x1], #64;
+
+ cbz w4, .Lcbc_end;
+ b .Lcbc_loop_blk;
+
+.Lcbc_tail8:
+ add w4, w4, #8;
+ cmp w4, #4;
+ blt .Lcbc_tail4;
+
+ sub w4, w4, #4;
+
+ ld1 {v0.16b-v3.16b}, [x2];
+
+ SM4_CRYPT_BLK4(v0, v1, v2, v3);
+
+ eor v0.16b, v0.16b, RIV.16b;
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v1.16b, v1.16b, RTMP0.16b;
+ eor v2.16b, v2.16b, RTMP1.16b;
+ eor v3.16b, v3.16b, RTMP2.16b;
+
+ mov RIV.16b, RTMP3.16b;
+ st1 {v0.16b-v3.16b}, [x1], #64;
+
+ cbz w4, .Lcbc_end;
+
+.Lcbc_tail4:
+ sub w4, w4, #1;
+
+ ld1 {v0.16b}, [x2];
+
+ SM4_CRYPT_BLK(v0);
+
+ eor v0.16b, v0.16b, RIV.16b;
+ ld1 {RIV.16b}, [x2], #16;
+ st1 {v0.16b}, [x1], #16;
+
+ cbnz w4, .Lcbc_tail4;
+
+.Lcbc_end:
+ /* store new IV */
+ st1 {RIV.16b}, [x3];
+
+ ret;
+SYM_FUNC_END(sm4_ce_cbc_dec)
+
+.align 3
+SYM_FUNC_START(sm4_ce_cfb_enc)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: iv (big endian, 128 bit)
+ * w4: nblocks
+ */
+ PREPARE;
+
+ ld1 {RIV.16b}, [x3];
+
+.Lcfb_enc_loop:
+ sub w4, w4, #1;
+
+ SM4_CRYPT_BLK(RIV);
+
+ ld1 {RTMP0.16b}, [x2], #16;
+ eor RIV.16b, RIV.16b, RTMP0.16b;
+ st1 {RIV.16b}, [x1], #16;
+
+ cbnz w4, .Lcfb_enc_loop;
+
+ /* store new IV */
+ st1 {RIV.16b}, [x3];
+
+ ret;
+SYM_FUNC_END(sm4_ce_cfb_enc)
+
+.align 3
+SYM_FUNC_START(sm4_ce_cfb_dec)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: iv (big endian, 128 bit)
+ * w4: nblocks
+ */
+ PREPARE;
+
+ ld1 {v0.16b}, [x3];
+
+.Lcfb_loop_blk:
+ sub w4, w4, #8;
+ tbnz w4, #31, .Lcfb_tail8;
+
+ ld1 {v1.16b, v2.16b, v3.16b}, [x2], #48;
+ ld1 {v4.16b-v7.16b}, [x2];
+
+ SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+ sub x2, x2, #48;
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v0.16b, v0.16b, RTMP0.16b;
+ eor v1.16b, v1.16b, RTMP1.16b;
+ eor v2.16b, v2.16b, RTMP2.16b;
+ eor v3.16b, v3.16b, RTMP3.16b;
+ st1 {v0.16b-v3.16b}, [x1], #64;
+
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v4.16b, v4.16b, RTMP0.16b;
+ eor v5.16b, v5.16b, RTMP1.16b;
+ eor v6.16b, v6.16b, RTMP2.16b;
+ eor v7.16b, v7.16b, RTMP3.16b;
+ st1 {v4.16b-v7.16b}, [x1], #64;
+
+ mov v0.16b, RTMP3.16b;
+
+ cbz w4, .Lcfb_end;
+ b .Lcfb_loop_blk;
+
+.Lcfb_tail8:
+ add w4, w4, #8;
+ cmp w4, #4;
+ blt .Lcfb_tail4;
+
+ sub w4, w4, #4;
+
+ ld1 {v1.16b, v2.16b, v3.16b}, [x2];
+
+ SM4_CRYPT_BLK4(v0, v1, v2, v3);
+
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v0.16b, v0.16b, RTMP0.16b;
+ eor v1.16b, v1.16b, RTMP1.16b;
+ eor v2.16b, v2.16b, RTMP2.16b;
+ eor v3.16b, v3.16b, RTMP3.16b;
+ st1 {v0.16b-v3.16b}, [x1], #64;
+
+ mov v0.16b, RTMP3.16b;
+
+ cbz w4, .Lcfb_end;
+
+.Lcfb_tail4:
+ sub w4, w4, #1;
+
+ SM4_CRYPT_BLK(v0);
+
+ ld1 {RTMP0.16b}, [x2], #16;
+ eor v0.16b, v0.16b, RTMP0.16b;
+ st1 {v0.16b}, [x1], #16;
+
+ mov v0.16b, RTMP0.16b;
+
+ cbnz w4, .Lcfb_tail4;
+
+.Lcfb_end:
+ /* store new IV */
+ st1 {v0.16b}, [x3];
+
+ ret;
+SYM_FUNC_END(sm4_ce_cfb_dec)
+
+.align 3
+SYM_FUNC_START(sm4_ce_ctr_enc)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: ctr (big endian, 128 bit)
+ * w4: nblocks
+ */
+ PREPARE;
+
+ ldp x7, x8, [x3];
+ rev x7, x7;
+ rev x8, x8;
+
+.Lctr_loop_blk:
+ sub w4, w4, #8;
+ tbnz w4, #31, .Lctr_tail8;
+
+#define inc_le128(vctr) \
+ mov vctr.d[1], x8; \
+ mov vctr.d[0], x7; \
+ adds x8, x8, #1; \
+ adc x7, x7, xzr; \
+ rev64 vctr.16b, vctr.16b;
+
+ /* construct CTRs */
+ inc_le128(v0); /* +0 */
+ inc_le128(v1); /* +1 */
+ inc_le128(v2); /* +2 */
+ inc_le128(v3); /* +3 */
+ inc_le128(v4); /* +4 */
+ inc_le128(v5); /* +5 */
+ inc_le128(v6); /* +6 */
+ inc_le128(v7); /* +7 */
+
+ SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v0.16b, v0.16b, RTMP0.16b;
+ eor v1.16b, v1.16b, RTMP1.16b;
+ eor v2.16b, v2.16b, RTMP2.16b;
+ eor v3.16b, v3.16b, RTMP3.16b;
+ st1 {v0.16b-v3.16b}, [x1], #64;
+
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v4.16b, v4.16b, RTMP0.16b;
+ eor v5.16b, v5.16b, RTMP1.16b;
+ eor v6.16b, v6.16b, RTMP2.16b;
+ eor v7.16b, v7.16b, RTMP3.16b;
+ st1 {v4.16b-v7.16b}, [x1], #64;
+
+ cbz w4, .Lctr_end;
+ b .Lctr_loop_blk;
+
+.Lctr_tail8:
+ add w4, w4, #8;
+ cmp w4, #4;
+ blt .Lctr_tail4;
+
+ sub w4, w4, #4;
+
+ /* construct CTRs */
+ inc_le128(v0); /* +0 */
+ inc_le128(v1); /* +1 */
+ inc_le128(v2); /* +2 */
+ inc_le128(v3); /* +3 */
+
+ SM4_CRYPT_BLK4(v0, v1, v2, v3);
+
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v0.16b, v0.16b, RTMP0.16b;
+ eor v1.16b, v1.16b, RTMP1.16b;
+ eor v2.16b, v2.16b, RTMP2.16b;
+ eor v3.16b, v3.16b, RTMP3.16b;
+ st1 {v0.16b-v3.16b}, [x1], #64;
+
+ cbz w4, .Lctr_end;
+
+.Lctr_tail4:
+ sub w4, w4, #1;
+
+ /* construct CTRs */
+ inc_le128(v0);
+
+ SM4_CRYPT_BLK(v0);
+
+ ld1 {RTMP0.16b}, [x2], #16;
+ eor v0.16b, v0.16b, RTMP0.16b;
+ st1 {v0.16b}, [x1], #16;
+
+ cbnz w4, .Lctr_tail4;
+
+.Lctr_end:
+ /* store new CTR */
+ rev x7, x7;
+ rev x8, x8;
+ stp x7, x8, [x3];
+
+ ret;
+SYM_FUNC_END(sm4_ce_ctr_enc)
diff --git a/arch/arm64/crypto/sm4-ce-glue.c b/arch/arm64/crypto/sm4-ce-glue.c
new file mode 100644
index 000000000000..496d55c0d01a
--- /dev/null
+++ b/arch/arm64/crypto/sm4-ce-glue.c
@@ -0,0 +1,372 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4 Cipher Algorithm, using ARMv8 Crypto Extensions
+ * as specified in
+ * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
+ *
+ * Copyright (C) 2022, Alibaba Group.
+ * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+#include <linux/module.h>
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+#include <linux/cpufeature.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/sm4.h>
+
+#define BYTES2BLKS(nbytes) ((nbytes) >> 4)
+
+asmlinkage void sm4_ce_expand_key(const u8 *key, u32 *rkey_enc, u32 *rkey_dec,
+ const u32 *fk, const u32 *ck);
+asmlinkage void sm4_ce_crypt_block(const u32 *rkey, u8 *dst, const u8 *src);
+asmlinkage void sm4_ce_crypt(const u32 *rkey, u8 *dst, const u8 *src,
+ unsigned int nblks);
+asmlinkage void sm4_ce_cbc_enc(const u32 *rkey, u8 *dst, const u8 *src,
+ u8 *iv, unsigned int nblks);
+asmlinkage void sm4_ce_cbc_dec(const u32 *rkey, u8 *dst, const u8 *src,
+ u8 *iv, unsigned int nblks);
+asmlinkage void sm4_ce_cfb_enc(const u32 *rkey, u8 *dst, const u8 *src,
+ u8 *iv, unsigned int nblks);
+asmlinkage void sm4_ce_cfb_dec(const u32 *rkey, u8 *dst, const u8 *src,
+ u8 *iv, unsigned int nblks);
+asmlinkage void sm4_ce_ctr_enc(const u32 *rkey, u8 *dst, const u8 *src,
+ u8 *iv, unsigned int nblks);
+
+static int sm4_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int key_len)
+{
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ if (key_len != SM4_KEY_SIZE)
+ return -EINVAL;
+
+ sm4_ce_expand_key(key, ctx->rkey_enc, ctx->rkey_dec,
+ crypto_sm4_fk, crypto_sm4_ck);
+ return 0;
+}
+
+static int sm4_ecb_do_crypt(struct skcipher_request *req, const u32 *rkey)
+{
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int nblks;
+
+ kernel_neon_begin();
+
+ nblks = BYTES2BLKS(nbytes);
+ if (nblks) {
+ sm4_ce_crypt(rkey, dst, src, nblks);
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ kernel_neon_end();
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int sm4_ecb_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return sm4_ecb_do_crypt(req, ctx->rkey_enc);
+}
+
+static int sm4_ecb_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return sm4_ecb_do_crypt(req, ctx->rkey_dec);
+}
+
+static int sm4_cbc_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int nblks;
+
+ kernel_neon_begin();
+
+ nblks = BYTES2BLKS(nbytes);
+ if (nblks) {
+ sm4_ce_cbc_enc(ctx->rkey_enc, dst, src, walk.iv, nblks);
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ kernel_neon_end();
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int sm4_cbc_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int nblks;
+
+ kernel_neon_begin();
+
+ nblks = BYTES2BLKS(nbytes);
+ if (nblks) {
+ sm4_ce_cbc_dec(ctx->rkey_dec, dst, src, walk.iv, nblks);
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ kernel_neon_end();
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int sm4_cfb_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int nblks;
+
+ kernel_neon_begin();
+
+ nblks = BYTES2BLKS(nbytes);
+ if (nblks) {
+ sm4_ce_cfb_enc(ctx->rkey_enc, dst, src, walk.iv, nblks);
+ dst += nblks * SM4_BLOCK_SIZE;
+ src += nblks * SM4_BLOCK_SIZE;
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ /* tail */
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ u8 keystream[SM4_BLOCK_SIZE];
+
+ sm4_ce_crypt_block(ctx->rkey_enc, keystream, walk.iv);
+ crypto_xor_cpy(dst, src, keystream, nbytes);
+ nbytes = 0;
+ }
+
+ kernel_neon_end();
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int sm4_cfb_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int nblks;
+
+ kernel_neon_begin();
+
+ nblks = BYTES2BLKS(nbytes);
+ if (nblks) {
+ sm4_ce_cfb_dec(ctx->rkey_enc, dst, src, walk.iv, nblks);
+ dst += nblks * SM4_BLOCK_SIZE;
+ src += nblks * SM4_BLOCK_SIZE;
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ /* tail */
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ u8 keystream[SM4_BLOCK_SIZE];
+
+ sm4_ce_crypt_block(ctx->rkey_enc, keystream, walk.iv);
+ crypto_xor_cpy(dst, src, keystream, nbytes);
+ nbytes = 0;
+ }
+
+ kernel_neon_end();
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int sm4_ctr_crypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int nblks;
+
+ kernel_neon_begin();
+
+ nblks = BYTES2BLKS(nbytes);
+ if (nblks) {
+ sm4_ce_ctr_enc(ctx->rkey_enc, dst, src, walk.iv, nblks);
+ dst += nblks * SM4_BLOCK_SIZE;
+ src += nblks * SM4_BLOCK_SIZE;
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ /* tail */
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ u8 keystream[SM4_BLOCK_SIZE];
+
+ sm4_ce_crypt_block(ctx->rkey_enc, keystream, walk.iv);
+ crypto_inc(walk.iv, SM4_BLOCK_SIZE);
+ crypto_xor_cpy(dst, src, keystream, nbytes);
+ nbytes = 0;
+ }
+
+ kernel_neon_end();
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static struct skcipher_alg sm4_algs[] = {
+ {
+ .base = {
+ .cra_name = "ecb(sm4)",
+ .cra_driver_name = "ecb-sm4-ce",
+ .cra_priority = 400,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .setkey = sm4_setkey,
+ .encrypt = sm4_ecb_encrypt,
+ .decrypt = sm4_ecb_decrypt,
+ }, {
+ .base = {
+ .cra_name = "cbc(sm4)",
+ .cra_driver_name = "cbc-sm4-ce",
+ .cra_priority = 400,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .setkey = sm4_setkey,
+ .encrypt = sm4_cbc_encrypt,
+ .decrypt = sm4_cbc_decrypt,
+ }, {
+ .base = {
+ .cra_name = "cfb(sm4)",
+ .cra_driver_name = "cfb-sm4-ce",
+ .cra_priority = 400,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .chunksize = SM4_BLOCK_SIZE,
+ .setkey = sm4_setkey,
+ .encrypt = sm4_cfb_encrypt,
+ .decrypt = sm4_cfb_decrypt,
+ }, {
+ .base = {
+ .cra_name = "ctr(sm4)",
+ .cra_driver_name = "ctr-sm4-ce",
+ .cra_priority = 400,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .chunksize = SM4_BLOCK_SIZE,
+ .setkey = sm4_setkey,
+ .encrypt = sm4_ctr_crypt,
+ .decrypt = sm4_ctr_crypt,
+ }
+};
+
+static int __init sm4_init(void)
+{
+ return crypto_register_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs));
+}
+
+static void __exit sm4_exit(void)
+{
+ crypto_unregister_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs));
+}
+
+module_cpu_feature_match(SM4, sm4_init);
+module_exit(sm4_exit);
+
+MODULE_DESCRIPTION("SM4 ECB/CBC/CFB/CTR using ARMv8 Crypto Extensions");
+MODULE_ALIAS_CRYPTO("sm4-ce");
+MODULE_ALIAS_CRYPTO("sm4");
+MODULE_ALIAS_CRYPTO("ecb(sm4)");
+MODULE_ALIAS_CRYPTO("cbc(sm4)");
+MODULE_ALIAS_CRYPTO("cfb(sm4)");
+MODULE_ALIAS_CRYPTO("ctr(sm4)");
+MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>");
+MODULE_LICENSE("GPL v2");