// SPDX-License-Identifier: GPL-2.0
/*
 * ARM64 NEON-accelerated implementation of Speck128-XTS and Speck64-XTS
 *
 * Copyright (c) 2018 Google, Inc
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

	.text

	// arguments
	ROUND_KEYS	.req	x0	// const {u64,u32} *round_keys
	NROUNDS		.req	w1	// int nrounds
	NROUNDS_X	.req	x1
	DST		.req	x2	// void *dst
	SRC		.req	x3	// const void *src
	NBYTES		.req	w4	// unsigned int nbytes
	TWEAK		.req	x5	// void *tweak

	// registers which hold the data being encrypted/decrypted
	// (underscores avoid a naming collision with ARM64 registers x0-x3)
	X_0		.req	v0
	Y_0		.req	v1
	X_1		.req	v2
	Y_1		.req	v3
	X_2		.req	v4
	Y_2		.req	v5
	X_3		.req	v6
	Y_3		.req	v7

	// the round key, duplicated in all lanes
	ROUND_KEY	.req	v8

	// index vector for tbl-based 8-bit rotates
	ROTATE_TABLE	.req	v9
	ROTATE_TABLE_Q	.req	q9

	// temporary registers
	TMP0		.req	v10
	TMP1		.req	v11
	TMP2		.req	v12
	TMP3		.req	v13

	// multiplication table for updating XTS tweaks
	GFMUL_TABLE	.req	v14
	GFMUL_TABLE_Q	.req	q14

	// next XTS tweak value(s)
	TWEAKV_NEXT	.req	v15

	// XTS tweaks for the blocks currently being encrypted/decrypted
	TWEAKV0		.req	v16
	TWEAKV1		.req	v17
	TWEAKV2		.req	v18
	TWEAKV3		.req	v19
	TWEAKV4		.req	v20
	TWEAKV5		.req	v21
	TWEAKV6		.req	v22
	TWEAKV7		.req	v23

	.align		4
.Lror64_8_table:
	.octa		0x080f0e0d0c0b0a090007060504030201
.Lror32_8_table:
	.octa		0x0c0f0e0d080b0a090407060500030201
.Lrol64_8_table:
	.octa		0x0e0d0c0b0a09080f0605040302010007
.Lrol32_8_table:
	.octa		0x0e0d0c0f0a09080b0605040702010003
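
/*
 * How the index vectors above work, as an informal note: tbl replaces
 * destination byte j with source byte index[j].  With the little-endian
 * lane layout used here, .Lror64_8_table maps byte j of each 64-bit lane
 * to byte (j + 1) % 8 of the same lane, i.e. a rotate right by 8 bits;
 * the rol tables use (j - 1) % 8 instead, and the 32-bit tables apply the
 * same pattern within each 4-byte lane.
 */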
.Lgf128mul_table:
	.octa		0x00000000000000870000000000000001
.Lgf64mul_table:
	.octa		0x0000000000000000000000002d361b00
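
/*
 * Reduction constants, noted for reference: 0x87 encodes the low terms
 * x^7 + x^2 + x + 1 of the GF(2^128) XTS polynomial, and 0x1b encodes
 * x^4 + x^3 + x + 1 for GF(2^64).  .Lgf64mul_table holds the four
 * possible reductions of a multiply by x^2, indexed by the tweak's top
 * two bits: 0x00, 0x1b, 0x36 (= 0x1b << 1), and 0x2d (= 0x1b ^ 0x36).
 */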

/*
 * _speck_round_128bytes() - Speck encryption round on 128 bytes at a time
 *
 * Do one Speck encryption round on the 128 bytes (8 blocks for Speck128, 16 for
 * Speck64) stored in X_0-X_3 and Y_0-Y_3, using the round key stored in all lanes
 * of ROUND_KEY.  'n' is the lane size: 64 for Speck128, or 32 for Speck64.
 * 'lanes' is the lane specifier: "2d" for Speck128 or "4s" for Speck64.
 */
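/*
 * Per-lane scalar equivalent, as an informal sketch ('ror'/'rol' being
 * n-bit rotates, 'k' the round key):
 *
 *	x = ror(x, 8);
 *	x += y;
 *	x ^= k;
 *	y = rol(y, 3);
 *	y ^= x;
 */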
.macro _speck_round_128bytes	n, lanes

	// x = ror(x, 8)
	tbl		X_0.16b, {X_0.16b}, ROTATE_TABLE.16b
	tbl		X_1.16b, {X_1.16b}, ROTATE_TABLE.16b
	tbl		X_2.16b, {X_2.16b}, ROTATE_TABLE.16b
	tbl		X_3.16b, {X_3.16b}, ROTATE_TABLE.16b

	// x += y
	add		X_0.\lanes, X_0.\lanes, Y_0.\lanes
	add		X_1.\lanes, X_1.\lanes, Y_1.\lanes
	add		X_2.\lanes, X_2.\lanes, Y_2.\lanes
	add		X_3.\lanes, X_3.\lanes, Y_3.\lanes

	// x ^= k
	eor		X_0.16b, X_0.16b, ROUND_KEY.16b
	eor		X_1.16b, X_1.16b, ROUND_KEY.16b
	eor		X_2.16b, X_2.16b, ROUND_KEY.16b
	eor		X_3.16b, X_3.16b, ROUND_KEY.16b

	// y = rol(y, 3)
	shl		TMP0.\lanes, Y_0.\lanes, #3
	shl		TMP1.\lanes, Y_1.\lanes, #3
	shl		TMP2.\lanes, Y_2.\lanes, #3
	shl		TMP3.\lanes, Y_3.\lanes, #3
	sri		TMP0.\lanes, Y_0.\lanes, #(\n - 3)
	sri		TMP1.\lanes, Y_1.\lanes, #(\n - 3)
	sri		TMP2.\lanes, Y_2.\lanes, #(\n - 3)
	sri		TMP3.\lanes, Y_3.\lanes, #(\n - 3)

	// y ^= x
	eor		Y_0.16b, TMP0.16b, X_0.16b
	eor		Y_1.16b, TMP1.16b, X_1.16b
	eor		Y_2.16b, TMP2.16b, X_2.16b
	eor		Y_3.16b, TMP3.16b, X_3.16b
.endm

/*
 * _speck_unround_128bytes() - Speck decryption round on 128 bytes at a time
 *
 * This is the inverse of _speck_round_128bytes().
 */
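/*
 * Per-lane scalar equivalent, as an informal sketch (the steps of the
 * encryption round undone in reverse order):
 *
 *	y ^= x;
 *	y = ror(y, 3);
 *	x ^= k;
 *	x -= y;
 *	x = rol(x, 8);
 */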
.macro _speck_unround_128bytes	n, lanes

	// y ^= x
	eor		TMP0.16b, Y_0.16b, X_0.16b
	eor		TMP1.16b, Y_1.16b, X_1.16b
	eor		TMP2.16b, Y_2.16b, X_2.16b
	eor		TMP3.16b, Y_3.16b, X_3.16b

	// y = ror(y, 3)
	ushr		Y_0.\lanes, TMP0.\lanes, #3
	ushr		Y_1.\lanes, TMP1.\lanes, #3
	ushr		Y_2.\lanes, TMP2.\lanes, #3
	ushr		Y_3.\lanes, TMP3.\lanes, #3
	sli		Y_0.\lanes, TMP0.\lanes, #(\n - 3)
	sli		Y_1.\lanes, TMP1.\lanes, #(\n - 3)
	sli		Y_2.\lanes, TMP2.\lanes, #(\n - 3)
	sli		Y_3.\lanes, TMP3.\lanes, #(\n - 3)

	// x ^= k
	eor		X_0.16b, X_0.16b, ROUND_KEY.16b
	eor		X_1.16b, X_1.16b, ROUND_KEY.16b
	eor		X_2.16b, X_2.16b, ROUND_KEY.16b
	eor		X_3.16b, X_3.16b, ROUND_KEY.16b

	// x -= y
	sub		X_0.\lanes, X_0.\lanes, Y_0.\lanes
	sub		X_1.\lanes, X_1.\lanes, Y_1.\lanes
	sub		X_2.\lanes, X_2.\lanes, Y_2.\lanes
	sub		X_3.\lanes, X_3.\lanes, Y_3.\lanes

	// x = rol(x, 8)
	tbl		X_0.16b, {X_0.16b}, ROTATE_TABLE.16b
	tbl		X_1.16b, {X_1.16b}, ROTATE_TABLE.16b
	tbl		X_2.16b, {X_2.16b}, ROTATE_TABLE.16b
	tbl		X_3.16b, {X_3.16b}, ROTATE_TABLE.16b
.endm

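/*
 * _next_xts_tweak() - calculate the next XTS tweak value(s)
 *
 * For Speck128 (\n == 64) this multiplies the 128-bit tweak in \cur by x;
 * for Speck64 (\n == 32) it multiplies the two 64-bit tweaks packed in
 * \cur by x^2, advancing both by two block positions at once.  A C sketch
 * of the \n == 64 case, assuming the tweak is kept as two little-endian
 * u64 halves {lo, hi}:
 *
 *	carry = hi >> 63;
 *	hi = (hi << 1) | (lo >> 63);
 *	lo = (lo << 1) ^ (carry ? 0x87 : 0);
 */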
.macro _next_xts_tweak	next, cur, tmp, n
.if \n == 64
	/*
	 * Calculate the next tweak by multiplying the current one by x,
	 * modulo p(x) = x^128 + x^7 + x^2 + x + 1.
	 */
	sshr		\tmp\().2d, \cur\().2d, #63
	and		\tmp\().16b, \tmp\().16b, GFMUL_TABLE.16b
	shl		\next\().2d, \cur\().2d, #1
	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor		\next\().16b, \next\().16b, \tmp\().16b
.else
	/*
	 * Calculate the next two tweaks by multiplying the current ones by x^2,
	 * modulo p(x) = x^64 + x^4 + x^3 + x + 1.
	 */
	ushr		\tmp\().2d, \cur\().2d, #62
	shl		\next\().2d, \cur\().2d, #2
	tbl		\tmp\().16b, {GFMUL_TABLE.16b}, \tmp\().16b
	eor		\next\().16b, \next\().16b, \tmp\().16b
.endif
.endm

/*
 * _speck_xts_crypt() - Speck-XTS encryption/decryption
 *
 * Encrypt or decrypt NBYTES bytes of data from the SRC buffer to the DST buffer
 * using Speck-XTS, specifically the variant with a block size of '2n' and round
 * count given by NROUNDS.  The expanded round keys are given in ROUND_KEYS, and
 * the current XTS tweak value is given in TWEAK.  It's assumed that NBYTES is a
 * nonzero multiple of 128.
 */
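/*
 * In XTS terms (noted here for reference), each block is processed as
 * C = E_K(P ^ T) ^ T (or with the inverse cipher when decrypting), where
 * consecutive blocks use the tweaks T, T*x, T*x^2, ... in the relevant
 * Galois field.
 */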
.macro _speck_xts_crypt	n, lanes, decrypting

	/*
	 * If decrypting, modify the ROUND_KEYS parameter to point to the last
	 * round key rather than the first, since for decryption the round keys
	 * are used in reverse order.  The net effect is to leave ROUND_KEYS
	 * pointing at &round_keys[nrounds - 1].
	 */
.if \decrypting
	mov		NROUNDS, NROUNDS	/* a 32-bit mov zero-extends, clearing the high 32 bits */
.if \n == 64
	add		ROUND_KEYS, ROUND_KEYS, NROUNDS_X, lsl #3
	sub		ROUND_KEYS, ROUND_KEYS, #8
.else
	add		ROUND_KEYS, ROUND_KEYS, NROUNDS_X, lsl #2
	sub		ROUND_KEYS, ROUND_KEYS, #4
.endif
.endif

	// Load the index vector for the tbl-based 8-bit rotates: the rol
	// tables when decrypting (the unround rotates left), ror otherwise
.if \decrypting
	ldr		ROTATE_TABLE_Q, .Lrol\n\()_8_table
.else
	ldr		ROTATE_TABLE_Q, .Lror\n\()_8_table
.endif

	// One-time XTS preparation
.if \n == 64
	// Load first tweak
	ld1		{TWEAKV0.16b}, [TWEAK]

	// Load GF(2^128) multiplication table
	ldr		GFMUL_TABLE_Q, .Lgf128mul_table
.else
	// Load first tweak
	ld1		{TWEAKV0.8b}, [TWEAK]

	// Load GF(2^64) multiplication table
	ldr		GFMUL_TABLE_Q, .Lgf64mul_table

	// Calculate second tweak, packing it together with the first (each
	// 128-bit tweak register holds two 8-byte Speck64 tweaks)
	ushr		TMP0.2d, TWEAKV0.2d, #63
	shl		TMP1.2d, TWEAKV0.2d, #1
	tbl		TMP0.8b, {GFMUL_TABLE.16b}, TMP0.8b
	eor		TMP0.8b, TMP0.8b, TMP1.8b
	mov		TWEAKV0.d[1], TMP0.d[0]
.endif

.Lnext_128bytes_\@:

	// Calculate XTS tweaks for next 128 bytes
	_next_xts_tweak	TWEAKV1, TWEAKV0, TMP0, \n
	_next_xts_tweak	TWEAKV2, TWEAKV1, TMP0, \n
	_next_xts_tweak	TWEAKV3, TWEAKV2, TMP0, \n
	_next_xts_tweak	TWEAKV4, TWEAKV3, TMP0, \n
	_next_xts_tweak	TWEAKV5, TWEAKV4, TMP0, \n
	_next_xts_tweak	TWEAKV6, TWEAKV5, TMP0, \n
	_next_xts_tweak	TWEAKV7, TWEAKV6, TMP0, \n
	_next_xts_tweak	TWEAKV_NEXT, TWEAKV7, TMP0, \n

	// Load the next source blocks into {X,Y}[0-3]
	ld1		{X_0.16b-Y_1.16b}, [SRC], #64
	ld1		{X_2.16b-Y_3.16b}, [SRC], #64

	// XOR the source blocks with their XTS tweaks
	eor		TMP0.16b, X_0.16b, TWEAKV0.16b
	eor		Y_0.16b,  Y_0.16b, TWEAKV1.16b
	eor		TMP1.16b, X_1.16b, TWEAKV2.16b
	eor		Y_1.16b,  Y_1.16b, TWEAKV3.16b
	eor		TMP2.16b, X_2.16b, TWEAKV4.16b
	eor		Y_2.16b,  Y_2.16b, TWEAKV5.16b
	eor		TMP3.16b, X_3.16b, TWEAKV6.16b
	eor		Y_3.16b,  Y_3.16b, TWEAKV7.16b

	/*
	 * De-interleave the 'x' and 'y' elements of each block, i.e. make it so
	 * that the X[0-3] registers contain only the second halves of blocks,
	 * and the Y[0-3] registers contain only the first halves of blocks.
	 * (Speck uses the order (y, x) rather than the more intuitive (x, y).)
	 */
	uzp2		X_0.\lanes, TMP0.\lanes, Y_0.\lanes
	uzp1		Y_0.\lanes, TMP0.\lanes, Y_0.\lanes
	uzp2		X_1.\lanes, TMP1.\lanes, Y_1.\lanes
	uzp1		Y_1.\lanes, TMP1.\lanes, Y_1.\lanes
	uzp2		X_2.\lanes, TMP2.\lanes, Y_2.\lanes
	uzp1		Y_2.\lanes, TMP2.\lanes, Y_2.\lanes
	uzp2		X_3.\lanes, TMP3.\lanes, Y_3.\lanes
	uzp1		Y_3.\lanes, TMP3.\lanes, Y_3.\lanes

	// Do the cipher rounds
	mov		x6, ROUND_KEYS
	mov		w7, NROUNDS
.Lnext_round_\@:
.if \decrypting
	ld1r		{ROUND_KEY.\lanes}, [x6]
	sub		x6, x6, #( \n / 8 )
	_speck_unround_128bytes	\n, \lanes
.else
	ld1r		{ROUND_KEY.\lanes}, [x6], #( \n / 8 )
	_speck_round_128bytes	\n, \lanes
.endif
	subs		w7, w7, #1
	bne		.Lnext_round_\@

	// Re-interleave the 'x' and 'y' elements of each block
	zip1		TMP0.\lanes, Y_0.\lanes, X_0.\lanes
	zip2		Y_0.\lanes,  Y_0.\lanes, X_0.\lanes
	zip1		TMP1.\lanes, Y_1.\lanes, X_1.\lanes
	zip2		Y_1.\lanes,  Y_1.\lanes, X_1.\lanes
	zip1		TMP2.\lanes, Y_2.\lanes, X_2.\lanes
	zip2		Y_2.\lanes,  Y_2.\lanes, X_2.\lanes
	zip1		TMP3.\lanes, Y_3.\lanes, X_3.\lanes
	zip2		Y_3.\lanes,  Y_3.\lanes, X_3.\lanes

	// XOR the encrypted/decrypted blocks with the tweaks calculated earlier
	eor		X_0.16b, TMP0.16b, TWEAKV0.16b
	eor		Y_0.16b, Y_0.16b,  TWEAKV1.16b
	eor		X_1.16b, TMP1.16b, TWEAKV2.16b
	eor		Y_1.16b, Y_1.16b,  TWEAKV3.16b
	eor		X_2.16b, TMP2.16b, TWEAKV4.16b
	eor		Y_2.16b, Y_2.16b,  TWEAKV5.16b
	eor		X_3.16b, TMP3.16b, TWEAKV6.16b
	eor		Y_3.16b, Y_3.16b,  TWEAKV7.16b
	mov		TWEAKV0.16b, TWEAKV_NEXT.16b

	// Store the resulting ciphertext (or plaintext) in the destination buffer
	st1		{X_0.16b-Y_1.16b}, [DST], #64
	st1		{X_2.16b-Y_3.16b}, [DST], #64

	// Continue if there are more 128-byte chunks remaining
	subs		NBYTES, NBYTES, #128
	bne		.Lnext_128bytes_\@

	// Store the next tweak and return
.if \n == 64
	st1		{TWEAKV_NEXT.16b}, [TWEAK]
.else
	st1		{TWEAKV_NEXT.8b}, [TWEAK]
.endif
	ret
.endm

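/*
 * Entry points.  Going by the argument registers documented at the top of
 * this file, the C prototypes (as the glue code would declare them; shown
 * here for reference) are:
 *
 *	void speck128_xts_{en,de}crypt_neon(const u64 *round_keys,
 *					    int nrounds, void *dst,
 *					    const void *src,
 *					    unsigned int nbytes,
 *					    void *tweak);
 *
 * and likewise for speck64_xts_{en,de}crypt_neon(), which takes
 * 'const u32 *round_keys'.
 */
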
ENTRY(speck128_xts_encrypt_neon)
	_speck_xts_crypt	n=64, lanes=2d, decrypting=0
ENDPROC(speck128_xts_encrypt_neon)

ENTRY(speck128_xts_decrypt_neon)
	_speck_xts_crypt	n=64, lanes=2d, decrypting=1
ENDPROC(speck128_xts_decrypt_neon)

ENTRY(speck64_xts_encrypt_neon)
	_speck_xts_crypt	n=32, lanes=4s, decrypting=0
ENDPROC(speck64_xts_encrypt_neon)

ENTRY(speck64_xts_decrypt_neon)
	_speck_xts_crypt	n=32, lanes=4s, decrypting=1
ENDPROC(speck64_xts_decrypt_neon)