arm/crypto/blake2b-neon-core.S

1 /* SPDX-License-Identifier: GPL-2.0-or-later */
29 	// M_0-M_3 are occasionally used for other purposes too.
50 	// rotation amounts with NEON.  (On Cortex-A53 it's the same speed as
51 	// vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.)
53 	.byte		3, 4, 5, 6, 7, 0, 1, 2
55 	.byte		2, 3, 4, 5, 6, 7, 0, 1
58 	.quad		0x6a09e667f3bcc908, 0xbb67ae8584caa73b
59 	.quad		0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
60 	.quad		0x510e527fade682d1, 0x9b05688c2b3e6c1f
61 	.quad		0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
63 // Execute one round of BLAKE2b by updating the state matrix v[0..15] in the
64 // NEON registers q0-q7.  The message block is in q8..q15 (M_0-M_15).  The stack
65 // pointer points to a 32-byte aligned buffer containing a copy of q8 and q9
66 // (M_0-M_3), so that they can be reloaded if they are used as temporary
67 // registers.  The macro arguments s0-s15 give the order in which the message
70 			s8, s9, s10, s11, s12, s13, s14, s15, final=0
73 	// (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]),
76 	// a += b + m[blake2b_sigma[r][2*i + 0]];
84 	// d = ror64(d ^ a, 32);
90 	// c += d;
103 	// a += b + m[blake2b_sigma[r][2*i + 1]];
108 .if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0
118 	// d = ror64(d ^ a, 16);
127 	// c += d;
133 	// This rotation amount isn't a multiple of 8, so it has to be
134 	// implemented using a pair of shifts, which requires temporary
135 	// registers.  Use q8-q9 (M_0-M_3) for this, and reload them afterwards.
142 	vld1.8		{q8-q9}, [sp, :256]
145 	// (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
146 	// (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]).
150 	// and undo it afterwards; or just use 64-bit operations on 'd'
151 	// registers instead of 128-bit operations on 'q' registers.  We use the
152 	// latter approach, as it performs much better on Cortex-A7.
154 	// a += b + m[blake2b_sigma[r][2*i + 0]];
164 	// d = ror64(d ^ a, 32);
174 	// c += d;
191 	// a += b + m[blake2b_sigma[r][2*i + 1]];
192 .if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0
204 	// d = ror64(d ^ a, 16);
215 	// c += d;
230 	// Reloading q8-q9 can be skipped on the final round.
232 	vld1.8		{q8-q9}, [sp, :256]
247 	push		{r4-r10}
249 	// Allocate a 32-byte stack buffer that is 32-byte aligned.
259 	vld1.64		{q0-q1}, [ip]!		// Load h[0..3]
260 	vld1.64		{q2-q3}, [ip]!		// Load h[4..7]
263 	vld1.64		{q14-q15}, [ip]		// Load t[0..1] and f[0..1]
264 	vld1.64		{q4-q5}, [r10]!		// Load IV[0..3]
265 	  vmov		r7, r8, d28		// Copy t[0] to (r7, r8)
266 	vld1.64		{q6-q7}, [r10]		// Load IV[4..7]
269 	vmov.i32	d28[0], r7
270 	vst1.64		{d28}, [ip]		// Update t[0]
275 	// entire state matrix in q0-q7 and the entire message block in q8-q15.
280 	// Therefore we store a copy of the first 32 bytes of the message block
281 	// (q8-q9) in an aligned buffer on the stack so that they can be
284 	vld1.8		{q8-q9}, [BLOCK]!
285 	  veor		q6, q6, q14	// v[12..13] = IV[4..5] ^ t[0..1]
286 	vld1.8		{q10-q11}, [BLOCK]!
287 	  veor		q7, q7, q15	// v[14..15] = IV[6..7] ^ f[0..1]
288 	vld1.8		{q12-q13}, [BLOCK]!
289 	vst1.8		{q8-q9}, [sp, :256]
291 	vld1.8		{q14-q15}, [BLOCK]!
295 	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
296 	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
297 	_blake2b_round	11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
298 	_blake2b_round	7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
299 	_blake2b_round	9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
300 	_blake2b_round	2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
301 	_blake2b_round	12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
302 	_blake2b_round	13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
303 	_blake2b_round	6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
304 	_blake2b_round	10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
305 	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
306 	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \
311 	//	for (i = 0; i < 8; i++)
314 	  vld1.64	{q8-q9}, [ip]!		// Load old h[0..3]
315 	veor		q0, q0, q4		// v[0..1] ^= v[8..9]
317 	  vld1.64	{q10-q11}, [ip]		// Load old h[4..7]
320 	veor		q0, q0, q8		// v[0..1] ^= h[0..1]
323 	  subs		NBLOCKS, NBLOCKS, #1	// nblocks--
324 	  vst1.64	{q0-q1}, [ip]!		// Store new h[0..3]
327 	  vst1.64	{q2-q3}, [ip]!		// Store new h[4..7]
330 	bne		.Lnext_block		// nblocks != 0?
333 	pop		{r4-r10}
338 	// carrying the overflow bit into the full 128-bit counter.
340 	adcs		r8, r8, #0
341 	adcs		r9, r9, #0
342 	adc		r10, r10, #0
345 	vst1.64		{q14}, [ip]		// Update t[0] and t[1]