Lines Matching +full:xor +full:- +full:v2
1 /* SPDX-License-Identifier: GPL-2.0-only */
3 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
5 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
8 /* included by aes-ce.S and aes-neon.S */
26 encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
31 decrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
37 encrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
42 decrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
62 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
66 st1 {v0.16b-v3.16b}, [x0], #64
92 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
96 st1 {v0.16b-v3.16b}, [x0], #64
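Lines 62-96 are the ECB bulk paths: four blocks are loaded, processed in parallel by the block4x/block5x macros, and stored, with no chaining value involved. A minimal C sketch of that stride structure, where aes_encrypt_one() is a hypothetical single-block primitive standing in for the encrypt_block macros:

    #include <stddef.h>
    #include <stdint.h>

    void ecb_encrypt_blocks(uint8_t *out, const uint8_t *in, size_t nblocks,
                            void (*aes_encrypt_one)(uint8_t dst[16],
                                                    const uint8_t src[16]))
    {
        while (nblocks >= 4) {          /* 4 pt blocks per iteration */
            for (int i = 0; i < 4; i++)
                aes_encrypt_one(out + 16 * i, in + 16 * i);
            in += 64;
            out += 64;
            nblocks -= 4;
        }
        while (nblocks--) {             /* leftover blocks, one at a time */
            aes_encrypt_one(out, in);
            in += 16;
            out += 16;
        }
    }

The real code runs the four (or five) encryptions interleaved to hide instruction latency; the C loop only models the data flow.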
130 mov w8, #14 /* AES-256: 14 rounds */
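The immediate at line 130 is the standard AES round count: Nr = Nk + 6, where Nk is the key size in 32-bit words, giving 10, 12, and 14 rounds for AES-128/192/256. As a one-line sketch of that relation:

    /* AES round count from key size in bytes: 16 -> 10, 24 -> 12, 32 -> 14 */
    static int aes_nrounds(int key_bytes)
    {
        return key_bytes / 4 + 6;
    }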
143 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
144 eor v0.16b, v0.16b, v4.16b /* ..and xor with iv */
148 eor v2.16b, v2.16b, v1.16b
149 encrypt_block v2, w3, x2, x6, w7
150 eor v3.16b, v3.16b, v2.16b
152 st1 {v0.16b-v3.16b}, [x0], #64
160 eor v4.16b, v4.16b, v0.16b /* ..and xor with iv */
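Lines 143-160 are CBC encryption: each plaintext block is XORed with the previous ciphertext block (the IV for the first block) before being encrypted, which is why even the unrolled 4-block path at 148-150 must chain serially through v0..v3. A hedged C sketch of that chaining, again using the hypothetical aes_encrypt_one():

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    void cbc_encrypt(uint8_t *out, const uint8_t *in, size_t nblocks,
                     uint8_t iv[16],
                     void (*aes_encrypt_one)(uint8_t dst[16],
                                             const uint8_t src[16]))
    {
        uint8_t x[16];

        while (nblocks--) {
            for (int i = 0; i < 16; i++)
                x[i] = in[i] ^ iv[i];   /* ..and xor with iv */
            aes_encrypt_one(out, x);
            memcpy(iv, out, 16);        /* ciphertext becomes the next iv */
            in += 16;
            out += 16;
        }
    }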
174 mov w8, #14 /* AES-256: 14 rounds */
188 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
193 mov v7.16b, v2.16b
200 eor v2.16b, v2.16b, v6.16b
206 mov v6.16b, v2.16b
212 eor v2.16b, v2.16b, v5.16b
215 st1 {v0.16b-v3.16b}, [x0], #64
225 eor v0.16b, v0.16b, cbciv.16b /* xor with iv => pt */
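Lines 174-225 are the CBC decryption side. Unlike encryption it parallelizes: all four blocks can be decrypted at once, and only the final XOR needs the preceding ciphertext, which is what the mov v7/v6 copies at 193 and 206 preserve before the registers are overwritten. A hedged C model of the data flow:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    void cbc_decrypt(uint8_t *out, const uint8_t *in, size_t nblocks,
                     uint8_t iv[16],
                     void (*aes_decrypt_one)(uint8_t dst[16],
                                             const uint8_t src[16]))
    {
        uint8_t prev[16], saved[16];

        memcpy(prev, iv, 16);
        while (nblocks--) {
            memcpy(saved, in, 16);      /* keep ct for the next block */
            aes_decrypt_one(out, in);
            for (int i = 0; i < 16; i++)
                out[i] ^= prev[i];      /* xor with iv => pt */
            memcpy(prev, saved, 16);
            in += 16;
            out += 16;
        }
        memcpy(iv, prev, 16);           /* return the running iv */
    }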
260 eor v0.16b, v0.16b, v5.16b /* xor with iv */
290 tbl v2.16b, {v0.16b}, v3.16b
291 eor v2.16b, v2.16b, v1.16b
295 eor v0.16b, v0.16b, v5.16b /* xor with iv */
298 st1 {v2.16b}, [x4] /* overlapping stores */
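Lines 260-298 handle CBC ciphertext stealing: the tbl at 290 permutes the last blocks through a byte mask, and the final partial block is written with two overlapping 16-byte stores rather than a scalar copy loop. A hedged C sketch of the arithmetic for the last two blocks (CS-3 ordering, the variant the kernel's cts template implements; len is the 1-16 byte length of the final block, aes_encrypt_one() is hypothetical):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    void cbc_cts_encrypt_tail(uint8_t *out, const uint8_t *in, size_t len,
                              const uint8_t iv[16],
                              void (*aes_encrypt_one)(uint8_t dst[16],
                                                      const uint8_t src[16]))
    {
        uint8_t x[16], last[16] = { 0 };

        for (int i = 0; i < 16; i++)
            x[i] = in[i] ^ iv[i];       /* penultimate block, xor with iv */
        aes_encrypt_one(x, x);

        memcpy(last, in + 16, len);     /* zero-pad the short final block */
        for (int i = 0; i < 16; i++)
            last[i] ^= x[i];            /* the zero tail "steals" x's tail */
        aes_encrypt_one(last, last);

        memcpy(out, last, 16);          /* full block first.. */
        memcpy(out + 16, x, len);       /* ..then the truncated one */
    }

The vectorized version gets the same effect with a permute plus overlapping stores: any provisionally wrong bytes are simply overwritten by the later, correct store.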
340 * easily increment the IV. For XCTR mode this lets us efficiently XOR
341 * the 64-bit counter with the IV.
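XCTR (used by HCTR2) derives keystream block i as E(IV XOR le64(i)) with a 1-based counter, so the 64-bit combine really is an XOR rather than an add. A hedged sketch of one keystream block, assuming a little-endian host and the hypothetical aes_encrypt_one():

    #include <stdint.h>
    #include <string.h>

    static void xctr_keystream_block(uint8_t ks[16], const uint8_t iv[16],
                                     uint64_t ctr, /* 1-based block index */
                                     void (*aes_encrypt_one)(uint8_t dst[16],
                                                             const uint8_t src[16]))
    {
        uint8_t blk[16];
        uint64_t lo;

        memcpy(blk, iv, 16);
        memcpy(&lo, blk, 8);    /* low 64 bits (little-endian host assumed) */
        lo ^= ctr;              /* XOR, not add: no carry chain to manage */
        memcpy(blk, &lo, 8);
        aes_encrypt_one(ks, blk);
    }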
360 * Set up the counter values in v0-v{MAX_STRIDE-1}.
364 * v{MAX_STRIDE-1}. For example: if encrypting two blocks with MAX_STRIDE=5, then v3 and v4 should have the next two counter blocks.
374 mov v2.16b, vctr.16b
378 sub x6, CTR, #MAX_STRIDE - 1
379 sub x7, CTR, #MAX_STRIDE - 2
380 sub x8, CTR, #MAX_STRIDE - 3
381 sub x9, CTR, #MAX_STRIDE - 4
382 ST5( sub x10, CTR, #MAX_STRIDE - 5 )
390 mov v2.d[0], x8
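Lines 360-390 materialize a stride's worth of counter values. The counter register already holds the value for the last block of the stride, so block i is recovered as CTR - (MAX_STRIDE - 1 - i), and the results are placed in v0..v{MAX_STRIDE-1} so the tail code can always find the final keystream block in v{MAX_STRIDE-1}. A hedged scalar model:

    #include <stdint.h>

    #define MAX_STRIDE 5    /* assumed; 4 on cores without the block5x path */

    static void setup_counters(uint64_t blk_ctr[MAX_STRIDE], uint64_t ctr)
    {
        /* ctr holds the last block's counter value, matching the sub
         * chain at lines 378-382 */
        for (int i = 0; i < MAX_STRIDE; i++)
            blk_ctr[i] = ctr - (MAX_STRIDE - 1 - i);
    }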
428 mov v2.d[0], vctr.d[0]
438 sub x7, IV_PART, #MAX_STRIDE - 1
439 sub x8, IV_PART, #MAX_STRIDE - 2
440 sub x9, IV_PART, #MAX_STRIDE - 3
445 ST5( sub x10, IV_PART, #MAX_STRIDE - 4 )
446 mov v2.d[1], x8
453 * If there are at least MAX_STRIDE blocks left, XOR the data with
457 ld1 {v5.16b-v7.16b}, [IN], #48
463 ST5( ld1 {v5.16b-v6.16b}, [IN], #32 )
464 eor v2.16b, v7.16b, v2.16b
467 st1 {v0.16b-v3.16b}, [OUT], #64
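Lines 453-467 are the full-stride fast path: with keystream blocks sitting in v0..v{MAX_STRIDE-1}, the input is loaded 64 bytes at a time, XORed in, and stored. The equivalent data flow in C:

    #include <stdint.h>

    static void xor_keystream(uint8_t *out, const uint8_t *in,
                              const uint8_t ks[][16], int nblocks)
    {
        for (int b = 0; b < nblocks; b++)
            for (int i = 0; i < 16; i++)
                out[16 * b + i] = in[16 * b + i] ^ ks[b][i];
    }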
481 * Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext
483 * This code expects the last keystream block to be in v{MAX_STRIDE-1}.
496 ST5( cmp BYTES_W, #64 - (MAX_STRIDE << 4))
498 cmp BYTES_W, #48 - (MAX_STRIDE << 4)
500 cmp BYTES_W, #32 - (MAX_STRIDE << 4)
502 cmp BYTES_W, #16 - (MAX_STRIDE << 4)
522 ST4( eor v8.16b, v8.16b, v2.16b )
528 ST5( eor v7.16b, v7.16b, v2.16b )
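Lines 481-528 handle the tail. The comparisons are against negative immediates because BYTES_W has already had MAX_STRIDE * 16 subtracted by the stride loop; e.g. BYTES_W > 16 - (MAX_STRIDE << 4) means more than 16 real bytes remain. A hedged reading of that dispatch:

    #define MAX_STRIDE 5    /* assumed, as above */

    /* bytes_w = remaining - MAX_STRIDE * 16 (negative in the tail) */
    static int tail_whole_blocks(int bytes_w)
    {
        int remaining = bytes_w + (MAX_STRIDE << 4);

        return remaining >> 4;  /* whole 16-byte blocks still to XOR */
    }

Any final partial block is then finished with the overlapping-store trick described in the comments: provisional bytes are written and immediately overwritten by the last, correct store.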
563 ld1 {v10.16b-v11.16b}, [x9]
594 * to be at the end of this 16-byte temporary buffer rather than the
609 * to be at the end of this 16-byte temporary buffer rather than the
659 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
664 eor v2.16b, v2.16b, v6.16b
671 eor v2.16b, v2.16b, v6.16b
672 st1 {v0.16b-v3.16b}, [x0], #64
715 ld1 {v2.16b}, [x8]
718 tbl v2.16b, {v0.16b}, v2.16b
720 st1 {v2.16b}, [x4] /* overlapping stores */
751 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
756 eor v2.16b, v2.16b, v6.16b
763 eor v2.16b, v2.16b, v6.16b
764 st1 {v0.16b-v3.16b}, [x0], #64
803 ld1 {v2.16b}, [x8]
810 tbl v2.16b, {v0.16b}, v2.16b
813 st1 {v2.16b}, [x4] /* overlapping stores */
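Lines 659-764 follow the XTS pattern: XOR with the tweak, ECB-encrypt (or decrypt), XOR with the tweak again, with the tweak multiplied by x in GF(2^128) between blocks; 715-720 and 803-813 are the XTS ciphertext-stealing tails, again via tbl plus overlapping stores. A hedged C sketch of one block and the tweak update (aes_encrypt_one() hypothetical):

    #include <stdint.h>

    static void gf128mul_x(uint8_t t[16])   /* XTS little-endian convention */
    {
        int carry = t[15] >> 7;

        for (int i = 15; i > 0; i--)
            t[i] = (uint8_t)((t[i] << 1) | (t[i - 1] >> 7));
        t[0] = (uint8_t)(t[0] << 1) ^ (carry ? 0x87 : 0);
    }

    static void xts_encrypt_block(uint8_t *out, const uint8_t *in,
                                  uint8_t tweak[16],
                                  void (*aes_encrypt_one)(uint8_t dst[16],
                                                          const uint8_t src[16]))
    {
        uint8_t x[16];

        for (int i = 0; i < 16; i++)
            x[i] = in[i] ^ tweak[i];    /* xor with tweak */
        aes_encrypt_one(x, x);
        for (int i = 0; i < 16; i++)
            out[i] = x[i] ^ tweak[i];   /* ..and xor with tweak again */
        gf128mul_x(tweak);              /* next block's tweak */
    }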
832 ld1 {v1.16b-v4.16b}, [x0], #64 /* get next pt block */
833 eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */
835 eor v0.16b, v0.16b, v2.16b
852 eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */
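Lines 832-852 are the MAC path (aes_mac_update): the running digest is XORed with each message block and re-encrypted, i.e. ordinary CBC-MAC chaining with the digest as the chaining value, unrolled four blocks at a time in the assembly. A hedged C model:

    #include <stddef.h>
    #include <stdint.h>

    static void mac_update(uint8_t dg[16], const uint8_t *in, size_t nblocks,
                           void (*aes_encrypt_one)(uint8_t dst[16],
                                                   const uint8_t src[16]))
    {
        while (nblocks--) {
            for (int i = 0; i < 16; i++)
                dg[i] ^= in[i];     /* ..and xor with dg */
            aes_encrypt_one(dg, dg);
            in += 16;
        }
    }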