/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// VAES and VPCLMULQDQ optimized AES-GCM for x86_64
//
//------------------------------------------------------------------------------
// This file is dual-licensed, meaning that you can use it under your choice of
// the Apache License 2.0 (http://www.apache.org/licenses/LICENSE-2.0) or the
// BSD-2-Clause license.
//
//------------------------------------------------------------------------------
//
// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that
// support VAES and VPCLMULQDQ. Some of the functions, notably the encryption
// and decryption update functions which are the most performance-critical, are
// provided in two variants generated from a macro: one using 256-bit vectors
// (suffix: vaes_avx10_256) and one using 512-bit vectors (vaes_avx10_512). The
// other, "shared" functions (vaes_avx10) use at most 256-bit vectors.
//
// The functions that use 512-bit vectors are intended for CPUs that support
// 512-bit vectors *and* where using them doesn't cause significant
// downclocking.
//
// Note that the macros that support both 256-bit and 512-bit vectors could
// fairly easily be changed to support 128-bit too. However, this would *not*
// be enough to support CPUs that lack AVX512 or AVX10, since the code also
// relies on other features of those extensions: the larger set of 32 vector
// registers, masking support, and instructions such as vpternlogd (which can
// do a three-argument XOR). These features are very useful for AES-GCM.
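
// As a rough aside, the relevant CPU feature bits can be queried from
// userspace via CPUID leaf 7. The C sketch below uses GCC/Clang's <cpuid.h>
// and is purely illustrative; it is not how the kernel selects this code
// (the kernel uses its own cpufeature machinery):
//
//	#include <cpuid.h>
//	#include <stdio.h>
//
//	int main(void)
//	{
//		unsigned int eax, ebx, ecx, edx;
//
//		if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
//			return 1;
//		printf("AVX512F:    %u\n", (ebx >> 16) & 1);
//		printf("AVX512BW:   %u\n", (ebx >> 30) & 1);
//		printf("AVX512VL:   %u\n", (ebx >> 31) & 1);
//		printf("VAES:       %u\n", (ecx >> 9) & 1);
//		printf("VPCLMULQDQ: %u\n", (ecx >> 10) & 1);
//		return 0;
//	}
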
// A shuffle mask that reflects the bytes of 16-byte blocks
//
// Alternatively, it can be interpreted as the naturally-ordered
// The below constants are used for incrementing the counter blocks.
// ctr_pattern points to the four 128-bit values [0, 1, 2, 3].
// inc_2blocks and inc_4blocks point to the single 128-bit values 2 and
// 4, respectively.
// This is immediately followed by three zeroized padding blocks, which are
// used when loading a partial set of key powers. E.g., if VL=64
// and two blocks remain, we load the 4 values [H^2, H^1, 0, 0]. The most
// padding blocks needed is 3, which occurs if [H^1, 0, 0, 0] is loaded.
//
// The code is generated from a macro that is instantiated with
// register aliases V0-V31 that map to the ymm or zmm registers, depending on
// whether VL is 32 or 64.
// GHASH-multiply the 128-bit lanes of \a by the corresponding 128-bit lanes of
// \b and store the result. Each such multiplication consists of a
// carryless multiplication of two 128-bit input polynomials to get a 256-bit
// intermediate product, followed by a reduction of that product to 128 bits.
// The GCM specification defines GHASH in terms of bit-reflected polynomial
// coefficients.
//
// Instead, we operate on the values without bit-reflecting them. This *mostly*
// just works; the bytes of each value are reflected instead (see .Lbswap_mask)
// to give the polynomial terms a consistent order. E.g., considering an N-bit
// value, bits 0
// through N-1 of the byte-reflected value represent the coefficients of x^(N-1)
// through x^0, whereas bits 0 through N-1 of the non-byte-reflected value
// represent x^7...x^0, x^15...x^8, ..., x^(N-1)...x^(N-8) which can't be worked
// with directly.
//
// One wrinkle shows up in the carryless
// multiplication. This is because an M-bit by N-bit carryless multiplication
// really produces a (M+N-1)-bit product, but in practice it's zero-extended to
// M+N bits. Since the bits are mapped
// to polynomial coefficients backwards, this zero-extension actually changes
// the product by a factor of x. To compensate, the multiplication
// macro must ensure that one of the inputs has an extra factor of x^-1, i.e.
// has been pre-multiplied by x^-1 modulo the reducing polynomial.
//
// Equivalently, the values can be interpreted as polynomials with coefficients
// in the natural order, and the multiplication is actually \a * \b * x^-128 mod
// x^128 + x^7 + x^2 + x + 1.
//
// A single vpclmulqdq instruction doesn't do a full 128-bit by
// 128-bit carryless multiplication, so we break the 128 x 128 multiplication
// into 64 x 64 multiplications ("schoolbook" multiplication) and combine the
// partial products.
//
// The 256-bit product is x^128*HI + x^64*MI + LO. LO, MI, and HI are 128-bit.
// The product is then reduced modulo x^128 + x^7 + x^2 + x + 1 (call it G), 64
// bits at a time. A value being reduced is written as x^64*A + B,
// where A and B are 128-bit. Adding B_L*G to that value (which doesn't change
// it mod G) and folding using the constant x^63
// + x^62 + x^57, we get a 128-bit value C where x^64*C is congruent to the
// value being reduced.
//
// The needed three-argument XORs are done using the vpternlogd instruction with
// immediate 0x96, which is a three-way XOR of its operands.
//
// A potential optimization, assuming that b is fixed per-key (if a is fixed
// per-key it would work the other way around), is to use one iteration of the
// reduction ahead of time on the per-key value, saving work in every
// multiplication. That isn't done here, though,
// because it would require twice as many per-key precomputed values.
//
// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store
// the reduced products in the destination operand.
//
// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and add the
// unreduced products to \lo, \mi, and \hi.
//
// Reduce the unreduced products from \lo, \mi, and \hi and store the 128-bit
// reduced products in the destination operand.
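
// For reference, the multiplication that the macros above implement can also be
// computed with the bit-at-a-time GHASH algorithm from the GCM specification,
// which works directly on the bit-reflected convention. The standalone C sketch
// below (illustrative only, with arbitrary test values; this is *not* how this
// file computes GHASH) is handy as a known-good model when testing:
//
//	#include <assert.h>
//	#include <stdint.h>
//
//	/* One 16-byte block; hi holds the first 8 bytes, lo the last 8. */
//	struct be128 { uint64_t hi, lo; };
//
//	static struct be128 gf128_mul(struct be128 x, struct be128 y)
//	{
//		struct be128 z = { 0, 0 };
//		struct be128 v = y;
//
//		for (int i = 0; i < 128; i++) {
//			/* bit i of x, most significant bit of the block first */
//			uint64_t bit = (i < 64) ? (x.hi >> (63 - i)) & 1
//						: (x.lo >> (127 - i)) & 1;
//			if (bit) {
//				z.hi ^= v.hi;
//				z.lo ^= v.lo;
//			}
//			/* v *= x: a right shift in the reflected convention,
//			 * folding in the reducing polynomial on carry-out */
//			uint64_t carry = v.lo & 1;
//
//			v.lo = (v.lo >> 1) | (v.hi << 63);
//			v.hi >>= 1;
//			if (carry)
//				v.hi ^= 0xe100000000000000ULL;
//		}
//		return z;
//	}
//
//	int main(void)
//	{
//		struct be128 one = { 0x8000000000000000ULL, 0 }; /* poly "1" */
//		struct be128 a = { 0x0123456789abcdefULL, 0xfedcba9876543210ULL };
//		struct be128 b = { 0x0f1e2d3c4b5a6978ULL, 0x1122334455667788ULL };
//		struct be128 ab = gf128_mul(a, b);
//		struct be128 ba = gf128_mul(b, a);
//		struct be128 a1 = gf128_mul(a, one);
//
//		assert(ab.hi == ba.hi && ab.lo == ba.lo);  /* commutative */
//		assert(a1.hi == a.hi && a1.lo == a.lo);    /* identity */
//		return 0;
//	}
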
// Given the expanded AES key |key->aes_key|, this function derives the GHASH
// subkey and initializes |key->ghash_key_powers| with powers of it. The powers
// are stored in
// the order H^NUM_H_POWERS to H^1. The zeroized padding blocks after the key
// powers are initialized as well.
// Additional local variables. V0-V2 and %rax are used as temporaries.

	lea	OFFSETOFEND_H_POWERS-VL(KEY), POWERS_PTR

// Encrypt an all-zeroes block to get the raw hash subkey.
	vmovdqu	(KEY), %xmm0	// Zero-th round key XOR all-zeroes block

// Zeroize the padding blocks.
// Multiply H
// by x^-1 mod x^128 + x^7 + x^2 + x + 1 (if using the backwards
// interpretation of polynomial coefficients; equivalently, multiply by x if
// using the natural interpretation). Multiplying by x
// is a left shift of the 128-bit value by 1 bit, then an XOR with (0xc2
// << 120) | 1 if a 1 bit was carried out. However, there's no 128-bit
// wide shift instruction, so instead double each of the two 64-bit
// halves and incorporate the carry out of the low half into the high half.
//
// The higher key powers need the same extra
// factor of x^-1 (or x using the natural interpretation). Nothing
// special is needed to make that happen: each input to the multiplication
// already carries a factor of x^-1, so the product would
// end up with two factors of x^-1, but the multiplication consumes one.
// So the product H^2 ends up with the desired one factor of x^-1.
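
// In C, the shift-and-conditional-XOR described above looks like the sketch
// below (illustrative only). It performs the plain 128-bit left shift that the
// comment describes, using two 64-bit halves of the value, with `hi` holding
// bits 64-127; the assembly gets the same effect by doubling each half and
// carrying between them:
//
//	#include <stdint.h>
//	#include <stdio.h>
//
//	struct u128 { uint64_t lo, hi; };	/* 128-bit value, two halves */
//
//	/* Left-shift the 128-bit value by 1 bit; if a 1 bit is carried out of
//	 * the top, XOR in (0xc2 << 120) | 1. */
//	static struct u128 mul_by_x(struct u128 h)
//	{
//		uint64_t carry = h.hi >> 63;
//		struct u128 r;
//
//		r.hi = (h.hi << 1) | (h.lo >> 63);
//		r.lo = h.lo << 1;
//		if (carry) {
//			r.hi ^= 0xc200000000000000ULL;	/* top of (0xc2 << 120) | 1 */
//			r.lo ^= 1;			/* bottom of it */
//		}
//		return r;
//	}
//
//	int main(void)
//	{
//		struct u128 h = { 0x0123456789abcdefULL, 0xfedcba9876543210ULL };
//		struct u128 hx = mul_by_x(h);
//
//		printf("%016llx%016llx\n", (unsigned long long)hx.hi,
//		       (unsigned long long)hx.lo);
//		return 0;
//	}
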
	mov	$(NUM_H_POWERS*16/VL) - 1, %eax

	sub	$VL, POWERS_PTR

// XOR together the 128-bit lanes of \src (whose low lane is \src_xmm) and store
// the single-lane result.
// Do one step of the GHASH update of the data blocks given in the vector
// registers GHASHDATA[0-3]. \i specifies the step to do, 0 through 9. The
// macro uses the following registers:
// GHASH_ACC as input/output; GHASHDATA[0-3] as inputs that are clobbered;
// H_POW[4-1], GFPOLY, and BSWAP_MASK as inputs that aren't clobbered; and
// GHASHTMP[0-2] as temporaries. This macro handles the byte-reflection of the
// data blocks. The parameter registers must be preserved across steps.
//
// The GHASH update multiplies the data blocks (with the current GHASH_ACC
// XOR'd into the first block) by the corresponding powers of H and sums the
// results, where the
// operations are vectorized operations on vectors of 16-byte blocks. E.g.,
// with VL=32 there are 2 blocks per vector and the vectorized terms correspond
// to non-vectorized terms using H^8 through H^1 and blk0 through blk7.
// With VL=64, we use 4 blocks/vector, H^16 through H^1, and blk0 through blk15.
//
// The approach is to:
// - Do vectorized "schoolbook" multiplications to compute the intermediate
//   256-bit product of each block and its corresponding hash key power.
// - Sum (XOR) the intermediate 256-bit products across vectors. This leaves
//   VL/16 256-bit intermediate values.
// - Do a vectorized reduction of these 256-bit intermediate values to
//   128-bits each. This leaves VL/16 128-bit intermediate values.
// - Sum (XOR) these values and store the 128-bit result in GHASH_ACC_XMM.
// Do one non-last round of AES encryption on the counter blocks in V0-V3 using
// the round key that has been broadcast to all 128-bit lanes of \round_key.
//
// Start the AES encryption of four vectors of counter blocks.
//
// Increment LE_CTR four times to generate four vectors of little-endian
// counter blocks, swap each to big-endian, and store them in V0-V3.
//
// AES "round zero": XOR in the zero-th round key.
// Both in-place and out-of-place en/decryption are supported.
//
// |le_ctr| must give the current counter in little-endian format. For a new
// message, the low word of the counter should be 2. As the en/decryption
// proceeds, only the low
// 32-bit word of the counter is incremented, following the GCM standard.
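
// As a concrete model of the counter handling (illustrative C only; the names
// below are made up for the sketch): incrementing just the low 32-bit word of
// the big-endian counter block, as the GCM standard specifies, matches adding
// to the low 32-bit word of the little-endian (byte-reflected) block and then
// byte-reflecting, which is what this code does with vectors of counters:
//
//	#include <assert.h>
//	#include <stdint.h>
//	#include <string.h>
//
//	/* inc32 from the GCM spec: add n to the big-endian 32-bit word in
//	 * bytes 12..15 of the counter block, wrapping mod 2^32. */
//	static void inc32_be(uint8_t blk[16], uint32_t n)
//	{
//		uint32_t ctr = ((uint32_t)blk[12] << 24) |
//			       ((uint32_t)blk[13] << 16) |
//			       ((uint32_t)blk[14] << 8) | blk[15];
//
//		ctr += n;
//		blk[12] = ctr >> 24;
//		blk[13] = ctr >> 16;
//		blk[14] = ctr >> 8;
//		blk[15] = ctr;
//	}
//
//	/* Little-endian approach: add n to the low 32-bit word of the
//	 * byte-reflected block, then byte-reflect to get the block that is
//	 * actually encrypted.  (Assumes a little-endian host, as on x86.) */
//	static void inc32_le(const uint8_t le[16], uint32_t n, uint8_t be[16])
//	{
//		uint8_t tmp[16];
//		uint32_t lo;
//
//		memcpy(tmp, le, 16);
//		memcpy(&lo, tmp, 4);
//		lo += n;
//		memcpy(tmp, &lo, 4);
//		for (int i = 0; i < 16; i++)
//			be[i] = tmp[15 - i];
//	}
//
//	int main(void)
//	{
//		uint8_t ctr_be[16] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
//				       0xff, 0xff, 0xff, 0xfe };
//		uint8_t ctr_le[16], want[16], got[16];
//
//		for (int i = 0; i < 16; i++)
//			ctr_le[i] = ctr_be[15 - i];
//		for (uint32_t n = 0; n < 8; n++) {
//			memcpy(want, ctr_be, 16);
//			inc32_be(want, n);
//			inc32_le(ctr_le, n, got);
//			assert(memcmp(want, got, 16) == 0);
//		}
//		return 0;
//	}
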
.set	DATALEN64, %r9		// Zero-extend DATALEN before using!

// In the main loop, V0-V3 are used as AES input and output. Elsewhere
// they are used as temporary registers.
//
// GHASHDATA[0-3] hold the ciphertext blocks and GHASH input data.
//
// BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values
// using vpshufb, copied to all 128-bit lanes.
//
// GHASH_ACC is the accumulator for GHASH. When fully reduced,
// only the lowest 128-bit lane can be nonzero. When not fully reduced,
// more than one lane may be in use.
// LE_CTR_INC is the vector of 32-bit words that need to be added to a
// vector of little-endian counter blocks to advance it forwards.
//
// LE_CTR contains the next set of little-endian counter blocks.
//
// RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-5] contain cached AES round keys,
// copied to all 128-bit lanes. RNDKEY0 is the zero-th round key,
// RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last.
// RNDKEYLAST[0-3] temporarily store the last AES round key XOR'd with
// the corresponding blocks of source data.
//
// GHASHTMP[0-2] are temporary variables used by _ghash_step_4x.
//
// H_POW[4-1] contain the powers of the hash key H^(4*VL/16)...H^1.
//
// GFPOLY contains the .Lgfpoly constant, copied to all 128-bit lanes.
// Make RNDKEYLAST_PTR point to the last AES round key. That is the
// round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256
// respectively. Then load the zero-th and last round keys.
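
// One pointer to the last round key works for every key size because the code
// below indexes backwards from it. A throwaway C check of that arithmetic
// (purely illustrative; the arrays below just model round-key indices):
//
//	#include <assert.h>
//	#include <stdio.h>
//
//	int main(void)
//	{
//		/* Last round key index is 10, 12, or 14; the middle rounds use
//		 * negative offsets from it, starting at -9, -11, or -13. */
//		const int last[] = { 10, 12, 14 };
//		const int first_off[] = { 9, 11, 13 };
//
//		for (int k = 0; k < 3; k++) {
//			int expect = 1;
//
//			for (int off = first_off[k]; off >= 1; off--) {
//				assert(last[k] - off == expect);
//				expect++;
//			}
//			/* All of round keys 1..last-1 covered, in order. */
//			printf("AES-%d: rounds 1..%d\n", 128 + 64 * k,
//			       last[k] - 1);
//		}
//		return 0;
//	}
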
// Initialize LE_CTR_INC to contain VL/16 in all 128-bit lanes.
// Pre-subtracting 4*VL from DATALEN saves an instruction from the main
// loop. As a 32-bit operation, it also writes the full 64-bit register behind
// DATALEN, zero-extending it and allowing DATALEN64 to be used later.
	sub	$4*VL, DATALEN

	vmovdqu8	OFFSETOFEND_H_POWERS-4*VL(KEY), H_POW4
	vmovdqu8	OFFSETOFEND_H_POWERS-3*VL(KEY), H_POW3
	vmovdqu8	OFFSETOFEND_H_POWERS-2*VL(KEY), H_POW2
	vmovdqu8	OFFSETOFEND_H_POWERS-1*VL(KEY), H_POW1
// When possible, interleave the AES encryption of the counter blocks
// with the GHASH update of the ciphertext blocks. This improves
// performance by keeping more of the CPU's execution resources busy.
// The interleaving is straightforward for decryption, since for
// decryption the ciphertext blocks are immediately available. For
// encryption, instead encrypt the first set of blocks, then hash those
// blocks while encrypting the next set of blocks, repeat that as
// needed, and finally hash the last set of blocks.
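
// The encryption-side schedule described above has this shape (a structural C
// sketch only; encrypt_set() and ghash_set() are made-up stand-ins for the real
// interleaved macro sequences):
//
//	#include <stdio.h>
//
//	static void encrypt_set(int i) { printf("encrypt set %d\n", i); }
//	static void ghash_set(int i)   { printf("ghash   set %d\n", i); }
//
//	int main(void)
//	{
//		int nsets = 4;	/* number of 4*VL-byte chunks */
//
//		/* Encryption: GHASH lags one set behind, so each iteration can
//		 * interleave hashing the previous set with encrypting the
//		 * current one. */
//		encrypt_set(0);
//		for (int i = 1; i < nsets; i++) {
//			encrypt_set(i);		/* interleaved ... */
//			ghash_set(i - 1);	/* ... with this */
//		}
//		ghash_set(nsets - 1);
//
//		/* Decryption: the ciphertext (GHASH input) is available up
//		 * front, so each set is hashed in the same iteration that
//		 * produces its keystream. */
//		for (int i = 0; i < nsets; i++) {
//			encrypt_set(i);
//			ghash_set(i);
//		}
//		return 0;
//	}
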
// Encrypt the first 4 vectors of plaintext blocks. Leave the resulting
// ciphertext in GHASHDATA[0-3] for GHASH.

	sub	$4*VL, DATALEN

	vbroadcasti32x4	-\i*16(RNDKEYLAST_PTR), RNDKEY_M\i

// If decrypting, load more ciphertext blocks into GHASHDATA[0-3]. If
// encrypting, GHASHDATA[0-3] already contain the previous ciphertext.

// Start the AES encryption of the counter blocks.

	jl	128f		// AES-128?
	je	192f		// AES-192?
	// AES-256
	vbroadcasti32x4	-13*16(RNDKEYLAST_PTR), RNDKEY
	vbroadcasti32x4	-12*16(RNDKEYLAST_PTR), RNDKEY
	vbroadcasti32x4	-11*16(RNDKEYLAST_PTR), RNDKEY
	vbroadcasti32x4	-10*16(RNDKEYLAST_PTR), RNDKEY
// XOR the source data into the last AES round key to create
// RNDKEYLAST[0-3]. This reduces latency by taking advantage of the
// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
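
// That identity holds because the last AES round is SubBytes and ShiftRows
// followed by a plain XOR with the round key. A quick self-contained check
// using the AES-NI intrinsics (illustrative; build with -maes):
//
//	#include <assert.h>
//	#include <immintrin.h>
//	#include <string.h>
//
//	int main(void)
//	{
//		__m128i a   = _mm_set_epi32(0x00112233, 0x44556677,
//					    0x089aabbc, 0x0cddeeff);
//		__m128i key = _mm_set_epi32(0x0f1e2d3c, 0x4b5a6978,
//					    0x07a6c5e4, 0x03f2d1c0);
//		__m128i b   = _mm_set_epi32(0x13579bdf, 0x02468ace,
//					    0x7db97531, 0x6ca86420);
//		__m128i lhs = _mm_xor_si128(_mm_aesenclast_si128(a, key), b);
//		__m128i rhs = _mm_aesenclast_si128(a, _mm_xor_si128(key, b));
//
//		assert(memcmp(&lhs, &rhs, sizeof(lhs)) == 0);
//		return 0;
//	}
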
// Finish the AES encryption of the counter blocks in V0-V3, interleaved
// with the GHASH update of the ciphertext blocks in GHASHDATA[0-3].

	_ghash_step_4x	(9 - \i)

	vbroadcasti32x4	-\i*16(RNDKEYLAST_PTR), RNDKEY
	_ghash_step_4x	(9 - \i)

	sub	$4*VL, DATALEN

// Update GHASH with the last set of ciphertext blocks.
// Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N
// is the number of blocks that remain.

	and	$~15, %rax	// -round_up(DATALEN, 16)
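
// The -round_up(DATALEN, 16) value comes from negating DATALEN and clearing the
// low 4 bits; adding it to the end of the key powers array selects [H^N, ...,
// H^1]. A small C check of that identity (names are illustrative):
//
//	#include <assert.h>
//	#include <stdint.h>
//
//	int main(void)
//	{
//		for (int64_t len = 1; len <= 4096; len++) {
//			/* What the assembly computes: negate, clear low 4 bits. */
//			int64_t neg_rounded = -len & ~(int64_t)15;
//
//			/* Same as minus the length rounded up to 16 bytes. */
//			assert(neg_rounded == -((len + 15) & ~(int64_t)15));
//			/* Used as powers_end + neg_rounded, i.e. it steps back
//			 * exactly N = ceil(len/16) key powers. */
//			assert(-neg_rounded / 16 == (len + 15) / 16);
//		}
//		return 0;
//	}
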
	mov	$-1, %eax

	mov	$-1, %rax

// Encrypt a vector of counter blocks. This does not need to be masked.
// In the case of DATALEN < VL, the ciphertext is zero-padded to VL.
// (If decrypting, the zero-padding comes from the masked load; if encrypting,
// it's done by the below masked register-to-register move.) Note that
// if DATALEN <= VL - 16, there will be additional padding beyond the
// padding of the last block itself, and the extra padded blocks get included in
// GHASH. However, any such blocks are all-zeroes, and the values that
// they're multiplied with are also all-zeroes. Therefore they just add
// zero to the GHASH result, so they are harmless.
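
// The masking itself amounts to keeping only the low DATALEN bits of an
// all-ones value, one mask bit per byte lane, with the unselected bytes reading
// or writing as zero. A C model of that zero-padding (illustrative only; the
// real code uses an AVX-512 mask register):
//
//	#include <assert.h>
//	#include <stdint.h>
//	#include <string.h>
//
//	/* Mask with the low `len` bits set, for len <= 64 byte lanes. */
//	static uint64_t byte_mask(unsigned int len)
//	{
//		return len >= 64 ? ~0ULL : (1ULL << len) - 1;
//	}
//
//	int main(void)
//	{
//		uint8_t src[64], dst[64];
//		unsigned int datalen = 23;	/* 23 of 64 bytes are valid */
//		uint64_t mask = byte_mask(datalen);
//
//		for (int i = 0; i < 64; i++)
//			src[i] = (uint8_t)(i + 1);
//		memset(dst, 0, sizeof(dst));
//
//		/* Zero-masking load: unselected byte lanes stay zero, so the
//		 * tail of the vector ends up zero-padded. */
//		for (int i = 0; i < 64; i++)
//			if ((mask >> i) & 1)
//				dst[i] = src[i];
//
//		for (int i = 0; i < 64; i++)
//			assert(dst[i] == (i < (int)datalen ? src[i] : 0));
//		return 0;
//	}
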
	sub	$VL, DATALEN
// The encryption function then stores the full-length (16-byte) computed
// authentication tag. The decryption function instead loads the
// expected authentication tag (the one that was transmitted) from the 16-byte
// buffer supplied by the caller and compares it with the computed tag.
//
// %rax, %xmm0-%xmm3, and %k1 are used as temporary registers.
// Set up a counter block with 1 in the low 32-bit word. This is the
// counter block used for the tag.
//
// The GHASH lengths block contains the AAD length and
// the en/decrypted data length, both in big-endian byte order, a byte
// reflection of which is what is actually needed for
// GHASH (see _ghash_mul_step). By using little-endian values in the
// lengths block to begin with, the byte reflection is avoided.

	vmovdqu8	OFFSETOFEND_H_POWERS-16(KEY), H_POW1

	mov	$-1, %eax

// Start the AES encryption of the counter block by swapping the counter
// block to big-endian and XOR-ing it with the zero-th AES round key.
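
// For reference, the counter block for the tag and the lengths block come
// straight from the GCM spec. A C sketch in the spec's big-endian convention
// (illustrative values; the byte reflection discussed above is a separate,
// later step):
//
//	#include <stdint.h>
//	#include <stdio.h>
//	#include <string.h>
//
//	static void put_be64(uint8_t *p, uint64_t v)
//	{
//		for (int i = 0; i < 8; i++)
//			p[i] = (uint8_t)(v >> (56 - 8 * i));
//	}
//
//	int main(void)
//	{
//		uint8_t iv[12] = { 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
//				   0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b };
//		uint64_t aadlen = 20, datalen = 100;	/* byte counts */
//		uint8_t ctr1[16], lengths[16];
//
//		/* Counter block for the tag: IV || 0x00000001 (96-bit IV). */
//		memcpy(ctr1, iv, 12);
//		ctr1[12] = 0; ctr1[13] = 0; ctr1[14] = 0; ctr1[15] = 1;
//
//		/* Lengths block: 64-bit big-endian *bit* counts. */
//		put_be64(lengths, aadlen * 8);
//		put_be64(lengths + 8, datalen * 8);
//
//		for (int i = 0; i < 16; i++)
//			printf("%02x", ctr1[i]);
//		printf("\n");
//		for (int i = 0; i < 16; i++)
//			printf("%02x", lengths[i]);
//		printf("\n");
//		return 0;
//	}
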
	jl	128f		// AES-128?
	je	192f		// AES-192?
	// AES-256
	vaesenc	-13*16(%rax), %xmm0, %xmm0
	vaesenc	-12*16(%rax), %xmm0, %xmm0
	vaesenc	-11*16(%rax), %xmm0, %xmm0
	vaesenc	-10*16(%rax), %xmm0, %xmm0

	vaesenc	(\i-9)*16(%rax), %xmm0, %xmm0
// Compare the computed tag with the transmitted one by XOR-ing them and testing
// whether the result is all-zeroes. This should be constant-time.
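
// A plain C version of that kind of constant-time comparison (a generic sketch
// in the spirit of the kernel's crypto_memneq(), not this file's
// mask-register-based sequence):
//
//	#include <assert.h>
//	#include <stddef.h>
//	#include <stdint.h>
//
//	/* XOR the tags together and OR-accumulate the differences, so the
//	 * running time doesn't depend on where (or whether) they differ.
//	 * Returns 0 iff the tags are equal. */
//	static int tag_compare(const uint8_t *a, const uint8_t *b, size_t len)
//	{
//		uint8_t diff = 0;
//
//		for (size_t i = 0; i < len; i++)
//			diff |= a[i] ^ b[i];
//		return diff;
//	}
//
//	int main(void)
//	{
//		uint8_t t1[16] = { 1, 2, 3, 4, 5, 6, 7, 8,
//				   9, 10, 11, 12, 13, 14, 15, 16 };
//		uint8_t t2[16] = { 1, 2, 3, 4, 5, 6, 7, 8,
//				   9, 10, 11, 12, 13, 14, 15, 16 };
//		uint8_t t3[16] = { 1, 2, 3, 4, 5, 6, 7, 8,
//				   9, 10, 11, 12, 13, 14, 15, 99 };
//
//		assert(tag_compare(t1, t2, 16) == 0);
//		assert(tag_compare(t1, t3, 16) != 0);
//		return 0;
//	}
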
// This function updates GHASH with the associated
// data given by |aad| and |aadlen|. |key->ghash_key_powers| must have been
// initialized beforehand.
//
// AES-GCM is almost always used with small amounts of AAD, less than 32 bytes.
// Therefore, only a single implementation of AAD processing is provided,
// which uses 256-bit vectors (ymm registers) and only has a 1x-wide loop. This
// keeps the code size down, and it enables some micro-optimizations, e.g. using
// VEX-coded instructions instead of EVEX-coded to save some instruction bytes.
// To optimize for large amounts of AAD, we could implement a 4x-wide loop and
// provide a version using 512-bit vectors, but that doesn't seem to be useful.
.set	AADLEN64, %rcx		// Zero-extend AADLEN before using!

// %rax, %ymm0-%ymm3, and %k1 are used as temporary registers.
// Pre-subtracting 32 from AADLEN saves an instruction from the loop and,
// as a 32-bit operation, it also writes the full 64-bit register behind AADLEN,
// zero-extending it and allowing AADLEN64 to be used later.
	sub	$32, AADLEN
	vmovdqu8	OFFSETOFEND_H_POWERS-32(KEY), H_POW1	// [H^2, H^1]

	sub	$32, AADLEN

	mov	$-1, %eax

	and	$~15, AADLEN64	// -round_up(AADLEN, 16)