/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// VAES and VPCLMULQDQ optimized AES-GCM for x86_64
//
//------------------------------------------------------------------------------
// This file is dual-licensed, meaning that you can use it under your choice of
// the Apache License 2.0 (http://www.apache.org/licenses/LICENSE-2.0) or the
// BSD-2-Clause license.
//
//------------------------------------------------------------------------------
//
// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that
// support VAES and VPCLMULQDQ. Some of the functions, notably the encryption
// and decryption update functions which are the most performance-critical, are
// provided in two variants generated from a macro: one using 256-bit vectors
// (suffix: vaes_avx10_256) and one using 512-bit vectors (vaes_avx10_512). The
// other, "shared" functions (vaes_avx10) use at most 256-bit vectors.
//
// The functions that use 512-bit vectors are intended for CPUs that support
// 512-bit vectors *and* where using them doesn't cause significant
// downclocking.
//
// Note that the macros that support both 256-bit and 512-bit vectors could
// fairly easily be changed to support 128-bit too. However, this would *not*
// be enough to support CPUs that lack AVX512 or AVX10, since the code also
// relies on other features of those extensions: the larger set of 32 vector
// registers, masking support, and instructions such as vpternlogd (which can
// do a three-argument XOR). These features are very useful for AES-GCM.
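
// As a rough aside, the relevant CPU feature bits can be queried from
// userspace via CPUID leaf 7. The C sketch below uses GCC/Clang's <cpuid.h>
// and is purely illustrative; it is not how the kernel selects this code
// (the kernel uses its own cpufeature machinery):
//
//	#include <cpuid.h>
//	#include <stdio.h>
//
//	int main(void)
//	{
//		unsigned int eax, ebx, ecx, edx;
//
//		if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
//			return 1;
//		printf("AVX512F:    %u\n", (ebx >> 16) & 1);
//		printf("AVX512BW:   %u\n", (ebx >> 30) & 1);
//		printf("AVX512VL:   %u\n", (ebx >> 31) & 1);
//		printf("VAES:       %u\n", (ecx >> 9) & 1);
//		printf("VPCLMULQDQ: %u\n", (ecx >> 10) & 1);
//		return 0;
//	}
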
// A shuffle mask that reflects the bytes of 16-byte blocks
//
// Alternatively, it can be interpreted as the naturally-ordered
// The below constants are used for incrementing the counter blocks.
// ctr_pattern points to the four 128-bit values [0, 1, 2, 3].
// inc_2blocks and inc_4blocks point to the single 128-bit values 2 and
// 4, respectively.
// This is immediately followed by three zeroized padding blocks, which are
// used when loading a partial set of key powers. E.g., if VL=64
// and two blocks remain, we load the 4 values [H^2, H^1, 0, 0]. The most
// padding blocks needed is 3, which occurs if [H^1, 0, 0, 0] is loaded.
//
// The code is generated from a macro that is instantiated with
// register aliases V0-V31 that map to the ymm or zmm registers, depending on
// whether VL is 32 or 64.
// GHASH-multiply the 128-bit lanes of \a by the corresponding 128-bit lanes of
// \b and store the result. Each such multiplication consists of a
// carryless multiplication of two 128-bit input polynomials to get a 256-bit
// intermediate product, followed by a reduction of that product to 128 bits.
// The GCM specification defines GHASH in terms of bit-reflected polynomial
// coefficients.
//
// Instead, we operate on the values without bit-reflecting them. This *mostly*
// just works; the bytes of each value are reflected instead (see .Lbswap_mask)
// to give the polynomial terms a consistent order. E.g., considering an N-bit
// value, bits 0
// through N-1 of the byte-reflected value represent the coefficients of x^(N-1)
// through x^0, whereas bits 0 through N-1 of the non-byte-reflected value
// represent x^7...x^0, x^15...x^8, ..., x^(N-1)...x^(N-8) which can't be worked
// with directly.
//
// One wrinkle shows up in the carryless
// multiplication. This is because an M-bit by N-bit carryless multiplication
// really produces a (M+N-1)-bit product, but in practice it's zero-extended to
// M+N bits. Since the bits are mapped
// to polynomial coefficients backwards, this zero-extension actually changes
// the product by a factor of x. To compensate, the multiplication
// macro must ensure that one of the inputs has an extra factor of x^-1, i.e.
// has been pre-multiplied by x^-1 modulo the reducing polynomial.
//
// Equivalently, the values can be interpreted as polynomials with coefficients
// in the natural order, and the multiplication is actually \a * \b * x^-128 mod
// x^128 + x^7 + x^2 + x + 1.
//
// A single vpclmulqdq instruction doesn't do a full 128-bit by
// 128-bit carryless multiplication, so we break the 128 x 128 multiplication
// into 64 x 64 multiplications ("schoolbook" multiplication) and combine the
// partial products.
//
// The 256-bit product is x^128*HI + x^64*MI + LO. LO, MI, and HI are 128-bit.
// The product is then reduced modulo x^128 + x^7 + x^2 + x + 1 (call it G), 64
// bits at a time. A value being reduced is written as x^64*A + B,
// where A and B are 128-bit. Adding B_L*G to that value (which doesn't change
// it mod G) and folding using the constant x^63
// + x^62 + x^57, we get a 128-bit value C where x^64*C is congruent to the
// value being reduced.
//
// The needed three-argument XORs are done using the vpternlogd instruction with
// immediate 0x96, which is a three-way XOR of its operands.
//
// A potential optimization, assuming that b is fixed per-key (if a is fixed
// per-key it would work the other way around), is to use one iteration of the
// reduction ahead of time on the per-key value, saving work in every
// multiplication. That isn't done here, though,
// because it would require twice as many per-key precomputed values.
//
// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store
// the reduced products in the destination operand.
//
// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and add the
// unreduced products to \lo, \mi, and \hi.
//
// Reduce the unreduced products from \lo, \mi, and \hi and store the 128-bit
// reduced products in the destination operand.
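
// For reference, the multiplication that the macros above implement can also be
// computed with the bit-at-a-time GHASH algorithm from the GCM specification,
// which works directly on the bit-reflected convention. The standalone C sketch
// below (illustrative only, with arbitrary test values; this is *not* how this
// file computes GHASH) is handy as a known-good model when testing:
//
//	#include <assert.h>
//	#include <stdint.h>
//
//	/* One 16-byte block; hi holds the first 8 bytes, lo the last 8. */
//	struct be128 { uint64_t hi, lo; };
//
//	static struct be128 gf128_mul(struct be128 x, struct be128 y)
//	{
//		struct be128 z = { 0, 0 };
//		struct be128 v = y;
//
//		for (int i = 0; i < 128; i++) {
//			/* bit i of x, most significant bit of the block first */
//			uint64_t bit = (i < 64) ? (x.hi >> (63 - i)) & 1
//						: (x.lo >> (127 - i)) & 1;
//			if (bit) {
//				z.hi ^= v.hi;
//				z.lo ^= v.lo;
//			}
//			/* v *= x: a right shift in the reflected convention,
//			 * folding in the reducing polynomial on carry-out */
//			uint64_t carry = v.lo & 1;
//
//			v.lo = (v.lo >> 1) | (v.hi << 63);
//			v.hi >>= 1;
//			if (carry)
//				v.hi ^= 0xe100000000000000ULL;
//		}
//		return z;
//	}
//
//	int main(void)
//	{
//		struct be128 one = { 0x8000000000000000ULL, 0 }; /* poly "1" */
//		struct be128 a = { 0x0123456789abcdefULL, 0xfedcba9876543210ULL };
//		struct be128 b = { 0x0f1e2d3c4b5a6978ULL, 0x1122334455667788ULL };
//		struct be128 ab = gf128_mul(a, b);
//		struct be128 ba = gf128_mul(b, a);
//		struct be128 a1 = gf128_mul(a, one);
//
//		assert(ab.hi == ba.hi && ab.lo == ba.lo);  /* commutative */
//		assert(a1.hi == a.hi && a1.lo == a.lo);    /* identity */
//		return 0;
//	}
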
// Given the expanded AES key |key->aes_key|, this function derives the GHASH
// subkey and initializes |key->ghash_key_powers| with powers of it. The powers
// are stored in
// the order H^NUM_H_POWERS to H^1. The zeroized padding blocks after the key
// powers are initialized as well.
// Additional local variables. V0-V2 and %rax are used as temporaries.

	lea	OFFSETOFEND_H_POWERS-VL(KEY), POWERS_PTR

// Encrypt an all-zeroes block to get the raw hash subkey.
	vmovdqu	(KEY), %xmm0	// Zero-th round key XOR all-zeroes block

// Zeroize the padding blocks.
// Multiply H
// by x^-1 mod x^128 + x^7 + x^2 + x + 1 (if using the backwards
// interpretation of polynomial coefficients; equivalently, multiply by x if
// using the natural interpretation). Multiplying by x
// is a left shift of the 128-bit value by 1 bit, then an XOR with (0xc2
// << 120) | 1 if a 1 bit was carried out. However, there's no 128-bit
// wide shift instruction, so instead double each of the two 64-bit
// halves and incorporate the carry out of the low half into the high half.
//
// The higher key powers need the same extra
// factor of x^-1 (or x using the natural interpretation). Nothing
// special is needed to make that happen: each input to the multiplication
// already carries a factor of x^-1, so the product would
// end up with two factors of x^-1, but the multiplication consumes one.
// So the product H^2 ends up with the desired one factor of x^-1.
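
// In C, the shift-and-conditional-XOR described above looks like the sketch
// below (illustrative only). It performs the plain 128-bit left shift that the
// comment describes, using two 64-bit halves of the value, with `hi` holding
// bits 64-127; the assembly gets the same effect by doubling each half and
// carrying between them:
//
//	#include <stdint.h>
//	#include <stdio.h>
//
//	struct u128 { uint64_t lo, hi; };	/* 128-bit value, two halves */
//
//	/* Left-shift the 128-bit value by 1 bit; if a 1 bit is carried out of
//	 * the top, XOR in (0xc2 << 120) | 1. */
//	static struct u128 mul_by_x(struct u128 h)
//	{
//		uint64_t carry = h.hi >> 63;
//		struct u128 r;
//
//		r.hi = (h.hi << 1) | (h.lo >> 63);
//		r.lo = h.lo << 1;
//		if (carry) {
//			r.hi ^= 0xc200000000000000ULL;	/* top of (0xc2 << 120) | 1 */
//			r.lo ^= 1;			/* bottom of it */
//		}
//		return r;
//	}
//
//	int main(void)
//	{
//		struct u128 h = { 0x0123456789abcdefULL, 0xfedcba9876543210ULL };
//		struct u128 hx = mul_by_x(h);
//
//		printf("%016llx%016llx\n", (unsigned long long)hx.hi,
//		       (unsigned long long)hx.lo);
//		return 0;
//	}
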
	mov	$(NUM_H_POWERS*16/VL) - 1, %eax

	sub	$VL, POWERS_PTR

// XOR together the 128-bit lanes of \src (whose low lane is \src_xmm) and store
// the single-lane result.
// Do one step of the GHASH update of the data blocks given in the vector
// registers GHASHDATA[0-3]. \i specifies the step to do, 0 through 9. The
// macro uses the following registers:
// GHASH_ACC as input/output; GHASHDATA[0-3] as inputs that are clobbered;
// H_POW[4-1], GFPOLY, and BSWAP_MASK as inputs that aren't clobbered; and
// GHASHTMP[0-2] as temporaries. This macro handles the byte-reflection of the
// data blocks. The parameter registers must be preserved across steps.
//
// The GHASH update multiplies the data blocks (with the current GHASH_ACC
// XOR'd into the first block) by the corresponding powers of H and sums the
// results, where the
// operations are vectorized operations on vectors of 16-byte blocks. E.g.,
// with VL=32 there are 2 blocks per vector and the vectorized terms correspond
// to non-vectorized terms using H^8 through H^1 and blk0 through blk7.
// With VL=64, we use 4 blocks/vector, H^16 through H^1, and blk0 through blk15.
//
// The approach is to:
// - Do vectorized "schoolbook" multiplications to compute the intermediate
//   256-bit product of each block and its corresponding hash key power.
// - Sum (XOR) the intermediate 256-bit products across vectors. This leaves
//   VL/16 256-bit intermediate values.
// - Do a vectorized reduction of these 256-bit intermediate values to
//   128-bits each. This leaves VL/16 128-bit intermediate values.
// - Sum (XOR) these values and store the 128-bit result in GHASH_ACC_XMM.
// Do one non-last round of AES encryption on the counter blocks in V0-V3 using
// the round key that has been broadcast to all 128-bit lanes of \round_key.
//
// Start the AES encryption of four vectors of counter blocks.
//
// Increment LE_CTR four times to generate four vectors of little-endian
// counter blocks, swap each to big-endian, and store them in V0-V3.
//
// AES "round zero": XOR in the zero-th round key.
// Both in-place and out-of-place en/decryption are supported.
//
// |le_ctr| must give the current counter in little-endian format. For a new
// message, the low word of the counter should be 2. As the en/decryption
// proceeds, only the low
// 32-bit word of the counter is incremented, following the GCM standard.
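
// As a concrete model of the counter handling (illustrative C only; the names
// below are made up for the sketch): incrementing just the low 32-bit word of
// the big-endian counter block, as the GCM standard specifies, matches adding
// to the low 32-bit word of the little-endian (byte-reflected) block and then
// byte-reflecting, which is what this code does with vectors of counters:
//
//	#include <assert.h>
//	#include <stdint.h>
//	#include <string.h>
//
//	/* inc32 from the GCM spec: add n to the big-endian 32-bit word in
//	 * bytes 12..15 of the counter block, wrapping mod 2^32. */
//	static void inc32_be(uint8_t blk[16], uint32_t n)
//	{
//		uint32_t ctr = ((uint32_t)blk[12] << 24) |
//			       ((uint32_t)blk[13] << 16) |
//			       ((uint32_t)blk[14] << 8) | blk[15];
//
//		ctr += n;
//		blk[12] = ctr >> 24;
//		blk[13] = ctr >> 16;
//		blk[14] = ctr >> 8;
//		blk[15] = ctr;
//	}
//
//	/* Little-endian approach: add n to the low 32-bit word of the
//	 * byte-reflected block, then byte-reflect to get the block that is
//	 * actually encrypted.  (Assumes a little-endian host, as on x86.) */
//	static void inc32_le(const uint8_t le[16], uint32_t n, uint8_t be[16])
//	{
//		uint8_t tmp[16];
//		uint32_t lo;
//
//		memcpy(tmp, le, 16);
//		memcpy(&lo, tmp, 4);
//		lo += n;
//		memcpy(tmp, &lo, 4);
//		for (int i = 0; i < 16; i++)
//			be[i] = tmp[15 - i];
//	}
//
//	int main(void)
//	{
//		uint8_t ctr_be[16] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
//				       0xff, 0xff, 0xff, 0xfe };
//		uint8_t ctr_le[16], want[16], got[16];
//
//		for (int i = 0; i < 16; i++)
//			ctr_le[i] = ctr_be[15 - i];
//		for (uint32_t n = 0; n < 8; n++) {
//			memcpy(want, ctr_be, 16);
//			inc32_be(want, n);
//			inc32_le(ctr_le, n, got);
//			assert(memcmp(want, got, 16) == 0);
//		}
//		return 0;
//	}
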
.set	DATALEN64, %r9		// Zero-extend DATALEN before using!

// In the main loop, V0-V3 are used as AES input and output. Elsewhere
// they are used as temporary registers.
//
// GHASHDATA[0-3] hold the ciphertext blocks and GHASH input data.
//
// BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values
// using vpshufb, copied to all 128-bit lanes.
//
// GHASH_ACC is the accumulator for GHASH. When fully reduced,
// only the lowest 128-bit lane can be nonzero. When not fully reduced,
// more than one lane may be in use.
// LE_CTR_INC is the vector of 32-bit words that need to be added to a
// vector of little-endian counter blocks to advance it forwards.
//
// LE_CTR contains the next set of little-endian counter blocks.
//
// RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-5] contain cached AES round keys,
// copied to all 128-bit lanes. RNDKEY0 is the zero-th round key,
// RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last.
// RNDKEYLAST[0-3] temporarily store the last AES round key XOR'd with
// the corresponding blocks of source data.
//
// GHASHTMP[0-2] are temporary variables used by _ghash_step_4x.
//
// H_POW[4-1] contain the powers of the hash key H^(4*VL/16)...H^1.
//
// GFPOLY contains the .Lgfpoly constant, copied to all 128-bit lanes.
// Make RNDKEYLAST_PTR point to the last AES round key. That is the
// round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256
// respectively. Then load the zero-th and last round keys.
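
// One pointer to the last round key works for every key size because the code
// below indexes backwards from it. A throwaway C check of that arithmetic
// (purely illustrative; the arrays below just model round-key indices):
//
//	#include <assert.h>
//	#include <stdio.h>
//
//	int main(void)
//	{
//		/* Last round key index is 10, 12, or 14; the middle rounds use
//		 * negative offsets from it, starting at -9, -11, or -13. */
//		const int last[] = { 10, 12, 14 };
//		const int first_off[] = { 9, 11, 13 };
//
//		for (int k = 0; k < 3; k++) {
//			int expect = 1;
//
//			for (int off = first_off[k]; off >= 1; off--) {
//				assert(last[k] - off == expect);
//				expect++;
//			}
//			/* All of round keys 1..last-1 covered, in order. */
//			printf("AES-%d: rounds 1..%d\n", 128 + 64 * k,
//			       last[k] - 1);
//		}
//		return 0;
//	}
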
// Initialize LE_CTR_INC to contain VL/16 in all 128-bit lanes.
// Pre-subtracting 4*VL from DATALEN saves an instruction from the main
// loop. As a 32-bit operation, it also writes the full 64-bit register behind
// DATALEN, zero-extending it and allowing DATALEN64 to be used later.
	sub	$4*VL, DATALEN

	vmovdqu8	OFFSETOFEND_H_POWERS-4*VL(KEY), H_POW4
	vmovdqu8	OFFSETOFEND_H_POWERS-3*VL(KEY), H_POW3
	vmovdqu8	OFFSETOFEND_H_POWERS-2*VL(KEY), H_POW2
	vmovdqu8	OFFSETOFEND_H_POWERS-1*VL(KEY), H_POW1
// When possible, interleave the AES encryption of the counter blocks
// with the GHASH update of the ciphertext blocks. This improves
// performance by keeping more of the CPU's execution resources busy.
// The interleaving is straightforward for decryption, since for
// decryption the ciphertext blocks are immediately available. For
// encryption, instead encrypt the first set of blocks, then hash those
// blocks while encrypting the next set of blocks, repeat that as
// needed, and finally hash the last set of blocks.
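
// The encryption-side schedule described above has this shape (a structural C
// sketch only; encrypt_set() and ghash_set() are made-up stand-ins for the real
// interleaved macro sequences):
//
//	#include <stdio.h>
//
//	static void encrypt_set(int i) { printf("encrypt set %d\n", i); }
//	static void ghash_set(int i)   { printf("ghash   set %d\n", i); }
//
//	int main(void)
//	{
//		int nsets = 4;	/* number of 4*VL-byte chunks */
//
//		/* Encryption: GHASH lags one set behind, so each iteration can
//		 * interleave hashing the previous set with encrypting the
//		 * current one. */
//		encrypt_set(0);
//		for (int i = 1; i < nsets; i++) {
//			encrypt_set(i);		/* interleaved ... */
//			ghash_set(i - 1);	/* ... with this */
//		}
//		ghash_set(nsets - 1);
//
//		/* Decryption: the ciphertext (GHASH input) is available up
//		 * front, so each set is hashed in the same iteration that
//		 * produces its keystream. */
//		for (int i = 0; i < nsets; i++) {
//			encrypt_set(i);
//			ghash_set(i);
//		}
//		return 0;
//	}
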
// Encrypt the first 4 vectors of plaintext blocks. Leave the resulting
// ciphertext in GHASHDATA[0-3] for GHASH.

	sub	$4*VL, DATALEN

	vbroadcasti32x4	-\i*16(RNDKEYLAST_PTR), RNDKEY_M\i

// If decrypting, load more ciphertext blocks into GHASHDATA[0-3]. If
// encrypting, GHASHDATA[0-3] already contain the previous ciphertext.

// Start the AES encryption of the counter blocks.

	jl	128f		// AES-128?
	je	192f		// AES-192?
	// AES-256
	vbroadcasti32x4	-13*16(RNDKEYLAST_PTR), RNDKEY
	vbroadcasti32x4	-12*16(RNDKEYLAST_PTR), RNDKEY
	vbroadcasti32x4	-11*16(RNDKEYLAST_PTR), RNDKEY
	vbroadcasti32x4	-10*16(RNDKEYLAST_PTR), RNDKEY
// XOR the source data into the last AES round key to create
// RNDKEYLAST[0-3]. This reduces latency by taking advantage of the
// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
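
// That identity holds because the last AES round is SubBytes and ShiftRows
// followed by a plain XOR with the round key. A quick self-contained check
// using the AES-NI intrinsics (illustrative; build with -maes):
//
//	#include <assert.h>
//	#include <immintrin.h>
//	#include <string.h>
//
//	int main(void)
//	{
//		__m128i a   = _mm_set_epi32(0x00112233, 0x44556677,
//					    0x089aabbc, 0x0cddeeff);
//		__m128i key = _mm_set_epi32(0x0f1e2d3c, 0x4b5a6978,
//					    0x07a6c5e4, 0x03f2d1c0);
//		__m128i b   = _mm_set_epi32(0x13579bdf, 0x02468ace,
//					    0x7db97531, 0x6ca86420);
//		__m128i lhs = _mm_xor_si128(_mm_aesenclast_si128(a, key), b);
//		__m128i rhs = _mm_aesenclast_si128(a, _mm_xor_si128(key, b));
//
//		assert(memcmp(&lhs, &rhs, sizeof(lhs)) == 0);
//		return 0;
//	}
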
// Finish the AES encryption of the counter blocks in V0-V3, interleaved
// with the GHASH update of the ciphertext blocks in GHASHDATA[0-3].

	_ghash_step_4x	(9 - \i)

	vbroadcasti32x4	-\i*16(RNDKEYLAST_PTR), RNDKEY
	_ghash_step_4x	(9 - \i)

	sub	$4*VL, DATALEN

// Update GHASH with the last set of ciphertext blocks.
// Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N
// is the number of blocks that remain.

	and	$~15, %rax	// -round_up(DATALEN, 16)
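
// The -round_up(DATALEN, 16) value comes from negating DATALEN and clearing the
// low 4 bits; adding it to the end of the key powers array selects [H^N, ...,
// H^1]. A small C check of that identity (names are illustrative):
//
//	#include <assert.h>
//	#include <stdint.h>
//
//	int main(void)
//	{
//		for (int64_t len = 1; len <= 4096; len++) {
//			/* What the assembly computes: negate, clear low 4 bits. */
//			int64_t neg_rounded = -len & ~(int64_t)15;
//
//			/* Same as minus the length rounded up to 16 bytes. */
//			assert(neg_rounded == -((len + 15) & ~(int64_t)15));
//			/* Used as powers_end + neg_rounded, i.e. it steps back
//			 * exactly N = ceil(len/16) key powers. */
//			assert(-neg_rounded / 16 == (len + 15) / 16);
//		}
//		return 0;
//	}
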
	mov	$-1, %eax

	mov	$-1, %rax

// Encrypt a vector of counter blocks. This does not need to be masked.
// In the case of DATALEN < VL, the ciphertext is zero-padded to VL.
// (If decrypting, the zero-padding comes from the masked load; if encrypting,
// it's done by the below masked register-to-register move.) Note that
// if DATALEN <= VL - 16, there will be additional padding beyond the
// padding of the last block itself, and the extra padded blocks get included in
// GHASH. However, any such blocks are all-zeroes, and the values that
// they're multiplied with are also all-zeroes. Therefore they just add
// zero to the GHASH result, so they are harmless.
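
// The masking itself amounts to keeping only the low DATALEN bits of an
// all-ones value, one mask bit per byte lane, with the unselected bytes reading
// or writing as zero. A C model of that zero-padding (illustrative only; the
// real code uses an AVX-512 mask register):
//
//	#include <assert.h>
//	#include <stdint.h>
//	#include <string.h>
//
//	/* Mask with the low `len` bits set, for len <= 64 byte lanes. */
//	static uint64_t byte_mask(unsigned int len)
//	{
//		return len >= 64 ? ~0ULL : (1ULL << len) - 1;
//	}
//
//	int main(void)
//	{
//		uint8_t src[64], dst[64];
//		unsigned int datalen = 23;	/* 23 of 64 bytes are valid */
//		uint64_t mask = byte_mask(datalen);
//
//		for (int i = 0; i < 64; i++)
//			src[i] = (uint8_t)(i + 1);
//		memset(dst, 0, sizeof(dst));
//
//		/* Zero-masking load: unselected byte lanes stay zero, so the
//		 * tail of the vector ends up zero-padded. */
//		for (int i = 0; i < 64; i++)
//			if ((mask >> i) & 1)
//				dst[i] = src[i];
//
//		for (int i = 0; i < 64; i++)
//			assert(dst[i] == (i < (int)datalen ? src[i] : 0));
//		return 0;
//	}
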
	sub	$VL, DATALEN
// The encryption function then stores the full-length (16-byte) computed
// authentication tag. The decryption function instead loads the
// expected authentication tag (the one that was transmitted) from the 16-byte
// buffer supplied by the caller and compares it with the computed tag.
//
// %rax, %xmm0-%xmm3, and %k1 are used as temporary registers.
// Set up a counter block with 1 in the low 32-bit word. This is the
// counter block used for the tag.
//
// The GHASH lengths block contains the AAD length and
// the en/decrypted data length, both in big-endian byte order, a byte
// reflection of which is what is actually needed for
// GHASH (see _ghash_mul_step). By using little-endian values in the
// lengths block to begin with, the byte reflection is avoided.

	vmovdqu8	OFFSETOFEND_H_POWERS-16(KEY), H_POW1

	mov	$-1, %eax

// Start the AES encryption of the counter block by swapping the counter
// block to big-endian and XOR-ing it with the zero-th AES round key.
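
// For reference, the counter block for the tag and the lengths block come
// straight from the GCM spec. A C sketch in the spec's big-endian convention
// (illustrative values; the byte reflection discussed above is a separate,
// later step):
//
//	#include <stdint.h>
//	#include <stdio.h>
//	#include <string.h>
//
//	static void put_be64(uint8_t *p, uint64_t v)
//	{
//		for (int i = 0; i < 8; i++)
//			p[i] = (uint8_t)(v >> (56 - 8 * i));
//	}
//
//	int main(void)
//	{
//		uint8_t iv[12] = { 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
//				   0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b };
//		uint64_t aadlen = 20, datalen = 100;	/* byte counts */
//		uint8_t ctr1[16], lengths[16];
//
//		/* Counter block for the tag: IV || 0x00000001 (96-bit IV). */
//		memcpy(ctr1, iv, 12);
//		ctr1[12] = 0; ctr1[13] = 0; ctr1[14] = 0; ctr1[15] = 1;
//
//		/* Lengths block: 64-bit big-endian *bit* counts. */
//		put_be64(lengths, aadlen * 8);
//		put_be64(lengths + 8, datalen * 8);
//
//		for (int i = 0; i < 16; i++)
//			printf("%02x", ctr1[i]);
//		printf("\n");
//		for (int i = 0; i < 16; i++)
//			printf("%02x", lengths[i]);
//		printf("\n");
//		return 0;
//	}
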
	jl	128f		// AES-128?
	je	192f		// AES-192?
	// AES-256
	vaesenc	-13*16(%rax), %xmm0, %xmm0
	vaesenc	-12*16(%rax), %xmm0, %xmm0
	vaesenc	-11*16(%rax), %xmm0, %xmm0
	vaesenc	-10*16(%rax), %xmm0, %xmm0

	vaesenc	(\i-9)*16(%rax), %xmm0, %xmm0
// Compare the computed tag with the transmitted one by XOR-ing them and testing
// whether the result is all-zeroes. This should be constant-time.
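
// A plain C version of that kind of constant-time comparison (a generic sketch
// in the spirit of the kernel's crypto_memneq(), not this file's
// mask-register-based sequence):
//
//	#include <assert.h>
//	#include <stddef.h>
//	#include <stdint.h>
//
//	/* XOR the tags together and OR-accumulate the differences, so the
//	 * running time doesn't depend on where (or whether) they differ.
//	 * Returns 0 iff the tags are equal. */
//	static int tag_compare(const uint8_t *a, const uint8_t *b, size_t len)
//	{
//		uint8_t diff = 0;
//
//		for (size_t i = 0; i < len; i++)
//			diff |= a[i] ^ b[i];
//		return diff;
//	}
//
//	int main(void)
//	{
//		uint8_t t1[16] = { 1, 2, 3, 4, 5, 6, 7, 8,
//				   9, 10, 11, 12, 13, 14, 15, 16 };
//		uint8_t t2[16] = { 1, 2, 3, 4, 5, 6, 7, 8,
//				   9, 10, 11, 12, 13, 14, 15, 16 };
//		uint8_t t3[16] = { 1, 2, 3, 4, 5, 6, 7, 8,
//				   9, 10, 11, 12, 13, 14, 15, 99 };
//
//		assert(tag_compare(t1, t2, 16) == 0);
//		assert(tag_compare(t1, t3, 16) != 0);
//		return 0;
//	}
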
// This function updates GHASH with the associated
// data given by |aad| and |aadlen|. |key->ghash_key_powers| must have been
// initialized beforehand.
//
// AES-GCM is almost always used with small amounts of AAD, less than 32 bytes.
// Therefore, only a single implementation of AAD processing is provided,
// which uses 256-bit vectors (ymm registers) and only has a 1x-wide loop. This
// keeps the code size down, and it enables some micro-optimizations, e.g. using
// VEX-coded instructions instead of EVEX-coded to save some instruction bytes.
// To optimize for large amounts of AAD, we could implement a 4x-wide loop and
// provide a version using 512-bit vectors, but that doesn't seem to be useful.
.set	AADLEN64, %rcx		// Zero-extend AADLEN before using!

// %rax, %ymm0-%ymm3, and %k1 are used as temporary registers.
// Pre-subtracting 32 from AADLEN saves an instruction from the loop and,
// as a 32-bit operation, it also writes the full 64-bit register behind AADLEN,
// zero-extending it and allowing AADLEN64 to be used later.
	sub	$32, AADLEN
	vmovdqu8	OFFSETOFEND_H_POWERS-32(KEY), H_POW1	// [H^2, H^1]

	sub	$32, AADLEN

	mov	$-1, %eax

	and	$~15, AADLEN64	// -round_up(AADLEN, 16)