Lines Matching +full:8 +full:a
14 // Licensed under the Apache License 2.0 (the "License"). You may obtain a copy
39 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
81 // do an 8-register wide loop. Considering that and the fact that we have
96 // As a rough approximation, we can assume that Karatsuba multiplication is
98 // 0.25 pxors are cheaper than a pclmulqdq. (We assume that the 64-bit
114 // An additional 0.25 pclmulqdq per block (2 per 8 blocks) could be
115 // saved by using a multiplication-less reduction method. We don't do that
116 // because it would require a large number of shift and xor instructions,
119 // It does make sense to sometimes use a different reduction optimization
120 // that saves a pclmulqdq, though: precompute the hash key times x^64, and
124 // multi-block processing we use Karatsuba multiplication with a regular
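The tradeoff sketched in the comments above (Karatsuba trades one of schoolbook's four pclmulqdq per block for a few cheap XOR/shuffle operations) can be illustrated with a small C-intrinsics sketch. This is not this file's macro; the function name is invented, and it computes a plain 128x128-bit carryless product with no GHASH byte-reflection and no reduction, purely to show where the three pclmulqdq go.

/*
 * Illustrative only: 128x128 -> 256-bit carryless multiply, Karatsuba-style,
 * using 3 pclmulqdq instead of schoolbook's 4 at the cost of a few extra
 * pxor/pshufd.  Requires SSE2 + PCLMUL (compile with -mpclmul).
 */
#include <immintrin.h>

static void clmul128_karatsuba(__m128i a, __m128i b, __m128i *lo, __m128i *hi)
{
        __m128i ll = _mm_clmulepi64_si128(a, b, 0x00);  /* a_L * b_L */
        __m128i hh = _mm_clmulepi64_si128(a, b, 0x11);  /* a_H * b_H */

        /* (a_L ^ a_H) * (b_L ^ b_H) = a_L*b_H ^ a_H*b_L ^ ll ^ hh */
        __m128i ax = _mm_xor_si128(a, _mm_shuffle_epi32(a, 0x4e));
        __m128i bx = _mm_xor_si128(b, _mm_shuffle_epi32(b, 0x4e));
        __m128i mi = _mm_clmulepi64_si128(ax, bx, 0x00);

        mi = _mm_xor_si128(mi, _mm_xor_si128(ll, hh));  /* now a_L*b_H ^ a_H*b_L */

        *lo = _mm_xor_si128(ll, _mm_slli_si128(mi, 8)); /* product bits 0..127 */
        *hi = _mm_xor_si128(hh, _mm_srli_si128(mi, 8)); /* product bits 128..255 */
}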
139 // Loading 16 bytes from '.Lzeropad_mask + 16 - len' produces a mask of 'len' 0xff bytes followed by '16 - len' zero bytes.
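A rough C illustration of that mask trick follows; ZEROPAD_MASK and zeropad_block are invented names for the sketch. The idea is a 32-byte constant of sixteen 0xff bytes followed by sixteen zero bytes, read 16 bytes at a time starting at offset 16 - len, which keeps the first len bytes of a block and clears the rest.

#include <stddef.h>
#include <stdint.h>

/* Sixteen 0xff bytes; the remaining sixteen bytes are implicitly zero. */
static const uint8_t ZEROPAD_MASK[32] = {
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
};

/* Keep only the first 'len' (0 <= len <= 16) bytes of a 16-byte block. */
static void zeropad_block(uint8_t block[16], size_t len)
{
        for (size_t i = 0; i < 16; i++)
                block[i] &= ZEROPAD_MASK[16 - len + i];
}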
153 // Do a vpclmulqdq, or fall back to a movdqa and a pclmulqdq. The fallback
164 // Do a vpshufb, or fall back to a movdqa and a pshufb. The fallback assumes
175 // Do a vpand, or fall back to a movdqu and a pand. The fallback assumes that
187 // be a temporary xmm register.
198 // must be a temporary xmm register.
211 sub $8, %ecx // LEN - 8
215 movq (\src), \dst // Load first 8 bytes
216 mov (\src, %rcx), %rax // Load last 8 bytes
227 // Load 4 <= LEN <= 8 bytes.
250 sub $8, %ecx // LEN - 8
253 // Store 8 <= LEN <= 15 bytes.
258 mov %rax, (\dst, %rsi) // Store last LEN - 8 bytes
259 movq \src, (\dst) // Store first 8 bytes
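The partial-block loads and stores above use two overlapping 8-byte accesses instead of a byte loop. A hedged C sketch of the load side for 8 <= len <= 15 follows; load_partial_block is an invented name, memcpy stands in for the unaligned movq loads, and little-endian byte order (as on x86) is assumed.

#include <stdint.h>
#include <string.h>

static void load_partial_block(const uint8_t *src, size_t len, uint8_t block[16])
{
        uint64_t first, last;

        memcpy(&first, src, 8);                 /* bytes 0..7 */
        memcpy(&last, src + len - 8, 8);        /* bytes len-8..len-1 (overlaps the first load) */

        /* Drop the 16 - len overlapping low bytes; this also zero-pads the tail. */
        last = (len == 8) ? 0 : last >> (8 * (16 - len));

        memcpy(block, &first, 8);
        memcpy(block + 8, &last, 8);
}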
286 // Do one step of GHASH-multiplying \a by \b and storing the reduced product in
288 // \a_times_x64 must contain \a * x^64 in reduced form, \gfpoly must contain the
290 .macro _ghash_mul_step i, a, a_times_x64, b, gfpoly, t0, t1
292 // MI = (a_L * b_H) + ((a*x^64)_L * b_L)
294 _vpclmulqdq $0x01, \a, \b, \t0
300 // HI = (a_H * b_H) + ((a*x^64)_H * b_L)
302 _vpclmulqdq $0x11, \a, \b, \t1
313 .elseif \i == 8
320 // GHASH-multiply \a by \b and store the reduced product in \b.
322 .macro _ghash_mul a, a_times_x64, b, gfpoly, t0, t1
323 .irp i, 0,1,2,3,4,5,6,7,8,9
324 _ghash_mul_step \i, \a, \a_times_x64, \b, \gfpoly, \t0, \t1
328 // GHASH-multiply \a by \b and add the unreduced product to \lo, \mi, and \hi.
331 // two halves of \a XOR'd together, i.e. a_L + a_H. \b is clobbered.
332 .macro _ghash_mul_noreduce a, a_xored, b, lo, mi, hi, t0
335 _vpclmulqdq $0x00, \a, \b, \t0
343 pclmulqdq $0x11, \a, \b
374 // Do the first step of the GHASH update of a set of 8 ciphertext blocks.
378 // GHASH_ACC = (blk0+GHASH_ACC)*H^8 + blk1*H^7 + blk2*H^6 + blk3*H^5 + blk4*H^4 + blk5*H^3 + blk6*H^2 + blk7*H^1
382 // (blk0+GHASH_ACC)*H^8 and starts gathering the unreduced product in the xmm
383 // registers LO, MI, and GHASH_ACC a.k.a. HI. It also zero-initializes the
384 // inner block counter in %rax, which is a value that counts up by 8 for each
385 // block in the set of 8 and is used later to index by 8*blknum and 16*blknum.
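A portable C sketch of that 8-block update may help to see the structure. All names are invented; it collapses the assembly's separate LO/MI/HI into two unreduced accumulators, and it uses the plain polynomial bit order (GCC/Clang __int128) rather than GHASH's byte-reflected convention, so the point is the shape of the computation (Karatsuba multiplications, a single deferred reduction, hash key powers H^8 down to H^1), not the constants.

#include <stdint.h>
typedef unsigned __int128 u128;

/* 64x64 -> 128-bit carryless multiply: what one pclmulqdq computes. */
static u128 clmul64(uint64_t a, uint64_t b)
{
        u128 r = 0;
        for (int i = 0; i < 64; i++)
                if ((b >> i) & 1)
                        r ^= (u128)a << i;
        return r;
}

/* Karatsuba 128x128 -> 256-bit carryless multiply, XOR-accumulated into lo/hi. */
static void gf128_mul_noreduce(u128 a, u128 b, u128 *lo, u128 *hi)
{
        uint64_t a0 = (uint64_t)a, a1 = (uint64_t)(a >> 64);
        uint64_t b0 = (uint64_t)b, b1 = (uint64_t)(b >> 64);
        u128 ll = clmul64(a0, b0);
        u128 hh = clmul64(a1, b1);
        u128 mi = clmul64(a0 ^ a1, b0 ^ b1) ^ ll ^ hh;  /* a0*b1 ^ a1*b0 */

        *lo ^= ll ^ (mi << 64);
        *hi ^= hh ^ (mi >> 64);
}

/* Reduce a 256-bit carryless product modulo x^128 + x^7 + x^2 + x + 1. */
static u128 gf128_reduce(u128 lo, u128 hi)
{
        u128 fold = hi ^ (hi << 1) ^ (hi << 2) ^ (hi << 7);
        u128 carry = (hi >> 127) ^ (hi >> 126) ^ (hi >> 121);

        return lo ^ fold ^ carry ^ (carry << 1) ^ (carry << 2) ^ (carry << 7);
}

/*
 * One 8-block update with a single deferred reduction:
 *   acc = (blk[0] + acc)*H^8 + blk[1]*H^7 + ... + blk[7]*H^1
 * h_pow[] holds H^8 first and H^1 last, matching the formula above.
 */
static u128 ghash_update_8blocks(u128 acc, const u128 h_pow[8], const u128 blk[8])
{
        u128 lo = 0, hi = 0;

        gf128_mul_noreduce(blk[0] ^ acc, h_pow[0], &lo, &hi);
        for (int i = 1; i < 8; i++)
                gf128_mul_noreduce(blk[i], h_pow[i], &lo, &hi);
        return gf128_reduce(lo, hi);
}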
400 // Load the highest hash key power, H^8.
412 // 'b' that needs to be multiplied with the hash key power 'a'.
429 // Continue the GHASH update of 8 ciphertext blocks as described above by doing
431 // key power and accumulating the result into LO, MI, and GHASH_ACC a.k.a. HI.
433 add $8, %eax
464 // Reduce LO, MI, and GHASH_ACC a.k.a. HI into GHASH_ACC. This is similar to
543 movq %xmm0, OFFSETOF_H_POWERS_XORED+7*8(KEY)
545 // Compute and store the remaining key powers H^2 through H^8.
547 mov $6*8, %eax
557 sub $8, %eax
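The key-power setup these lines perform can be sketched in the same portable style, reusing gf128_mul_noreduce() and gf128_reduce() from the previous sketch (again, all names are invented). It also records each power's two 64-bit halves XORed together, mirroring the H_POWERS_XORED table stored above, which gives Karatsuba's middle multiplication a precomputed operand.

static u128 gf128_mul(u128 a, u128 b)
{
        u128 lo = 0, hi = 0;

        gf128_mul_noreduce(a, b, &lo, &hi);
        return gf128_reduce(lo, hi);
}

/* Derive H^2..H^8 from H and store them highest-power-first. */
static void precompute_h_powers(u128 h, u128 h_pow[8], uint64_t h_xored[8])
{
        h_pow[7] = h;                                   /* H^1 */
        for (int i = 6; i >= 0; i--)
                h_pow[i] = gf128_mul(h_pow[i + 1], h);  /* H^2 .. H^8 */

        /* Each power's halves XORed together ("H_POWERS_XORED"). */
        for (int i = 0; i < 8; i++)
                h_xored[i] = (uint64_t)h_pow[i] ^ (uint64_t)(h_pow[i] >> 64);
}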
569 // zeroes. |aadlen| must be a multiple of 16, except on the last call where it
594 // Process the AAD one full block at a time.
606 // Check whether there is a partial block at the end.
610 // Process a partial block of length 1 <= AADLEN <= 15.
635 // Do a non-last round of AES on AESDATA[0-7] using \round_key.
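An intrinsics illustration of that step, applying one non-final AES round to all 8 blocks, is shown below. The helper is invented and not this file's macro; the real code expands to eight aesenc instructions on xmm registers, and the final round would use aesenclast (_mm_aesenclast_si128) instead.

/* One non-final AES round on 8 blocks in parallel.  Requires AES-NI (-maes). */
#include <immintrin.h>

static inline void aes_round_x8(__m128i aesdata[8], __m128i round_key)
{
        for (int i = 0; i < 8; i++)
                aesdata[i] = _mm_aesenc_si128(aesdata[i], round_key);
}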
664 // This macro generates a GCM encryption or decryption update function with the
672 // |datalen| must be a multiple of 16, except on the last call where it can be
676 // |le_ctr| must give the current counter in little-endian format. For a new
731 // If there are at least 8*16 bytes of data, then continue into the main
732 // loop, which processes 8*16 bytes of data per iteration.
737 // available. When encrypting, we instead encrypt a set of 8 blocks
738 // first and then GHASH those blocks while encrypting the next set of 8,
739 // repeat that as needed, and finally GHASH the last set of 8 blocks.
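A rough, heavily simplified C outline of that scheduling for the encryption path is shown below. struct gcm_state, aes_ctr_x8(), and ghash_x8() are placeholders rather than this file's interface, and the real code interleaves the two operations instruction by instruction within each set rather than calling them back to back.

#include <stddef.h>
#include <stdint.h>

struct gcm_state;
void aes_ctr_x8(struct gcm_state *st, uint8_t *dst, const uint8_t *src);
void ghash_x8(struct gcm_state *st, const uint8_t *ciphertext);

/* Requires datalen >= 8*16; the sub-8*16 remainder is handled separately. */
static void gcm_enc_main_loop(struct gcm_state *st, uint8_t *dst,
                              const uint8_t *src, size_t datalen)
{
        aes_ctr_x8(st, dst, src);               /* encrypt the first set of 8 blocks */
        src += 8 * 16;
        dst += 8 * 16;
        datalen -= 8 * 16;

        while (datalen >= 8 * 16) {
                aes_ctr_x8(st, dst, src);       /* encrypt the next set... */
                ghash_x8(st, dst - 8 * 16);     /* ...while GHASHing the previous one */
                src += 8 * 16;
                dst += 8 * 16;
                datalen -= 8 * 16;
        }
        ghash_x8(st, dst - 8 * 16);             /* GHASH the final set */
}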
741 // Code size optimization: Prefer adding or subtracting -8*16 over 8*16,
742 // as this makes the immediate fit in a signed byte, saving 3 bytes.
743 add $-8*16, DATALEN
746 // Encrypt the first 8 plaintext blocks.
760 sub $-8*16, SRC
761 add $-8*16, DATALEN
768 // Generate the next set of 8 counter blocks and start encrypting them.
772 // Do a round of AES, and start the GHASH update of 8 ciphertext blocks
787 cmp $7*8, %eax
807 sub $-8*16, DST
810 sub $-8*16, SRC
812 sub $-8*16, DST
814 add $-8*16, DATALEN
819 // Update GHASH with the last set of 8 ciphertext blocks.
824 cmp $7*8, %eax
828 sub $-8*16, DST
833 sub $-8*16, DATALEN
836 // Handle the remainder of length 1 <= DATALEN < 8*16 bytes. We keep
838 // a time, again taking advantage of hardware loop unrolling. Since
854 // Set up a block counter %rax to contain 8*(8-n), where n is the number
866 // Process the data one full block at a time.
915 add $8, %eax
921 // Check whether there is a partial block at the end.
925 // Process a partial block of length 1 <= DATALEN <= 15.
927 // Encrypt a counter block for the last time.
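For the encryption case, a hedged C sketch of this partial-block step follows; aes_ctr_block() and ghash_update_block() are placeholder names, not this file's interface. Only len bytes of the final keystream block are used, and the ciphertext is zero-padded to a full 16 bytes before it is folded into GHASH.

#include <stdint.h>
#include <string.h>

void aes_ctr_block(const void *key, uint8_t counter_block[16], uint8_t keystream[16]);
void ghash_update_block(uint8_t ghash_acc[16], const uint8_t h[16], const uint8_t block[16]);

static void gcm_enc_partial_block(const void *key, uint8_t counter_block[16],
                                  uint8_t ghash_acc[16], const uint8_t h[16],
                                  uint8_t *dst, const uint8_t *src, size_t len)
{
        uint8_t keystream[16], ct[16] = { 0 };

        aes_ctr_block(key, counter_block, keystream);   /* encrypt the last counter block */
        for (size_t i = 0; i < len; i++)
                ct[i] = src[i] ^ keystream[i];          /* use only 'len' keystream bytes */
        memcpy(dst, ct, len);
        ghash_update_block(ghash_acc, h, ct);           /* zero-padded ciphertext into GHASH */
}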
1007 .set TAGLEN, %r10d // Originally at 8(%rsp)
1023 // Set up a counter block with 1 in the low 32-bit word. This is the
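For background, standard GCM with a 96-bit IV forms the block that encrypts the auth tag as IV || 0x00000001, with a big-endian 32-bit counter in the last four bytes; since this implementation keeps its counter in little-endian form (see the |le_ctr| note above) and byte-swaps before encrypting, the 1 ends up in the low 32-bit word here. A minimal C sketch of the standard construction, with an invented function name:

#include <stdint.h>
#include <string.h>

static void gcm_initial_counter_block(const uint8_t iv[12], uint8_t j0[16])
{
        memcpy(j0, iv, 12);
        j0[12] = 0;
        j0[13] = 0;
        j0[14] = 0;
        j0[15] = 1;     /* 32-bit big-endian counter value 1 */
}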
1057 .irp i, 0,1,2,3,4,5,6,7,8
1080 movl 8(%rsp), TAGLEN