Lines Matching +full:8 +full:a
14 // Licensed under the Apache License 2.0 (the "License"). You may obtain a copy
39 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
81 // do an 8-register wide loop. Considering that and the fact that we have
96 // As a rough approximation, we can assume that Karatsuba multiplication is
98 // 0.25 pxors are cheaper than a pclmulqdq. (We assume that the 64-bit
114 // An additional 0.25 pclmulqdq per block (2 per 8 blocks) could be
115 // saved by using a multiplication-less reduction method. We don't do that
116 // because it would require a large number of shift and xor instructions,
119 // It does make sense to sometimes use a different reduction optimization
120 // that saves a pclmulqdq, though: precompute the hash key times x^64, and
124 // multi-block processing we use Karatsuba multiplication with a regular
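The tradeoff sketched in the comments above (Karatsuba trades one of schoolbook's four pclmulqdq per block for a few cheap XOR/shuffle operations) can be illustrated with a small C-intrinsics sketch. This is not this file's macro; the function name is invented, and it computes a plain 128x128-bit carryless product with no GHASH byte-reflection and no reduction, purely to show where the three pclmulqdq go.

/*
 * Illustrative only: 128x128 -> 256-bit carryless multiply, Karatsuba-style,
 * using 3 pclmulqdq instead of schoolbook's 4 at the cost of a few extra
 * pxor/pshufd.  Requires SSE2 + PCLMUL (compile with -mpclmul).
 */
#include <immintrin.h>

static void clmul128_karatsuba(__m128i a, __m128i b, __m128i *lo, __m128i *hi)
{
        __m128i ll = _mm_clmulepi64_si128(a, b, 0x00);  /* a_L * b_L */
        __m128i hh = _mm_clmulepi64_si128(a, b, 0x11);  /* a_H * b_H */

        /* (a_L ^ a_H) * (b_L ^ b_H) = a_L*b_H ^ a_H*b_L ^ ll ^ hh */
        __m128i ax = _mm_xor_si128(a, _mm_shuffle_epi32(a, 0x4e));
        __m128i bx = _mm_xor_si128(b, _mm_shuffle_epi32(b, 0x4e));
        __m128i mi = _mm_clmulepi64_si128(ax, bx, 0x00);

        mi = _mm_xor_si128(mi, _mm_xor_si128(ll, hh));  /* now a_L*b_H ^ a_H*b_L */

        *lo = _mm_xor_si128(ll, _mm_slli_si128(mi, 8)); /* product bits 0..127 */
        *hi = _mm_xor_si128(hh, _mm_srli_si128(mi, 8)); /* product bits 128..255 */
}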
139 // Loading 16 bytes from '.Lzeropad_mask + 16 - len' produces a mask of 'len' 0xff bytes followed by '16 - len' zero bytes.
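A rough C illustration of that mask trick follows; ZEROPAD_MASK and zeropad_block are invented names for the sketch. The idea is a 32-byte constant of sixteen 0xff bytes followed by sixteen zero bytes, read 16 bytes at a time starting at offset 16 - len, which keeps the first len bytes of a block and clears the rest.

#include <stddef.h>
#include <stdint.h>

/* Sixteen 0xff bytes; the remaining sixteen bytes are implicitly zero. */
static const uint8_t ZEROPAD_MASK[32] = {
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
};

/* Keep only the first 'len' (0 <= len <= 16) bytes of a 16-byte block. */
static void zeropad_block(uint8_t block[16], size_t len)
{
        for (size_t i = 0; i < 16; i++)
                block[i] &= ZEROPAD_MASK[16 - len + i];
}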
153 // Do a vpclmulqdq, or fall back to a movdqa and a pclmulqdq. The fallback
164 // Do a vpshufb, or fall back to a movdqa and a pshufb. The fallback assumes
175 // Do a vpand, or fall back to a movdqu and a pand. The fallback assumes that
187 // be a temporary xmm register.
198 // must be a temporary xmm register.
211 sub $8, %ecx // LEN - 8
215 movq (\src), \dst // Load first 8 bytes
216 mov (\src, %rcx), %rax // Load last 8 bytes
227 // Load 4 <= LEN <= 8 bytes.
250 sub $8, %ecx // LEN - 8
253 // Store 8 <= LEN <= 15 bytes.
258 mov %rax, (\dst, %rsi) // Store last LEN - 8 bytes
259 movq \src, (\dst) // Store first 8 bytes
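The partial-block loads and stores above use two overlapping 8-byte accesses instead of a byte loop. A hedged C sketch of the load side for 8 <= len <= 15 follows; load_partial_block is an invented name, memcpy stands in for the unaligned movq loads, and little-endian byte order (as on x86) is assumed.

#include <stdint.h>
#include <string.h>

static void load_partial_block(const uint8_t *src, size_t len, uint8_t block[16])
{
        uint64_t first, last;

        memcpy(&first, src, 8);                 /* bytes 0..7 */
        memcpy(&last, src + len - 8, 8);        /* bytes len-8..len-1 (overlaps the first load) */

        /* Drop the 16 - len overlapping low bytes; this also zero-pads the tail. */
        last = (len == 8) ? 0 : last >> (8 * (16 - len));

        memcpy(block, &first, 8);
        memcpy(block + 8, &last, 8);
}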
286 // Do one step of GHASH-multiplying \a by \b and storing the reduced product in
288 // \a_times_x64 must contain \a * x^64 in reduced form, \gfpoly must contain the
290 .macro _ghash_mul_step i, a, a_times_x64, b, gfpoly, t0, t1
292 // MI = (a_L * b_H) + ((a*x^64)_L * b_L)
294 _vpclmulqdq $0x01, \a, \b, \t0
300 // HI = (a_H * b_H) + ((a*x^64)_H * b_L)
302 _vpclmulqdq $0x11, \a, \b, \t1
313 .elseif \i == 8
320 // GHASH-multiply \a by \b and store the reduced product in \b.
322 .macro _ghash_mul a, a_times_x64, b, gfpoly, t0, t1
323 .irp i, 0,1,2,3,4,5,6,7,8,9
324 _ghash_mul_step \i, \a, \a_times_x64, \b, \gfpoly, \t0, \t1
328 // GHASH-multiply \a by \b and add the unreduced product to \lo, \mi, and \hi.
331 // two halves of \a XOR'd together, i.e. a_L + a_H. \b is clobbered.
332 .macro _ghash_mul_noreduce a, a_xored, b, lo, mi, hi, t0
335 _vpclmulqdq $0x00, \a, \b, \t0
343 pclmulqdq $0x11, \a, \b
374 // Do the first step of the GHASH update of a set of 8 ciphertext blocks.
378 // GHASH_ACC = (blk0+GHASH_ACC)*H^8 + blk1*H^7 + blk2*H^6 + blk3*H^5 + blk4*H^4 + blk5*H^3 + blk6*H^2 + blk7*H^1
382 // (blk0+GHASH_ACC)*H^8 and starts gathering the unreduced product in the xmm
383 // registers LO, MI, and GHASH_ACC a.k.a. HI. It also zero-initializes the
384 // inner block counter in %rax, which is a value that counts up by 8 for each
385 // block in the set of 8 and is used later to index by 8*blknum and 16*blknum.
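A portable C sketch of that 8-block update may help to see the structure. All names are invented; it collapses the assembly's separate LO/MI/HI into two unreduced accumulators, and it uses the plain polynomial bit order (GCC/Clang __int128) rather than GHASH's byte-reflected convention, so the point is the shape of the computation (Karatsuba multiplications, a single deferred reduction, hash key powers H^8 down to H^1), not the constants.

#include <stdint.h>
typedef unsigned __int128 u128;

/* 64x64 -> 128-bit carryless multiply: what one pclmulqdq computes. */
static u128 clmul64(uint64_t a, uint64_t b)
{
        u128 r = 0;
        for (int i = 0; i < 64; i++)
                if ((b >> i) & 1)
                        r ^= (u128)a << i;
        return r;
}

/* Karatsuba 128x128 -> 256-bit carryless multiply, XOR-accumulated into lo/hi. */
static void gf128_mul_noreduce(u128 a, u128 b, u128 *lo, u128 *hi)
{
        uint64_t a0 = (uint64_t)a, a1 = (uint64_t)(a >> 64);
        uint64_t b0 = (uint64_t)b, b1 = (uint64_t)(b >> 64);
        u128 ll = clmul64(a0, b0);
        u128 hh = clmul64(a1, b1);
        u128 mi = clmul64(a0 ^ a1, b0 ^ b1) ^ ll ^ hh;  /* a0*b1 ^ a1*b0 */

        *lo ^= ll ^ (mi << 64);
        *hi ^= hh ^ (mi >> 64);
}

/* Reduce a 256-bit carryless product modulo x^128 + x^7 + x^2 + x + 1. */
static u128 gf128_reduce(u128 lo, u128 hi)
{
        u128 fold = hi ^ (hi << 1) ^ (hi << 2) ^ (hi << 7);
        u128 carry = (hi >> 127) ^ (hi >> 126) ^ (hi >> 121);

        return lo ^ fold ^ carry ^ (carry << 1) ^ (carry << 2) ^ (carry << 7);
}

/*
 * One 8-block update with a single deferred reduction:
 *   acc = (blk[0] + acc)*H^8 + blk[1]*H^7 + ... + blk[7]*H^1
 * h_pow[] holds H^8 first and H^1 last, matching the formula above.
 */
static u128 ghash_update_8blocks(u128 acc, const u128 h_pow[8], const u128 blk[8])
{
        u128 lo = 0, hi = 0;

        gf128_mul_noreduce(blk[0] ^ acc, h_pow[0], &lo, &hi);
        for (int i = 1; i < 8; i++)
                gf128_mul_noreduce(blk[i], h_pow[i], &lo, &hi);
        return gf128_reduce(lo, hi);
}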
400 // Load the highest hash key power, H^8.
412 // 'b' that needs to be multiplied with the hash key power 'a'.
429 // Continue the GHASH update of 8 ciphertext blocks as described above by doing
431 // key power and accumulating the result into LO, MI, and GHASH_ACC a.k.a. HI.
433 add $8, %eax
464 // Reduce LO, MI, and GHASH_ACC a.k.a. HI into GHASH_ACC. This is similar to
543 movq %xmm0, OFFSETOF_H_POWERS_XORED+7*8(KEY)
545 // Compute and store the remaining key powers H^2 through H^8.
547 mov $6*8, %eax
557 sub $8, %eax
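The key-power setup these lines perform can be sketched in the same portable style, reusing gf128_mul_noreduce() and gf128_reduce() from the previous sketch (again, all names are invented). It also records each power's two 64-bit halves XORed together, mirroring the H_POWERS_XORED table stored above, which gives Karatsuba's middle multiplication a precomputed operand.

static u128 gf128_mul(u128 a, u128 b)
{
        u128 lo = 0, hi = 0;

        gf128_mul_noreduce(a, b, &lo, &hi);
        return gf128_reduce(lo, hi);
}

/* Derive H^2..H^8 from H and store them highest-power-first. */
static void precompute_h_powers(u128 h, u128 h_pow[8], uint64_t h_xored[8])
{
        h_pow[7] = h;                                   /* H^1 */
        for (int i = 6; i >= 0; i--)
                h_pow[i] = gf128_mul(h_pow[i + 1], h);  /* H^2 .. H^8 */

        /* Each power's halves XORed together ("H_POWERS_XORED"). */
        for (int i = 0; i < 8; i++)
                h_xored[i] = (uint64_t)h_pow[i] ^ (uint64_t)(h_pow[i] >> 64);
}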
569 // zeroes. |aadlen| must be a multiple of 16, except on the last call where it
594 // Process the AAD one full block at a time.
606 // Check whether there is a partial block at the end.
610 // Process a partial block of length 1 <= AADLEN <= 15.
635 // Do a non-last round of AES on AESDATA[0-7] using \round_key.
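An intrinsics illustration of that step, applying one non-final AES round to all 8 blocks, is shown below. The helper is invented and not this file's macro; the real code expands to eight aesenc instructions on xmm registers, and the final round would use aesenclast (_mm_aesenclast_si128) instead.

/* One non-final AES round on 8 blocks in parallel.  Requires AES-NI (-maes). */
#include <immintrin.h>

static inline void aes_round_x8(__m128i aesdata[8], __m128i round_key)
{
        for (int i = 0; i < 8; i++)
                aesdata[i] = _mm_aesenc_si128(aesdata[i], round_key);
}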
664 // This macro generates a GCM encryption or decryption update function with the
672 // |datalen| must be a multiple of 16, except on the last call where it can be
676 // |le_ctr| must give the current counter in little-endian format. For a new
731 // If there are at least 8*16 bytes of data, then continue into the main
732 // loop, which processes 8*16 bytes of data per iteration.
737 // available. When encrypting, we instead encrypt a set of 8 blocks
738 // first and then GHASH those blocks while encrypting the next set of 8,
739 // repeat that as needed, and finally GHASH the last set of 8 blocks.
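A rough, heavily simplified C outline of that scheduling for the encryption path is shown below. struct gcm_state, aes_ctr_x8(), and ghash_x8() are placeholders rather than this file's interface, and the real code interleaves the two operations instruction by instruction within each set rather than calling them back to back.

#include <stddef.h>
#include <stdint.h>

struct gcm_state;
void aes_ctr_x8(struct gcm_state *st, uint8_t *dst, const uint8_t *src);
void ghash_x8(struct gcm_state *st, const uint8_t *ciphertext);

/* Requires datalen >= 8*16; the sub-8*16 remainder is handled separately. */
static void gcm_enc_main_loop(struct gcm_state *st, uint8_t *dst,
                              const uint8_t *src, size_t datalen)
{
        aes_ctr_x8(st, dst, src);               /* encrypt the first set of 8 blocks */
        src += 8 * 16;
        dst += 8 * 16;
        datalen -= 8 * 16;

        while (datalen >= 8 * 16) {
                aes_ctr_x8(st, dst, src);       /* encrypt the next set... */
                ghash_x8(st, dst - 8 * 16);     /* ...while GHASHing the previous one */
                src += 8 * 16;
                dst += 8 * 16;
                datalen -= 8 * 16;
        }
        ghash_x8(st, dst - 8 * 16);             /* GHASH the final set */
}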
741 // Code size optimization: Prefer adding or subtracting -8*16 over 8*16,
742 // as this makes the immediate fit in a signed byte, saving 3 bytes.
743 add $-8*16, DATALEN
746 // Encrypt the first 8 plaintext blocks.
760 sub $-8*16, SRC
761 add $-8*16, DATALEN
768 // Generate the next set of 8 counter blocks and start encrypting them.
772 // Do a round of AES, and start the GHASH update of 8 ciphertext blocks
787 cmp $7*8, %eax
807 sub $-8*16, DST
810 sub $-8*16, SRC
812 sub $-8*16, DST
814 add $-8*16, DATALEN
819 // Update GHASH with the last set of 8 ciphertext blocks.
824 cmp $7*8, %eax
828 sub $-8*16, DST
833 sub $-8*16, DATALEN
836 // Handle the remainder of length 1 <= DATALEN < 8*16 bytes. We keep
838 // a time, again taking advantage of hardware loop unrolling. Since
854 // Set up a block counter %rax to contain 8*(8-n), where n is the number
866 // Process the data one full block at a time.
915 add $8, %eax
921 // Check whether there is a partial block at the end.
925 // Process a partial block of length 1 <= DATALEN <= 15.
927 // Encrypt a counter block for the last time.
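For the encryption case, a hedged C sketch of this partial-block step follows; aes_ctr_block() and ghash_update_block() are placeholder names, not this file's interface. Only len bytes of the final keystream block are used, and the ciphertext is zero-padded to a full 16 bytes before it is folded into GHASH.

#include <stdint.h>
#include <string.h>

void aes_ctr_block(const void *key, uint8_t counter_block[16], uint8_t keystream[16]);
void ghash_update_block(uint8_t ghash_acc[16], const uint8_t h[16], const uint8_t block[16]);

static void gcm_enc_partial_block(const void *key, uint8_t counter_block[16],
                                  uint8_t ghash_acc[16], const uint8_t h[16],
                                  uint8_t *dst, const uint8_t *src, size_t len)
{
        uint8_t keystream[16], ct[16] = { 0 };

        aes_ctr_block(key, counter_block, keystream);   /* encrypt the last counter block */
        for (size_t i = 0; i < len; i++)
                ct[i] = src[i] ^ keystream[i];          /* use only 'len' keystream bytes */
        memcpy(dst, ct, len);
        ghash_update_block(ghash_acc, h, ct);           /* zero-padded ciphertext into GHASH */
}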
1007 .set TAGLEN, %r10d // Originally at 8(%rsp)
1023 // Set up a counter block with 1 in the low 32-bit word. This is the
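For background, standard GCM with a 96-bit IV forms the block that encrypts the auth tag as IV || 0x00000001, with a big-endian 32-bit counter in the last four bytes; since this implementation keeps its counter in little-endian form (see the |le_ctr| note above) and byte-swaps before encrypting, the 1 ends up in the low 32-bit word here. A minimal C sketch of the standard construction, with an invented function name:

#include <stdint.h>
#include <string.h>

static void gcm_initial_counter_block(const uint8_t iv[12], uint8_t j0[16])
{
        memcpy(j0, iv, 12);
        j0[12] = 0;
        j0[13] = 0;
        j0[14] = 0;
        j0[15] = 1;     /* 32-bit big-endian counter value 1 */
}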
1057 .irp i, 0,1,2,3,4,5,6,7,8
1080 movl 8(%rsp), TAGLEN