x86/crypto/crct10dif-pcl-asm_64.S

2 # Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
50 #  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
68 	movdqu	\offset+16(buf), %xmm12
95 # Assumes len >= 16.
107 	movdqu	16*0(buf), %xmm0
108 	movdqu	16*1(buf), %xmm1
109 	movdqu	16*2(buf), %xmm2
110 	movdqu	16*3(buf), %xmm3
111 	movdqu	16*4(buf), %xmm4
112 	movdqu	16*5(buf), %xmm5
113 	movdqu	16*6(buf), %xmm6
114 	movdqu	16*7(buf), %xmm7
125 	# XOR the first 16 data *bits* with the initial CRC value.
136 	# While >= 128 data bytes remain (not counting xmm0-7), fold the 128
137 	# bytes xmm0-7 into them, storing the result back into xmm0-7.
147 	# Now fold the 112 bytes in xmm0-xmm6 into the 16 bytes in xmm7.
159 	# Fold across 16 bytes.
165 	# Then subtract 16 to simplify the termination condition of the
167 	add	$128-16, len
169 	# While >= 16 data bytes remain (not counting xmm7), fold the 16 bytes
180 	add	$16, buf
181 	sub	$16, len
185 	# Add 16 to get the correct number of data bytes remaining in 0...15
186 	# (not counting xmm7), following the previous extra subtraction by 16.
187 	add	$16, len
191 	# Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first 16
194 	# the bytes into a first chunk of 'len' bytes and a second chunk of 16
199 	# xmm1 = last 16 original data bytes
200 	movdqu	-16(buf, len), %xmm1
203 	# xmm2 = high order part of second chunk: xmm7 left-shifted by 'len' bytes.
204 	lea	.Lbyteshift_table+16(%rip), %rax
209 	# xmm7 = first chunk: xmm7 right-shifted by '16-len' bytes.
213 	# xmm1 = second chunk: 'len' bytes from xmm1 (low-order bytes),
214 	# then '16-len' bytes from xmm2 (high-order bytes).
225 	# Reduce the 128-bit value M(x), stored in xmm7, to the final 16-bit CRC
230 	# Fold the high 64 bits into the low 64 bits, while also multiplying by
231 	# x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
232 	# whose low 48 bits are 0.
234 	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7 # high bits * x^48 * (x^80 mod G(x))
236 	pxor	%xmm0, %xmm7			  # + low bits * x^64
238 	# Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
239 	# value congruent to x^64 * M(x) and whose low 48 bits are 0.
241 	pand	.Lmask2(%rip), %xmm0		  # zero high 32 bits
242 	psrldq	$12, %xmm7			  # extract high 32 bits
243 	pclmulqdq	$0x00, FOLD_CONSTS, %xmm7 # high 32 bits * x^48 * (x^48 mod G(x))
244 	pxor	%xmm0, %xmm7			  # + low bits
251 	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7 # high 32 bits * floor(x^48 / G(x))
255 	pxor	%xmm7, %xmm0		     # + low 16 nonzero bits
256 	# Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of xmm0.
261 .align 16
263 	# Checksumming a buffer of length 16...255 bytes
265 	# Load the first 16 data bytes.
268 	add	$16, buf
270 	# XOR the first 16 data *bits* with the initial CRC value.
276 	cmp	$16, len
277 	je	.Lreduce_final_16_bytes		# len == 16
280 	add	$16, len
285 .align 16
288 # G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
308 .section	.rodata.cst16.mask1, "aM", @progbits, 16
309 .align 16
313 .section	.rodata.cst16.mask2, "aM", @progbits, 16
314 .align 16
318 .section	.rodata.cst16.bswap_mask, "aM", @progbits, 16
319 .align 16
324 .align 16
325 # For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - len]
327 # 0x80} XOR the index vector to shift right by '16 - len' bytes.