x86/crypto/sha512-avx2-asm.S

166 	# Extract w[t-7]
167 	MY_VPALIGNR	YTMP0, Y_3, Y_2, 8		# YTMP0 = W[-7]
168 	# Calculate w[t-16] + w[t-7]
169 	vpaddq		Y_0, YTMP0, YTMP0		# YTMP0 = W[-7] + W[-16]
170 	# Extract w[t-15]
171 	MY_VPALIGNR	YTMP1, Y_1, Y_0, 8		# YTMP1 = W[-15]
175 	# Calculate w[t-15] ror 1
178 	vpor		YTMP2, YTMP3, YTMP3		# YTMP3 = W[-15] ror 1
179 	# Calculate w[t-15] shr 7
180 	vpsrlq		$7, YTMP1, YTMP4		# YTMP4 = W[-15] >> 7
185 	add	frame_XFER(%rsp),h		# h = k + w + h         # --
197 	add	h, d		# d = k + w + h + d                     # --
210 	add	y1, h		# h = k + w + h + S0                    # --
212 	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
214 	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
221 	# Calculate w[t-15] ror 8
224 	vpor		YTMP2, YTMP1, YTMP1		# YTMP1 = W[-15] ror 8
226 	vpxor		YTMP4, YTMP3, YTMP3		# YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
230 	# Add three components, w[t-16], w[t-7] and sigma0
231 	vpaddq		YTMP1, YTMP0, YTMP0		# YTMP0 = W[-16] + W[-7] + s0
232 	# Move to appropriate lanes for calculating w[16] and w[17]
233 	vperm2f128	$0x0, YTMP0, YTMP0, Y_0		# Y_0 = W[-16] + W[-7] + s0 {BABA}
234 	# Move to appropriate lanes for calculating w[18] and w[19]
235 	vpand		MASK_YMM_LO(%rip), YTMP0, YTMP0	# YTMP0 = W[-16] + W[-7] + s0 {DC00}
237 	# Calculate w[16] and w[17] in both 128 bit lanes
239 	# Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
240 	vperm2f128	$0x11, Y_3, Y_3, YTMP2		# YTMP2 = W[-2] {BABA}
241 	vpsrlq		$6, YTMP2, YTMP4		# YTMP4 = W[-2] >> 6 {BABA}
247 	add	1*8+frame_XFER(%rsp), h		# h = k + w + h         # --
261 	add	h, d		# d = k + w + h + d                     # --
275 	add	y1, h		# h = k + w + h + S0                    # --
277 	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
278 	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
286 	vpsrlq		$19, YTMP2, YTMP3		# YTMP3 = W[-2] >> 19 {BABA}
287 	vpsllq		$(64-19), YTMP2, YTMP1		# YTMP1 = W[-2] << (64-19) {BABA}
288 	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 19 {BABA}
289 	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
290 	vpsrlq		$61, YTMP2, YTMP3		# YTMP3 = W[-2] >> 61 {BABA}
291 	vpsllq		$(64-61), YTMP2, YTMP1		# YTMP1 = W[-2] << (64-61) {BABA}
292 	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 61 {BABA}
293 	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = s1 = (W[-2] ror 19) ^
294 							#  (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}
296 	# Add sigma1 to the other components to get w[16] and w[17]
297 	vpaddq		YTMP4, Y_0, Y_0			# Y_0 = {W[1], W[0], W[1], W[0]}
299 	# Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
300 	vpsrlq		$6, Y_0, YTMP4			# YTMP4 = W[-2] >> 6 {DC--}
304 	add	2*8+frame_XFER(%rsp), h		# h = k + w + h         # --
316 	add	h, d		# d = k + w + h + d                     # --
332 	add	y1, h		# h = k + w + h + S0                    # --
333 	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
334 	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
342 	vpsrlq		$19, Y_0, YTMP3			# YTMP3 = W[-2] >> 19 {DC--}
343 	vpsllq		$(64-19), Y_0, YTMP1		# YTMP1 = W[-2] << (64-19) {DC--}
344 	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 19 {DC--}
345 	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
346 	vpsrlq		$61, Y_0, YTMP3			# YTMP3 = W[-2] >> 61 {DC--}
347 	vpsllq		$(64-61), Y_0, YTMP1		# YTMP1 = W[-2] << (64-61) {DC--}
348 	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 61 {DC--}
349 	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = s1 = (W[-2] ror 19) ^
350 							#  (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}
352 	# Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
353 	# to newly calculated sigma1 to get w[18] and w[19]
354 	vpaddq		YTMP4, YTMP0, YTMP2		# YTMP2 = {W[3], W[2], --, --}
356 	# Form w[19], w[18], w[17], w[16]
357 	vpblendd		$0xF0, YTMP2, Y_0, Y_0		# Y_0 = {W[3], W[2], W[1], W[0]}
362 	add	3*8+frame_XFER(%rsp), h		# h = k + w + h         # --
374 	add	h, d		# d = k + w + h + d                     # --
384 	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
393 	add	y1, h		# h = k + w + h + S0                    # --
394 	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
423 	add	frame_XFER(%rsp), h		# h = k + w + h         # --
432 	add	h, d		# d = k + w + h + d                     # --
434 	add	y1, h		# h = k + w + h + S0                    # --
436 	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
442 	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
461 	add	8*1+frame_XFER(%rsp), h		# h = k + w + h         # --
470 	add	h, d		# d = k + w + h + d                     # --
472 	add	y1, h		# h = k + w + h + S0                    # --
474 	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
480 	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
499 	add	8*2+frame_XFER(%rsp), h		# h = k + w + h         # --
508 	add	h, d		# d = k + w + h + d                     # --
510 	add	y1, h		# h = k + w + h + S0                    # --
512 	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
518 	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
537 	add	8*3+frame_XFER(%rsp), h		# h = k + w + h         # --
547 	add	h, d		# d = k + w + h + d                     # --
549 	add	y1, h		# h = k + w + h + S0                    # --
551 	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
553 	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --