Lines Matching full:w

166 	# Extract w[t-7]
167 MY_VPALIGNR YTMP0, Y_3, Y_2, 8 # YTMP0 = W[-7]
168 # Calculate w[t-16] + w[t-7]
169 vpaddq Y_0, YTMP0, YTMP0 # YTMP0 = W[-7] + W[-16]
170 # Extract w[t-15]
171 MY_VPALIGNR YTMP1, Y_1, Y_0, 8 # YTMP1 = W[-15]
175 # Calculate w[t-15] ror 1
178 vpor YTMP2, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1
179 # Calculate w[t-15] shr 7
180 vpsrlq $7, YTMP1, YTMP4 # YTMP4 = W[-15] >> 7
185 add frame_XFER(%rsp),h # h = k + w + h # --
197 add h, d # d = k + w + h + d # --
210 add y1, h # h = k + w + h + S0 # --
212 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
214 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
221 # Calculate w[t-15] ror 8
224 vpor YTMP2, YTMP1, YTMP1 # YTMP1 = W[-15] ror 8
226 vpxor YTMP4, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
230 # Add three components, w[t-16], w[t-7] and sigma0
231 vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0
232 # Move to appropriate lanes for calculating w[16] and w[17]
233 vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA}
234 # Move to appropriate lanes for calculating w[18] and w[19]
235 vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00}
237 # Calculate w[16] and w[17] in both 128 bit lanes
239 # Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
240 vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA}
241 vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA}
247 add 1*8+frame_XFER(%rsp), h # h = k + w + h # --
261 add h, d # d = k + w + h + d # --
275 add y1, h # h = k + w + h + S0 # --
277 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
278 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
286 vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA}
287 vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << (64-19) {BABA}
288 vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA}
289 vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
290 vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA}
291 vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << (64-61) {BABA}
292 vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {BABA}
293 vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
294 # (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}
296 # Add sigma1 to the other components to get w[16] and w[17]
297 vpaddq YTMP4, Y_0, Y_0 # Y_0 = {W[1], W[0], W[1], W[0]}
299 # Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
300 vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--}
304 add 2*8+frame_XFER(%rsp), h # h = k + w + h # --
316 add h, d # d = k + w + h + d # --
332 add y1, h # h = k + w + h + S0 # --
333 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
334 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
342 vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--}
343 vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << (64-19) {DC--}
344 vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--}
345 vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
346 vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--}
347 vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << (64-61) {DC--}
348 vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--}
349 vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
350 # (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}
352 # Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
353 # to newly calculated sigma1 to get w[18] and w[19]
354 vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --}
356 # Form w[19], w[18], w[17], w[16]
357 vpblendd $0xF0, YTMP2, Y_0, Y_0 # Y_0 = {W[3], W[2], W[1], W[0]}
362 add 3*8+frame_XFER(%rsp), h # h = k + w + h # --
374 add h, d # d = k + w + h + d # --
384 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
393 add y1, h # h = k + w + h + S0 # --
394 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
423 add frame_XFER(%rsp), h # h = k + w + h # --
432 add h, d # d = k + w + h + d # --
434 add y1, h # h = k + w + h + S0 # --
436 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
442 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
461 add 8*1+frame_XFER(%rsp), h # h = k + w + h # --
470 add h, d # d = k + w + h + d # --
472 add y1, h # h = k + w + h + S0 # --
474 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
480 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
499 add 8*2+frame_XFER(%rsp), h # h = k + w + h # --
508 add h, d # d = k + w + h + d # --
510 add y1, h # h = k + w + h + S0 # --
512 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
518 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
537 add 8*3+frame_XFER(%rsp), h # h = k + w + h # --
547 add h, d # d = k + w + h + d # --
549 add y1, h # h = k + w + h + S0 # --
551 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
553 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --