Lines Matching full:w
62 /* we keep a window of 64 w[i]+K pre-calculated values in a circular buffer */
250 * RR does two rounds of SHA-1 back to back with W[] pre-calc
251 * t1 = F(b, c, d); e += w(i)
252 * e += t1; b <<= 30; d += w(i+1);
312 .set W, W0 define
320 .set W_minus_32, W
331 .set W_minus_04, W
332 .set W, W_minus_32 define
353 movdqa W_TMP1, W
364 * - calculating last 32 w[i] values in 8 XMM registers
365 * - pre-calculate K+w[i] values and store to mem, for later load by ALU add
368 * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3]
375 movdqa W_minus_12, W
376 palignr $8, W_minus_16, W # w[i-14]
378 psrldq $4, W_TMP1 # w[i-3]
379 pxor W_minus_08, W
382 pxor W_TMP1, W
383 movdqa W, W_TMP2
384 movdqa W, W_TMP1
387 psrld $31, W
389 por W, W_TMP1
390 movdqa W_TMP2, W
392 pslld $2, W
394 pxor W, W_TMP1
396 movdqa W_TMP1, W
405 * in the SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
406 * instead we use the equivalent: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
407 * this allows more efficient vectorization since the w[i]=>w[i-3] dependency is broken
412 pxor W_minus_28, W # W is W_minus_32 before xor
415 pxor W_minus_16, W
416 pxor W_TMP1, W
417 movdqa W, W_TMP1
419 psrld $30, W
421 por W, W_TMP1
423 movdqa W_TMP1, W
490 vpshufb XMM_SHUFB_BSWAP, W_TMP1, W
492 vpaddd (K_BASE), W, W_TMP1
501 vpalignr $8, W_minus_16, W_minus_12, W # w[i-14]
502 vpsrldq $4, W_minus_04, W_TMP1 # w[i-3]
503 vpxor W_minus_08, W, W
506 vpxor W_TMP1, W, W
507 vpslldq $12, W, W_TMP2
508 vpslld $1, W, W_TMP1
510 vpsrld $31, W, W
511 vpor W, W_TMP1, W_TMP1
512 vpslld $2, W_TMP2, W
515 vpxor W, W_TMP1, W_TMP1
516 vpxor W_TMP2, W_TMP1, W
517 vpaddd K_XMM(K_BASE), W, W_TMP1
526 vpxor W_minus_28, W, W # W is W_minus_32 before xor
529 vpxor W_TMP1, W, W
531 vpslld $2, W, W_TMP1
532 vpsrld $30, W, W
533 vpor W, W_TMP1, W
535 vpaddd K_XMM(K_BASE), W, W_TMP1