Lines matching "+full:4 +full:- +full:16" in cast5-avx-x86_64-asm_64.S
1 /* SPDX-License-Identifier: GPL-2.0-or-later */
3 * Cast5 Cipher 16-way parallel algorithm (AVX/x86_64)
6 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
14 .file "cast5-avx-x86_64-asm_64.S"
23 #define kr (16*4)
24 #define rr ((16*4)+16)
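The km/kr/rr offsets above line up with the generic CAST5 key context (struct cast5_ctx in include/crypto/cast5.h): 16 32-bit masking keys, then 16 8-bit rotation keys, then the reduced-round flag. A minimal C sketch of that assumed layout, with the offsets checked at compile time (the struct name here is illustrative):

#include <stdint.h>
#include <stddef.h>

/* Sketch of the assumed key context; field names follow struct cast5_ctx. */
struct cast5_ctx_sketch {
	uint32_t Km[16];  /* masking keys,       offset 0         -> #define km 0           */
	uint8_t  Kr[16];  /* rotation keys,      offset 16*4      -> #define kr (16*4)      */
	int      rr;      /* reduced-round flag, offset 16*4+16   -> #define rr ((16*4)+16) */
};

_Static_assert(offsetof(struct cast5_ctx_sketch, Kr) == 16 * 4, "kr offset");
_Static_assert(offsetof(struct cast5_ctx_sketch, rr) == (16 * 4) + 16, "rr offset");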
26 /* s-boxes */
33 16-way AVX cast5
88 movl (RID2,RID1,4), dst ## d; \
91 op1 (RID1,RID2,4), dst ## d; \
92 shrq $16, src; \
95 op2 (RID2,RID1,4), dst ## d; \
99 op3 (RID1,RID2,4), dst ## d;
104 shrq $16, reg;
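lookup_32bit combines four S-box reads into one 32-bit result, with op1/op2/op3 supplying the round-type-specific mix of xor, subtract and add. A scalar C sketch of the computation it serves, following the RFC 2144 f-functions (s1..s4 stand for the cast_s1..cast_s4 tables the asm indexes through RID1/RID2; the exact byte-extraction order inside the macro is an implementation detail not modelled here):

#include <stdint.h>

/* Scalar model of the CAST5 f-functions (RFC 2144). */
extern const uint32_t s1[256], s2[256], s3[256], s4[256];

static inline uint32_t rol32(uint32_t v, unsigned int r)
{
	return (v << (r & 31)) | (v >> ((32 - r) & 31));
}

/* Type 1: I = (Km + D) <<< Kr; f = ((S1[Ia] ^ S2[Ib]) - S3[Ic]) + S4[Id],
 * where Ia is the most significant byte of I. */
uint32_t cast5_f1(uint32_t d, uint32_t km, uint8_t kr)
{
	uint32_t i = rol32(km + d, kr);

	return ((s1[i >> 24] ^ s2[(i >> 16) & 0xff]) - s3[(i >> 8) & 0xff]) + s4[i & 0xff];
}

/* Type 2: I = (Km ^ D) <<< Kr; the ops become (-, +, ^). */
uint32_t cast5_f2(uint32_t d, uint32_t km, uint8_t kr)
{
	uint32_t i = rol32(km ^ d, kr);

	return ((s1[i >> 24] - s2[(i >> 16) & 0xff]) + s3[(i >> 8) & 0xff]) ^ s4[i & 0xff];
}

/* Type 3: I = (Km - D) <<< Kr; the ops become (+, ^, -). */
uint32_t cast5_f3(uint32_t d, uint32_t km, uint8_t kr)
{
	uint32_t i = rol32(km - d, kr);

	return ((s1[i >> 24] + s2[(i >> 16) & 0xff]) ^ s3[(i >> 8) & 0xff]) - s4[i & 0xff];
}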
150 vbroadcastss (km+(4*n))(CTX), RKM; \
155 subround(l ## 3, r ## 3, l ## 4, r ## 4, f);
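round()/subround() apply one of those f-functions across all four register pairs per Feistel round. Under the same RFC 2144 description, the scalar equivalent looks like the sketch below; note that the asm avoids the explicit half swap by alternating round(RL, RR, ...) with round(RR, RL, ...) calls, as in the round(RL, RR, 4, 2) / round(RR, RL, 4, 2) lines further down the listing:

#include <stdint.h>

/* f-functions as sketched above. */
uint32_t cast5_f1(uint32_t d, uint32_t km, uint8_t kr);
uint32_t cast5_f2(uint32_t d, uint32_t km, uint8_t kr);
uint32_t cast5_f3(uint32_t d, uint32_t km, uint8_t kr);

/* One scalar CAST5 encryption pass (RFC 2144): 16 Feistel rounds of
 * (L, R) -> (R, L ^ f_i(R)), round type cycling 1,2,3,1,2,3,...
 * km[]/kr[] are the per-round keys the asm reads via (km+(4*n))(CTX) and
 * the RKR byte stream.  (Short keys use only 12 rounds; that is what the
 * rr flag in the context selects.) */
void cast5_encrypt_sketch(uint32_t *l, uint32_t *r,
			  const uint32_t km[16], const uint8_t kr[16])
{
	static uint32_t (*const f[3])(uint32_t, uint32_t, uint8_t) = {
		cast5_f1, cast5_f2, cast5_f3
	};
	int i;

	for (i = 0; i < 16; i++) {
		uint32_t t = *l ^ f[i % 3](*r, km[i], kr[i]);

		*l = *r;   /* the asm avoids this swap by alternating    */
		*r = t;    /* round(RL, RR, ...) with round(RR, RL, ...) */
	}
}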
159 /* add 16-bit rotation to key rotations (mod 32) */ \
164 /* add 16-bit rotation to key rotations (mod 32) */ \
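The "16-bit rotation" adjustment adds 16 (mod 32) to every per-round rotation amount; because 16 is a single bit and the sum is reduced mod 32, xoring with the broadcast .L16_mask value (16 in every byte) is equivalent to that addition. A stand-alone check of the identity the vpxor relies on:

#include <assert.h>

int main(void)
{
	unsigned int kr;

	/* (kr + 16) mod 32 == kr ^ 16 for every 5-bit rotation amount,
	 * so the asm can fold the adjustment into a single vpxor. */
	for (kr = 0; kr < 32; kr++)
		assert(((kr + 16) & 31) == (kr ^ 16));
	return 0;
}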
187 .section .rodata.cst16.bswap_mask, "aM", @progbits, 16
188 .align 16
190 .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
191 .section .rodata.cst16.bswap128_mask, "aM", @progbits, 16
192 .align 16
194 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
195 .section .rodata.cst16.bswap_iv_mask, "aM", @progbits, 16
196 .align 16
198 .byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0
200 .section .rodata.cst4.16_mask, "aM", @progbits, 4
201 .align 4
203 .byte 16, 16, 16, 16
204 .section .rodata.cst4.32_mask, "aM", @progbits, 4
205 .align 4
208 .section .rodata.cst4.first_mask, "aM", @progbits, 4
209 .align 4
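The .byte tables above are vpshufb control masks (plus the 16/32/first-bit scalar constants): output byte i of the shuffle takes input byte mask[i] whenever the mask byte's top bit is clear. A small C model of that selection rule, applied to the first mask, shows it byte-swaps each 32-bit lane independently; by the same rule bswap128_mask reverses all 16 bytes and bswap_iv_mask broadcasts a byte-reversed low quadword into both halves:

#include <stdint.h>
#include <stdio.h>

/* Minimal model of vpshufb on one 128-bit lane: out[i] = in[mask[i] & 15],
 * or 0 if bit 7 of mask[i] is set. */
static void pshufb128(uint8_t out[16], const uint8_t in[16], const uint8_t mask[16])
{
	int i;

	for (i = 0; i < 16; i++)
		out[i] = (mask[i] & 0x80) ? 0 : in[mask[i] & 0x0f];
}

int main(void)
{
	static const uint8_t bswap_mask[16] = {
		3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
	};
	uint8_t in[16], out[16];
	int i;

	for (i = 0; i < 16; i++)
		in[i] = (uint8_t)i;
	pshufb128(out, in, bswap_mask);

	/* Each 32-bit lane comes back byte-reversed:
	 * 03 02 01 00 07 06 05 04 0b 0a 09 08 0f 0e 0d 0c */
	for (i = 0; i < 16; i++)
		printf("%02x ", out[i]);
	printf("\n");
	return 0;
}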
219 * RR1: blocks 3 and 4
225 * RR4: blocks 15 and 16
228 * RR1: encrypted blocks 3 and 4
234 * RR4: encrypted blocks 15 and 16
256 round(RL, RR, 4, 2);
292 * RR1: encrypted blocks 3 and 4
298 * RR4: encrypted blocks 15 and 16
301 * RR1: decrypted blocks 3 and 4
307 * RR4: decrypted blocks 15 and 16
342 round(RR, RL, 4, 2);
360 vpsrldq $4, RKR, RKR;
376 vmovdqu (0*4*4)(%rdx), RL1;
377 vmovdqu (1*4*4)(%rdx), RR1;
378 vmovdqu (2*4*4)(%rdx), RL2;
379 vmovdqu (3*4*4)(%rdx), RR2;
380 vmovdqu (4*4*4)(%rdx), RL3;
381 vmovdqu (5*4*4)(%rdx), RR3;
382 vmovdqu (6*4*4)(%rdx), RL4;
383 vmovdqu (7*4*4)(%rdx), RR4;
387 vmovdqu RR1, (0*4*4)(%r11);
388 vmovdqu RL1, (1*4*4)(%r11);
389 vmovdqu RR2, (2*4*4)(%r11);
390 vmovdqu RL2, (3*4*4)(%r11);
391 vmovdqu RR3, (4*4*4)(%r11);
392 vmovdqu RL3, (5*4*4)(%r11);
393 vmovdqu RR4, (6*4*4)(%r11);
394 vmovdqu RL4, (7*4*4)(%r11);
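The (N*4*4) displacements are simply N*16 bytes: the 16 eight-byte CAST5 blocks are moved as eight unaligned 16-byte vectors, and the results are written back with each RL/RR register pair swapped, as the store sequence above shows. A plain-memory sketch of that access pattern (struct and function names are illustrative, not kernel API):

#include <stdint.h>
#include <string.h>

struct blk16_regs {
	uint8_t v[8][16];   /* RL1, RR1, RL2, RR2, RL3, RR3, RL4, RR4 */
};

static void load_blk16(struct blk16_regs *r, const uint8_t src[16 * 8])
{
	int i;

	for (i = 0; i < 8; i++)                     /* vmovdqu (i*4*4)(%rdx), reg */
		memcpy(r->v[i], src + i * 16, 16);
}

static void store_blk16(uint8_t dst[16 * 8], const struct blk16_regs *r)
{
	int i;

	/* Each pair is written back in swapped RR/RL order:
	 * v[1], v[0], v[3], v[2], ... at offsets 0, 16, 32, ... */
	for (i = 0; i < 8; i++)
		memcpy(dst + i * 16, r->v[i ^ 1], 16);
}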
414 vmovdqu (0*4*4)(%rdx), RL1;
415 vmovdqu (1*4*4)(%rdx), RR1;
416 vmovdqu (2*4*4)(%rdx), RL2;
417 vmovdqu (3*4*4)(%rdx), RR2;
418 vmovdqu (4*4*4)(%rdx), RL3;
419 vmovdqu (5*4*4)(%rdx), RR3;
420 vmovdqu (6*4*4)(%rdx), RL4;
421 vmovdqu (7*4*4)(%rdx), RR4;
425 vmovdqu RR1, (0*4*4)(%r11);
426 vmovdqu RL1, (1*4*4)(%r11);
427 vmovdqu RR2, (2*4*4)(%r11);
428 vmovdqu RL2, (3*4*4)(%r11);
429 vmovdqu RR3, (4*4*4)(%r11);
430 vmovdqu RL3, (5*4*4)(%r11);
431 vmovdqu RR4, (6*4*4)(%r11);
432 vmovdqu RL4, (7*4*4)(%r11);
453 vmovdqu (0*16)(%rdx), RL1;
454 vmovdqu (1*16)(%rdx), RR1;
455 vmovdqu (2*16)(%rdx), RL2;
456 vmovdqu (3*16)(%rdx), RR2;
457 vmovdqu (4*16)(%rdx), RL3;
458 vmovdqu (5*16)(%rdx), RR3;
459 vmovdqu (6*16)(%rdx), RL4;
460 vmovdqu (7*16)(%rdx), RR4;
468 vpxor 0*16+8(%r12), RL1, RL1;
469 vpxor 1*16+8(%r12), RR2, RR2;
470 vpxor 2*16+8(%r12), RL2, RL2;
471 vpxor 3*16+8(%r12), RR3, RR3;
472 vpxor 4*16+8(%r12), RL3, RL3;
473 vpxor 5*16+8(%r12), RR4, RR4;
474 vpxor 6*16+8(%r12), RL4, RL4;
476 vmovdqu RR1, (0*16)(%r11);
477 vmovdqu RL1, (1*16)(%r11);
478 vmovdqu RR2, (2*16)(%r11);
479 vmovdqu RL2, (3*16)(%r11);
480 vmovdqu RR3, (4*16)(%r11);
481 vmovdqu RL3, (5*16)(%r11);
482 vmovdqu RR4, (6*16)(%r11);
483 vmovdqu RL4, (7*16)(%r11);
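In the CBC-decrypt path the i*16+8 displacements xor each 16-byte vector of decrypted data with the ciphertext stream shifted back by one 8-byte block, i.e. the C_{i-1} term of CBC; the IV xor for the very first block is handled outside the lines shown. A generic sketch of the mode for an 8-byte block cipher (decrypt_block is a stand-in for the 16-way CAST5 decryption):

#include <stdint.h>
#include <string.h>

void cbc_decrypt(void (*decrypt_block)(uint8_t out[8], const uint8_t in[8]),
		 uint8_t *dst, const uint8_t *src, size_t nblocks,
		 const uint8_t iv[8])
{
	uint8_t prev[8], tmp[8];
	size_t i;
	int j;

	memcpy(prev, iv, 8);
	for (i = 0; i < nblocks; i++) {
		uint8_t cur[8];

		memcpy(cur, src + i * 8, 8);        /* keep C_i before dst is written */
		decrypt_block(tmp, cur);
		for (j = 0; j < 8; j++)             /* P_i = D(C_i) ^ C_{i-1} */
			dst[i * 8 + j] = tmp[j] ^ prev[j];
		memcpy(prev, cur, 8);
	}
}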
507 vpsrldq $8, RTMP, RTMP; /* low: -1, high: 0 */
510 vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */
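These two constants are the 64-bit lane pairs {-1, 0} and {-2, -2}; once the IV has been byte-swapped into native integer form, the counters appear to be advanced by subtracting them (vpsubq), since subtracting -1 or -2 is the same as adding 1 or 2 modulo 2^64. A stand-alone illustration of that arithmetic on a pair of lanes:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t iv = 0xfffffffffffffffeULL;   /* arbitrary example counter */
	uint64_t lane[2] = { iv, iv };         /* IV broadcast to both lanes */
	int i;

	lane[0] -= (uint64_t)-1;               /* low lane:  IV + 1 (wraps mod 2^64) */
	lane[1] -= (uint64_t)0;                /* high lane: IV + 0 */
	printf("%llx %llx\n", (unsigned long long)lane[0],
	       (unsigned long long)lane[1]);

	for (i = 0; i < 2; i++)                /* next pair: subtract {-2, -2} */
		lane[i] -= (uint64_t)-2;
	printf("%llx %llx\n", (unsigned long long)lane[0],
	       (unsigned long long)lane[1]);
	return 0;
}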
544 vpxor (0*16)(%r12), RR1, RR1;
545 vpxor (1*16)(%r12), RL1, RL1;
546 vpxor (2*16)(%r12), RR2, RR2;
547 vpxor (3*16)(%r12), RL2, RL2;
548 vpxor (4*16)(%r12), RR3, RR3;
549 vpxor (5*16)(%r12), RL3, RL3;
550 vpxor (6*16)(%r12), RR4, RR4;
551 vpxor (7*16)(%r12), RL4, RL4;
552 vmovdqu RR1, (0*16)(%r11);
553 vmovdqu RL1, (1*16)(%r11);
554 vmovdqu RR2, (2*16)(%r11);
555 vmovdqu RL2, (3*16)(%r11);
556 vmovdqu RR3, (4*16)(%r11);
557 vmovdqu RL3, (5*16)(%r11);
558 vmovdqu RR4, (6*16)(%r11);
559 vmovdqu RL4, (7*16)(%r11);
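For reference, the mode these loads, xors and stores serve is plain CTR: encrypt successive counter values and xor the keystream into the data. A generic 8-byte-block sketch (encrypt_block stands in for the 16-way CAST5 encryption; the big-endian counter handling is an assumption consistent with the byte-swap masks above):

#include <stddef.h>
#include <stdint.h>

void ctr_crypt(void (*encrypt_block)(uint8_t out[8], const uint8_t in[8]),
	       uint8_t *dst, const uint8_t *src, size_t nblocks, uint64_t ctr)
{
	size_t i;
	int j;

	for (i = 0; i < nblocks; i++, ctr++) {
		uint8_t block[8], ks[8];

		for (j = 0; j < 8; j++)            /* assumed big-endian counter */
			block[j] = (uint8_t)(ctr >> (56 - 8 * j));
		encrypt_block(ks, block);
		for (j = 0; j < 8; j++)            /* P_i = C_i ^ E(counter_i) */
			dst[i * 8 + j] = src[i * 8 + j] ^ ks[j];
	}
}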