Lines Matching +full:4 +full:-
1 /* SPDX-License-Identifier: GPL-2.0-or-later */
3 * Cast5 Cipher 16-way parallel algorithm (AVX/x86_64)
6 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
14 .file "cast5-avx-x86_64-asm_64.S"
23 #define kr (16*4)
24 #define rr ((16*4)+16)
26 /* s-boxes */
33 16-way AVX cast5
88 movl (RID2,RID1,4), dst ## d; \
91 op1 (RID1,RID2,4), dst ## d; \
95 op2 (RID2,RID1,4), dst ## d; \
99 op3 (RID1,RID2,4), dst ## d;
150 vbroadcastss (km+(4*n))(CTX), RKM; \
155 subround(l ## 3, r ## 3, l ## 4, r ## 4, f);
159 /* add 16-bit rotation to key rotations (mod 32) */ \
164 /* add 16-bit rotation to key rotations (mod 32) */ \
190 .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
194 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
198 .byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0
200 .section .rodata.cst4.16_mask, "aM", @progbits, 4
201 .align 4
204 .section .rodata.cst4.32_mask, "aM", @progbits, 4
205 .align 4
208 .section .rodata.cst4.first_mask, "aM", @progbits, 4
209 .align 4
219 * RR1: blocks 3 and 4
228 * RR1: encrypted blocks 3 and 4
256 round(RL, RR, 4, 2);
292 * RR1: encrypted blocks 3 and 4
301 * RR1: decrypted blocks 3 and 4
342 round(RR, RL, 4, 2);
360 vpsrldq $4, RKR, RKR;
376 vmovdqu (0*4*4)(%rdx), RL1;
377 vmovdqu (1*4*4)(%rdx), RR1;
378 vmovdqu (2*4*4)(%rdx), RL2;
379 vmovdqu (3*4*4)(%rdx), RR2;
380 vmovdqu (4*4*4)(%rdx), RL3;
381 vmovdqu (5*4*4)(%rdx), RR3;
382 vmovdqu (6*4*4)(%rdx), RL4;
383 vmovdqu (7*4*4)(%rdx), RR4;
387 vmovdqu RR1, (0*4*4)(%r11);
388 vmovdqu RL1, (1*4*4)(%r11);
389 vmovdqu RR2, (2*4*4)(%r11);
390 vmovdqu RL2, (3*4*4)(%r11);
391 vmovdqu RR3, (4*4*4)(%r11);
392 vmovdqu RL3, (5*4*4)(%r11);
393 vmovdqu RR4, (6*4*4)(%r11);
394 vmovdqu RL4, (7*4*4)(%r11);
414 vmovdqu (0*4*4)(%rdx), RL1;
415 vmovdqu (1*4*4)(%rdx), RR1;
416 vmovdqu (2*4*4)(%rdx), RL2;
417 vmovdqu (3*4*4)(%rdx), RR2;
418 vmovdqu (4*4*4)(%rdx), RL3;
419 vmovdqu (5*4*4)(%rdx), RR3;
420 vmovdqu (6*4*4)(%rdx), RL4;
421 vmovdqu (7*4*4)(%rdx), RR4;
425 vmovdqu RR1, (0*4*4)(%r11);
426 vmovdqu RL1, (1*4*4)(%r11);
427 vmovdqu RR2, (2*4*4)(%r11);
428 vmovdqu RL2, (3*4*4)(%r11);
429 vmovdqu RR3, (4*4*4)(%r11);
430 vmovdqu RL3, (5*4*4)(%r11);
431 vmovdqu RR4, (6*4*4)(%r11);
432 vmovdqu RL4, (7*4*4)(%r11);
457 vmovdqu (4*16)(%rdx), RL3;
472 vpxor 4*16+8(%r12), RL3, RL3;
480 vmovdqu RR3, (4*16)(%r11);
507 vpsrldq $8, RTMP, RTMP; /* low: -1, high: 0 */
510 vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */
548 vpxor (4*16)(%r12), RR3, RR3;
556 vmovdqu RR3, (4*16)(%r11);