Lines Matching +full:4 +full:- +full:16
4 * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
11 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
29 * chacha_permute - permute one block
31 * Permute one 64-byte block where the state matrix is stored in the four NEON
32 * registers v0-v3. It performs matrix operations on four words in parallel,
42 ld1 {v12.4s}, [x10]
45 // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
46 add v0.4s, v0.4s, v1.4s
47 eor v3.16b, v3.16b, v0.16b
51 add v2.4s, v2.4s, v3.4s
52 eor v4.16b, v1.16b, v2.16b
53 shl v1.4s, v4.4s, #12
54 sri v1.4s, v4.4s, #20
57 add v0.4s, v0.4s, v1.4s
58 eor v3.16b, v3.16b, v0.16b
59 tbl v3.16b, {v3.16b}, v12.16b
62 add v2.4s, v2.4s, v3.4s
63 eor v4.16b, v1.16b, v2.16b
64 shl v1.4s, v4.4s, #7
65 sri v1.4s, v4.4s, #25
68 ext v1.16b, v1.16b, v1.16b, #4
70 ext v2.16b, v2.16b, v2.16b, #8
72 ext v3.16b, v3.16b, v3.16b, #12
74 // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
75 add v0.4s, v0.4s, v1.4s
76 eor v3.16b, v3.16b, v0.16b
80 add v2.4s, v2.4s, v3.4s
81 eor v4.16b, v1.16b, v2.16b
82 shl v1.4s, v4.4s, #12
83 sri v1.4s, v4.4s, #20
86 add v0.4s, v0.4s, v1.4s
87 eor v3.16b, v3.16b, v0.16b
88 tbl v3.16b, {v3.16b}, v12.16b
91 add v2.4s, v2.4s, v3.4s
92 eor v4.16b, v1.16b, v2.16b
93 shl v1.4s, v4.4s, #7
94 sri v1.4s, v4.4s, #25
97 ext v1.16b, v1.16b, v1.16b, #12
99 ext v2.16b, v2.16b, v2.16b, #8
101 ext v3.16b, v3.16b, v3.16b, #4
115 stp x29, x30, [sp, #-16]!
119 ld1 {v0.4s-v3.4s}, [x0]
120 ld1 {v8.4s-v11.4s}, [x0]
124 ld1 {v4.16b-v7.16b}, [x2]
127 add v0.4s, v0.4s, v8.4s
128 eor v0.16b, v0.16b, v4.16b
131 add v1.4s, v1.4s, v9.4s
132 eor v1.16b, v1.16b, v5.16b
135 add v2.4s, v2.4s, v10.4s
136 eor v2.16b, v2.16b, v6.16b
139 add v3.4s, v3.4s, v11.4s
140 eor v3.16b, v3.16b, v7.16b
142 st1 {v0.16b-v3.16b}, [x1]
144 ldp x29, x30, [sp], #16
150 // x1: output (8 32-bit words)
153 stp x29, x30, [sp, #-16]!
156 ld1 {v0.4s-v3.4s}, [x0]
161 st1 {v0.4s}, [x1], #16
162 st1 {v3.4s}, [x1]
164 ldp x29, x30, [sp], #16
190 // x1: 4 data blocks output, o
191 // x2: 4 data blocks input, i
204 // matrix by interleaving 32- and then 64-bit words, which allows us to
211 ld1 {v30.4s-v31.4s}, [x9]
213 // x0..15[0-3] = s0..3[0..3]
214 add x8, x0, #16
215 ld4r { v0.4s- v3.4s}, [x0]
216 ld4r { v4.4s- v7.4s}, [x8], #16
217 ld4r { v8.4s-v11.4s}, [x8], #16
218 ld4r {v12.4s-v15.4s}, [x8]
237 // x12 += counter values 1-4
238 add v12.4s, v12.4s, v30.4s
241 // x0 += x4, x12 = rotl32(x12 ^ x0, 16)
242 // x1 += x5, x13 = rotl32(x13 ^ x1, 16)
243 // x2 += x6, x14 = rotl32(x14 ^ x2, 16)
244 // x3 += x7, x15 = rotl32(x15 ^ x3, 16)
245 add v0.4s, v0.4s, v4.4s
247 add v1.4s, v1.4s, v5.4s
249 add v2.4s, v2.4s, v6.4s
251 add v3.4s, v3.4s, v7.4s
254 eor v12.16b, v12.16b, v0.16b
256 eor v13.16b, v13.16b, v1.16b
258 eor v14.16b, v14.16b, v2.16b
260 eor v15.16b, v15.16b, v3.16b
264 ror a12, a12, #16
266 ror a13, a13, #16
268 ror a14, a14, #16
270 ror a15, a15, #16
276 add v8.4s, v8.4s, v12.4s
278 add v9.4s, v9.4s, v13.4s
280 add v10.4s, v10.4s, v14.4s
282 add v11.4s, v11.4s, v15.4s
285 eor v16.16b, v4.16b, v8.16b
287 eor v17.16b, v5.16b, v9.16b
289 eor v18.16b, v6.16b, v10.16b
291 eor v19.16b, v7.16b, v11.16b
294 shl v4.4s, v16.4s, #12
295 shl v5.4s, v17.4s, #12
296 shl v6.4s, v18.4s, #12
297 shl v7.4s, v19.4s, #12
299 sri v4.4s, v16.4s, #20
301 sri v5.4s, v17.4s, #20
303 sri v6.4s, v18.4s, #20
305 sri v7.4s, v19.4s, #20
312 add v0.4s, v0.4s, v4.4s
314 add v1.4s, v1.4s, v5.4s
316 add v2.4s, v2.4s, v6.4s
318 add v3.4s, v3.4s, v7.4s
321 eor v12.16b, v12.16b, v0.16b
323 eor v13.16b, v13.16b, v1.16b
325 eor v14.16b, v14.16b, v2.16b
327 eor v15.16b, v15.16b, v3.16b
330 tbl v12.16b, {v12.16b}, v31.16b
332 tbl v13.16b, {v13.16b}, v31.16b
334 tbl v14.16b, {v14.16b}, v31.16b
336 tbl v15.16b, {v15.16b}, v31.16b
343 add v8.4s, v8.4s, v12.4s
345 add v9.4s, v9.4s, v13.4s
347 add v10.4s, v10.4s, v14.4s
349 add v11.4s, v11.4s, v15.4s
352 eor v16.16b, v4.16b, v8.16b
354 eor v17.16b, v5.16b, v9.16b
356 eor v18.16b, v6.16b, v10.16b
358 eor v19.16b, v7.16b, v11.16b
361 shl v4.4s, v16.4s, #7
362 shl v5.4s, v17.4s, #7
363 shl v6.4s, v18.4s, #7
364 shl v7.4s, v19.4s, #7
366 sri v4.4s, v16.4s, #25
368 sri v5.4s, v17.4s, #25
370 sri v6.4s, v18.4s, #25
372 sri v7.4s, v19.4s, #25
375 // x0 += x5, x15 = rotl32(x15 ^ x0, 16)
376 // x1 += x6, x12 = rotl32(x12 ^ x1, 16)
377 // x2 += x7, x13 = rotl32(x13 ^ x2, 16)
378 // x3 += x4, x14 = rotl32(x14 ^ x3, 16)
379 add v0.4s, v0.4s, v5.4s
381 add v1.4s, v1.4s, v6.4s
383 add v2.4s, v2.4s, v7.4s
385 add v3.4s, v3.4s, v4.4s
388 eor v15.16b, v15.16b, v0.16b
390 eor v12.16b, v12.16b, v1.16b
392 eor v13.16b, v13.16b, v2.16b
394 eor v14.16b, v14.16b, v3.16b
398 ror a15, a15, #16
400 ror a12, a12, #16
402 ror a13, a13, #16
404 ror a14, a14, #16
410 add v10.4s, v10.4s, v15.4s
412 add v11.4s, v11.4s, v12.4s
414 add v8.4s, v8.4s, v13.4s
416 add v9.4s, v9.4s, v14.4s
419 eor v16.16b, v5.16b, v10.16b
421 eor v17.16b, v6.16b, v11.16b
423 eor v18.16b, v7.16b, v8.16b
425 eor v19.16b, v4.16b, v9.16b
428 shl v5.4s, v16.4s, #12
429 shl v6.4s, v17.4s, #12
430 shl v7.4s, v18.4s, #12
431 shl v4.4s, v19.4s, #12
433 sri v5.4s, v16.4s, #20
435 sri v6.4s, v17.4s, #20
437 sri v7.4s, v18.4s, #20
439 sri v4.4s, v19.4s, #20
446 add v0.4s, v0.4s, v5.4s
448 add v1.4s, v1.4s, v6.4s
450 add v2.4s, v2.4s, v7.4s
452 add v3.4s, v3.4s, v4.4s
455 eor v15.16b, v15.16b, v0.16b
457 eor v12.16b, v12.16b, v1.16b
459 eor v13.16b, v13.16b, v2.16b
461 eor v14.16b, v14.16b, v3.16b
464 tbl v15.16b, {v15.16b}, v31.16b
466 tbl v12.16b, {v12.16b}, v31.16b
468 tbl v13.16b, {v13.16b}, v31.16b
470 tbl v14.16b, {v14.16b}, v31.16b
477 add v10.4s, v10.4s, v15.4s
479 add v11.4s, v11.4s, v12.4s
481 add v8.4s, v8.4s, v13.4s
483 add v9.4s, v9.4s, v14.4s
486 eor v16.16b, v5.16b, v10.16b
488 eor v17.16b, v6.16b, v11.16b
490 eor v18.16b, v7.16b, v8.16b
492 eor v19.16b, v4.16b, v9.16b
495 shl v5.4s, v16.4s, #7
496 shl v6.4s, v17.4s, #7
497 shl v7.4s, v18.4s, #7
498 shl v4.4s, v19.4s, #7
500 sri v5.4s, v16.4s, #25
502 sri v6.4s, v17.4s, #25
504 sri v7.4s, v18.4s, #25
506 sri v4.4s, v19.4s, #25
512 ld4r {v16.4s-v19.4s}, [x0], #16
513 ld4r {v20.4s-v23.4s}, [x0], #16
515 // x12 += counter values 1-4
516 add v12.4s, v12.4s, v30.4s
518 // x0[0-3] += s0[0]
519 // x1[0-3] += s0[1]
520 // x2[0-3] += s0[2]
521 // x3[0-3] += s0[3]
522 add v0.4s, v0.4s, v16.4s
525 add v1.4s, v1.4s, v17.4s
528 add v2.4s, v2.4s, v18.4s
531 add v3.4s, v3.4s, v19.4s
539 ld4r {v24.4s-v27.4s}, [x0], #16
540 ld4r {v28.4s-v31.4s}, [x0]
542 // x4[0-3] += s1[0]
543 // x5[0-3] += s1[1]
544 // x6[0-3] += s1[2]
545 // x7[0-3] += s1[3]
546 add v4.4s, v4.4s, v20.4s
549 add v5.4s, v5.4s, v21.4s
552 add v6.4s, v6.4s, v22.4s
555 add v7.4s, v7.4s, v23.4s
563 // x8[0-3] += s2[0]
564 // x9[0-3] += s2[1]
565 // x10[0-3] += s2[2]
566 // x11[0-3] += s2[3]
567 add v8.4s, v8.4s, v24.4s
570 add v9.4s, v9.4s, v25.4s
573 add v10.4s, v10.4s, v26.4s
576 add v11.4s, v11.4s, v27.4s
584 // x12[0-3] += s3[0]
585 // x13[0-3] += s3[1]
586 // x14[0-3] += s3[2]
587 // x15[0-3] += s3[3]
588 add v12.4s, v12.4s, v28.4s
591 add v13.4s, v13.4s, v29.4s
594 add v14.4s, v14.4s, v30.4s
597 add v15.4s, v15.4s, v31.4s
605 // interleave 32-bit words in state n, n+1
607 zip1 v16.4s, v0.4s, v1.4s
608 ldp w8, w9, [x2, #-56]
610 zip2 v17.4s, v0.4s, v1.4s
612 zip1 v18.4s, v2.4s, v3.4s
614 zip2 v19.4s, v2.4s, v3.4s
616 ldp w6, w7, [x2, #-48]
617 zip1 v20.4s, v4.4s, v5.4s
618 ldp w8, w9, [x2, #-40]
620 zip2 v21.4s, v4.4s, v5.4s
622 zip1 v22.4s, v6.4s, v7.4s
624 zip2 v23.4s, v6.4s, v7.4s
626 ldp w6, w7, [x2, #-32]
627 zip1 v24.4s, v8.4s, v9.4s
628 ldp w8, w9, [x2, #-24]
630 zip2 v25.4s, v8.4s, v9.4s
632 zip1 v26.4s, v10.4s, v11.4s
634 zip2 v27.4s, v10.4s, v11.4s
636 ldp w6, w7, [x2, #-16]
637 zip1 v28.4s, v12.4s, v13.4s
638 ldp w8, w9, [x2, #-8]
640 zip2 v29.4s, v12.4s, v13.4s
642 zip1 v30.4s, v14.4s, v15.4s
644 zip2 v31.4s, v14.4s, v15.4s
653 // interleave 64-bit words in state n, n+2
659 stp a2, a3, [x1, #-56]
662 ld1 {v16.16b-v19.16b}, [x2], #64
667 stp a4, a5, [x1, #-48]
670 stp a6, a7, [x1, #-40]
673 ld1 {v20.16b-v23.16b}, [x2], #64
678 stp a8, a9, [x1, #-32]
681 stp a10, a11, [x1, #-24]
684 ld1 {v24.16b-v27.16b}, [x2], #64
689 stp a12, a13, [x1, #-16]
692 stp a14, a15, [x1, #-8]
695 ld1 {v28.16b-v31.16b}, [x2]
698 eor v16.16b, v16.16b, v0.16b
699 eor v17.16b, v17.16b, v1.16b
700 eor v18.16b, v18.16b, v2.16b
701 eor v19.16b, v19.16b, v3.16b
705 eor v20.16b, v20.16b, v4.16b
706 eor v21.16b, v21.16b, v5.16b
707 eor v22.16b, v22.16b, v6.16b
708 eor v23.16b, v23.16b, v7.16b
710 st1 {v16.16b-v19.16b}, [x1], #64
713 eor v24.16b, v24.16b, v8.16b
714 eor v25.16b, v25.16b, v9.16b
715 eor v26.16b, v26.16b, v10.16b
716 eor v27.16b, v27.16b, v11.16b
718 st1 {v20.16b-v23.16b}, [x1], #64
721 eor v28.16b, v28.16b, v12.16b
722 eor v29.16b, v29.16b, v13.16b
723 eor v30.16b, v30.16b, v14.16b
724 eor v31.16b, v31.16b, v15.16b
726 st1 {v24.16b-v27.16b}, [x1], #64
727 st1 {v28.16b-v31.16b}, [x1]
734 ld1 {v28.16b-v31.16b}, [x10]
736 tbl v28.16b, {v4.16b-v7.16b}, v28.16b
737 tbl v29.16b, {v4.16b-v7.16b}, v29.16b
738 tbl v30.16b, {v4.16b-v7.16b}, v30.16b
739 tbl v31.16b, {v4.16b-v7.16b}, v31.16b
741 0: eor v20.16b, v20.16b, v28.16b
742 eor v21.16b, v21.16b, v29.16b
743 eor v22.16b, v22.16b, v30.16b
744 eor v23.16b, v23.16b, v31.16b
745 st1 {v20.16b-v23.16b}, [x5] // overlapping stores
746 1: st1 {v16.16b-v19.16b}, [x1]
750 .Lt128: ld1 {v28.16b-v31.16b}, [x10]
753 tbl v28.16b, {v0.16b-v3.16b}, v28.16b
754 tbl v29.16b, {v0.16b-v3.16b}, v29.16b
755 tbl v30.16b, {v0.16b-v3.16b}, v30.16b
756 tbl v31.16b, {v0.16b-v3.16b}, v31.16b
757 ld1 {v16.16b-v19.16b}, [x1] // reload first output block
762 ld1 {v4.16b-v7.16b}, [x10]
764 tbl v0.16b, {v8.16b-v11.16b}, v4.16b
765 tbl v1.16b, {v8.16b-v11.16b}, v5.16b
766 tbl v2.16b, {v8.16b-v11.16b}, v6.16b
767 tbl v3.16b, {v8.16b-v11.16b}, v7.16b
769 eor v28.16b, v28.16b, v0.16b
770 eor v29.16b, v29.16b, v1.16b
771 eor v30.16b, v30.16b, v2.16b
772 eor v31.16b, v31.16b, v3.16b
773 st1 {v28.16b-v31.16b}, [x6] // overlapping stores
774 2: st1 {v20.16b-v23.16b}, [x1]
779 ld1 {v4.16b-v7.16b}, [x10]
781 tbl v0.16b, {v12.16b-v15.16b}, v4.16b
782 tbl v1.16b, {v12.16b-v15.16b}, v5.16b
783 tbl v2.16b, {v12.16b-v15.16b}, v6.16b
784 tbl v3.16b, {v12.16b-v15.16b}, v7.16b
786 eor v28.16b, v28.16b, v0.16b
787 eor v29.16b, v29.16b, v1.16b
788 eor v30.16b, v30.16b, v2.16b
789 eor v31.16b, v31.16b, v3.16b
790 st1 {v28.16b-v31.16b}, [x7] // overlapping stores
791 3: st1 {v24.16b-v27.16b}, [x1]
800 .byte (.Li - 64)
804 CTRINC: .word 1, 2, 3, 4