Lines Matching +full:4 +full:-

4  * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
11 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
29 * chacha_permute - permute one block
31 * Permute one 64-byte block where the state matrix is stored in the four NEON
32 * registers v0-v3. It performs matrix operations on four words in parallel,
42 ld1 {v12.4s}, [x10]
46 add v0.4s, v0.4s, v1.4s
51 add v2.4s, v2.4s, v3.4s
53 shl v1.4s, v4.4s, #12
54 sri v1.4s, v4.4s, #20
57 add v0.4s, v0.4s, v1.4s
62 add v2.4s, v2.4s, v3.4s
64 shl v1.4s, v4.4s, #7
65 sri v1.4s, v4.4s, #25
68 ext v1.16b, v1.16b, v1.16b, #4
75 add v0.4s, v0.4s, v1.4s
80 add v2.4s, v2.4s, v3.4s
82 shl v1.4s, v4.4s, #12
83 sri v1.4s, v4.4s, #20
86 add v0.4s, v0.4s, v1.4s
91 add v2.4s, v2.4s, v3.4s
93 shl v1.4s, v4.4s, #7
94 sri v1.4s, v4.4s, #25
101 ext v3.16b, v3.16b, v3.16b, #4
115 stp x29, x30, [sp, #-16]!
119 ld1 {v0.4s-v3.4s}, [x0]
120 ld1 {v8.4s-v11.4s}, [x0]
124 ld1 {v4.16b-v7.16b}, [x2]
127 add v0.4s, v0.4s, v8.4s
131 add v1.4s, v1.4s, v9.4s
135 add v2.4s, v2.4s, v10.4s
139 add v3.4s, v3.4s, v11.4s
142 st1 {v0.16b-v3.16b}, [x1]
150 // x1: output (8 32-bit words)
153 stp x29, x30, [sp, #-16]!
156 ld1 {v0.4s-v3.4s}, [x0]
161 st1 {v0.4s}, [x1], #16
162 st1 {v3.4s}, [x1]
190 // x1: 4 data blocks output, o
191 // x2: 4 data blocks input, i
204 // matrix by interleaving 32- and then 64-bit words, which allows us to
211 ld1 {v30.4s-v31.4s}, [x9]
213 // x0..15[0-3] = s0..3[0..3]
215 ld4r { v0.4s- v3.4s}, [x0]
216 ld4r { v4.4s- v7.4s}, [x8], #16
217 ld4r { v8.4s-v11.4s}, [x8], #16
218 ld4r {v12.4s-v15.4s}, [x8]
237 // x12 += counter values 1-4
238 add v12.4s, v12.4s, v30.4s
245 add v0.4s, v0.4s, v4.4s
247 add v1.4s, v1.4s, v5.4s
249 add v2.4s, v2.4s, v6.4s
251 add v3.4s, v3.4s, v7.4s
276 add v8.4s, v8.4s, v12.4s
278 add v9.4s, v9.4s, v13.4s
280 add v10.4s, v10.4s, v14.4s
282 add v11.4s, v11.4s, v15.4s
294 shl v4.4s, v16.4s, #12
295 shl v5.4s, v17.4s, #12
296 shl v6.4s, v18.4s, #12
297 shl v7.4s, v19.4s, #12
299 sri v4.4s, v16.4s, #20
301 sri v5.4s, v17.4s, #20
303 sri v6.4s, v18.4s, #20
305 sri v7.4s, v19.4s, #20
312 add v0.4s, v0.4s, v4.4s
314 add v1.4s, v1.4s, v5.4s
316 add v2.4s, v2.4s, v6.4s
318 add v3.4s, v3.4s, v7.4s
343 add v8.4s, v8.4s, v12.4s
345 add v9.4s, v9.4s, v13.4s
347 add v10.4s, v10.4s, v14.4s
349 add v11.4s, v11.4s, v15.4s
361 shl v4.4s, v16.4s, #7
362 shl v5.4s, v17.4s, #7
363 shl v6.4s, v18.4s, #7
364 shl v7.4s, v19.4s, #7
366 sri v4.4s, v16.4s, #25
368 sri v5.4s, v17.4s, #25
370 sri v6.4s, v18.4s, #25
372 sri v7.4s, v19.4s, #25
379 add v0.4s, v0.4s, v5.4s
381 add v1.4s, v1.4s, v6.4s
383 add v2.4s, v2.4s, v7.4s
385 add v3.4s, v3.4s, v4.4s
410 add v10.4s, v10.4s, v15.4s
412 add v11.4s, v11.4s, v12.4s
414 add v8.4s, v8.4s, v13.4s
416 add v9.4s, v9.4s, v14.4s
428 shl v5.4s, v16.4s, #12
429 shl v6.4s, v17.4s, #12
430 shl v7.4s, v18.4s, #12
431 shl v4.4s, v19.4s, #12
433 sri v5.4s, v16.4s, #20
435 sri v6.4s, v17.4s, #20
437 sri v7.4s, v18.4s, #20
439 sri v4.4s, v19.4s, #20
446 add v0.4s, v0.4s, v5.4s
448 add v1.4s, v1.4s, v6.4s
450 add v2.4s, v2.4s, v7.4s
452 add v3.4s, v3.4s, v4.4s
477 add v10.4s, v10.4s, v15.4s
479 add v11.4s, v11.4s, v12.4s
481 add v8.4s, v8.4s, v13.4s
483 add v9.4s, v9.4s, v14.4s
495 shl v5.4s, v16.4s, #7
496 shl v6.4s, v17.4s, #7
497 shl v7.4s, v18.4s, #7
498 shl v4.4s, v19.4s, #7
500 sri v5.4s, v16.4s, #25
502 sri v6.4s, v17.4s, #25
504 sri v7.4s, v18.4s, #25
506 sri v4.4s, v19.4s, #25
512 ld4r {v16.4s-v19.4s}, [x0], #16
513 ld4r {v20.4s-v23.4s}, [x0], #16
515 // x12 += counter values 1-4
516 add v12.4s, v12.4s, v30.4s
518 // x0[0-3] += s0[0]
519 // x1[0-3] += s0[1]
520 // x2[0-3] += s0[2]
521 // x3[0-3] += s0[3]
522 add v0.4s, v0.4s, v16.4s
525 add v1.4s, v1.4s, v17.4s
528 add v2.4s, v2.4s, v18.4s
531 add v3.4s, v3.4s, v19.4s
539 ld4r {v24.4s-v27.4s}, [x0], #16
540 ld4r {v28.4s-v31.4s}, [x0]
542 // x4[0-3] += s1[0]
543 // x5[0-3] += s1[1]
544 // x6[0-3] += s1[2]
545 // x7[0-3] += s1[3]
546 add v4.4s, v4.4s, v20.4s
549 add v5.4s, v5.4s, v21.4s
552 add v6.4s, v6.4s, v22.4s
555 add v7.4s, v7.4s, v23.4s
563 // x8[0-3] += s2[0]
564 // x9[0-3] += s2[1]
565 // x10[0-3] += s2[2]
566 // x11[0-3] += s2[3]
567 add v8.4s, v8.4s, v24.4s
570 add v9.4s, v9.4s, v25.4s
573 add v10.4s, v10.4s, v26.4s
576 add v11.4s, v11.4s, v27.4s
584 // x12[0-3] += s3[0]
585 // x13[0-3] += s3[1]
586 // x14[0-3] += s3[2]
587 // x15[0-3] += s3[3]
588 add v12.4s, v12.4s, v28.4s
591 add v13.4s, v13.4s, v29.4s
594 add v14.4s, v14.4s, v30.4s
597 add v15.4s, v15.4s, v31.4s
605 // interleave 32-bit words in state n, n+1
607 zip1 v16.4s, v0.4s, v1.4s
608 ldp w8, w9, [x2, #-56]
610 zip2 v17.4s, v0.4s, v1.4s
612 zip1 v18.4s, v2.4s, v3.4s
614 zip2 v19.4s, v2.4s, v3.4s
616 ldp w6, w7, [x2, #-48]
617 zip1 v20.4s, v4.4s, v5.4s
618 ldp w8, w9, [x2, #-40]
620 zip2 v21.4s, v4.4s, v5.4s
622 zip1 v22.4s, v6.4s, v7.4s
624 zip2 v23.4s, v6.4s, v7.4s
626 ldp w6, w7, [x2, #-32]
627 zip1 v24.4s, v8.4s, v9.4s
628 ldp w8, w9, [x2, #-24]
630 zip2 v25.4s, v8.4s, v9.4s
632 zip1 v26.4s, v10.4s, v11.4s
634 zip2 v27.4s, v10.4s, v11.4s
636 ldp w6, w7, [x2, #-16]
637 zip1 v28.4s, v12.4s, v13.4s
638 ldp w8, w9, [x2, #-8]
640 zip2 v29.4s, v12.4s, v13.4s
642 zip1 v30.4s, v14.4s, v15.4s
644 zip2 v31.4s, v14.4s, v15.4s
653 // interleave 64-bit words in state n, n+2
659 stp a2, a3, [x1, #-56]
662 ld1 {v16.16b-v19.16b}, [x2], #64
667 stp a4, a5, [x1, #-48]
670 stp a6, a7, [x1, #-40]
673 ld1 {v20.16b-v23.16b}, [x2], #64
678 stp a8, a9, [x1, #-32]
681 stp a10, a11, [x1, #-24]
684 ld1 {v24.16b-v27.16b}, [x2], #64
689 stp a12, a13, [x1, #-16]
692 stp a14, a15, [x1, #-8]
695 ld1 {v28.16b-v31.16b}, [x2]
710 st1 {v16.16b-v19.16b}, [x1], #64
718 st1 {v20.16b-v23.16b}, [x1], #64
726 st1 {v24.16b-v27.16b}, [x1], #64
727 st1 {v28.16b-v31.16b}, [x1]
734 ld1 {v28.16b-v31.16b}, [x10]
736 tbl v28.16b, {v4.16b-v7.16b}, v28.16b
737 tbl v29.16b, {v4.16b-v7.16b}, v29.16b
738 tbl v30.16b, {v4.16b-v7.16b}, v30.16b
739 tbl v31.16b, {v4.16b-v7.16b}, v31.16b
745 st1 {v20.16b-v23.16b}, [x5] // overlapping stores
746 1: st1 {v16.16b-v19.16b}, [x1]
750 .Lt128: ld1 {v28.16b-v31.16b}, [x10]
753 tbl v28.16b, {v0.16b-v3.16b}, v28.16b
754 tbl v29.16b, {v0.16b-v3.16b}, v29.16b
755 tbl v30.16b, {v0.16b-v3.16b}, v30.16b
756 tbl v31.16b, {v0.16b-v3.16b}, v31.16b
757 ld1 {v16.16b-v19.16b}, [x1] // reload first output block
762 ld1 {v4.16b-v7.16b}, [x10]
764 tbl v0.16b, {v8.16b-v11.16b}, v4.16b
765 tbl v1.16b, {v8.16b-v11.16b}, v5.16b
766 tbl v2.16b, {v8.16b-v11.16b}, v6.16b
767 tbl v3.16b, {v8.16b-v11.16b}, v7.16b
773 st1 {v28.16b-v31.16b}, [x6] // overlapping stores
774 2: st1 {v20.16b-v23.16b}, [x1]
779 ld1 {v4.16b-v7.16b}, [x10]
781 tbl v0.16b, {v12.16b-v15.16b}, v4.16b
782 tbl v1.16b, {v12.16b-v15.16b}, v5.16b
783 tbl v2.16b, {v12.16b-v15.16b}, v6.16b
784 tbl v3.16b, {v12.16b-v15.16b}, v7.16b
790 st1 {v28.16b-v31.16b}, [x7] // overlapping stores
791 3: st1 {v24.16b-v27.16b}, [x1]
800 .byte (.Li - 64)
804 CTRINC: .word 1, 2, 3, 4