Lines Matching +full:3 +full:- +full:9
1 /* SPDX-License-Identifier: GPL-2.0-or-later */
5 # Copyright 2023- IBM Corp. All rights reserved
10 # Poly1305 - this version mainly uses vector/VSX/scalar code
11 # - 26-bit limbs
12 # - Handles multiple 64-byte blocks.
17 # p = 2^130 - 5
25 # 07/22/21 - this revision is based on the above sum of products. Set up r^4, r^3, r^2, r and s3, s2, …
26 # to 9 vectors for multiplications.
28 # setup r^4, r^3, r^2, r vectors
29 # vs [r^1, r^3, r^2, r^4]
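
The sum-of-products form above is what exposes the parallelism: instead of the serial Horner chain, the four blocks of a 64-byte chunk are multiplied by r^4, r^3, r^2 and r independently and the partial sums added. A minimal Python model of that equivalence (reference big-integer arithmetic only; the function name is illustrative, not from the source):

  P = (1 << 130) - 5

  def poly1305_4blocks(h, m, r):
      # Horner: ((((h + m[0])*r + m[1])*r + m[2])*r + m[3])*r mod P
      # expands to the sum of products below, so the four multiplies
      # can run in parallel lanes.
      r2 = r * r % P
      r3 = r2 * r % P
      r4 = r3 * r % P
      return ((h + m[0]) * r4 + m[1] * r3 + m[2] * r2 + m[3] * r) % P
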
56 #include <asm/asm-offsets.h>
57 #include <asm/asm-compat.h>
95 stdu 1,-752(1)
116 addi 9, 1, 256
117 SAVE_VRS 20, 0, 9
118 SAVE_VRS 21, 16, 9
119 SAVE_VRS 22, 32, 9
120 SAVE_VRS 23, 48, 9
121 SAVE_VRS 24, 64, 9
122 SAVE_VRS 25, 80, 9
123 SAVE_VRS 26, 96, 9
124 SAVE_VRS 27, 112, 9
125 SAVE_VRS 28, 128, 9
126 SAVE_VRS 29, 144, 9
127 SAVE_VRS 30, 160, 9
128 SAVE_VRS 31, 176, 9
130 SAVE_VSX 14, 192, 9
131 SAVE_VSX 15, 208, 9
132 SAVE_VSX 16, 224, 9
133 SAVE_VSX 17, 240, 9
134 SAVE_VSX 18, 256, 9
135 SAVE_VSX 19, 272, 9
136 SAVE_VSX 20, 288, 9
137 SAVE_VSX 21, 304, 9
138 SAVE_VSX 22, 320, 9
139 SAVE_VSX 23, 336, 9
140 SAVE_VSX 24, 352, 9
141 SAVE_VSX 25, 368, 9
142 SAVE_VSX 26, 384, 9
143 SAVE_VSX 27, 400, 9
144 SAVE_VSX 28, 416, 9
145 SAVE_VSX 29, 432, 9
146 SAVE_VSX 30, 448, 9
147 SAVE_VSX 31, 464, 9
151 addi 9, 1, 256
152 RESTORE_VRS 20, 0, 9
153 RESTORE_VRS 21, 16, 9
154 RESTORE_VRS 22, 32, 9
155 RESTORE_VRS 23, 48, 9
156 RESTORE_VRS 24, 64, 9
157 RESTORE_VRS 25, 80, 9
158 RESTORE_VRS 26, 96, 9
159 RESTORE_VRS 27, 112, 9
160 RESTORE_VRS 28, 128, 9
161 RESTORE_VRS 29, 144, 9
162 RESTORE_VRS 30, 160, 9
163 RESTORE_VRS 31, 176, 9
165 RESTORE_VSX 14, 192, 9
166 RESTORE_VSX 15, 208, 9
167 RESTORE_VSX 16, 224, 9
168 RESTORE_VSX 17, 240, 9
169 RESTORE_VSX 18, 256, 9
170 RESTORE_VSX 19, 272, 9
171 RESTORE_VSX 20, 288, 9
172 RESTORE_VSX 21, 304, 9
173 RESTORE_VSX 22, 320, 9
174 RESTORE_VSX 23, 336, 9
175 RESTORE_VSX 24, 352, 9
176 RESTORE_VSX 25, 368, 9
177 RESTORE_VSX 26, 384, 9
178 RESTORE_VSX 27, 400, 9
179 RESTORE_VSX 28, 416, 9
180 RESTORE_VSX 29, 432, 9
181 RESTORE_VSX 30, 448, 9
182 RESTORE_VSX 31, 464, 9
212 # p[3] = a0*r3 + a1*r2 + a2*r1 + a3*r0 + a4*r4*5;
215 # [r^2, r^3, r^1, r^4]
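
The p[3] line above is one column of the 5x5 schoolbook product in radix 2^26: partial products landing at 2^130 or above fold back multiplied by 5, because 2^130 ≡ 5 (mod p). A Python sketch of all five columns under that convention (s[i] = 5*r[i] stands in for the precomputed s values):

  def mul_limbs(a, r):
      # a, r: five 26-bit limbs; value = sum(x[i] << (26*i))
      s = [5 * x for x in r]   # folded limbs, since 2^130 = 5 (mod p)
      p = [0] * 5
      p[0] = a[0]*r[0] + a[1]*s[4] + a[2]*s[3] + a[3]*s[2] + a[4]*s[1]
      p[1] = a[0]*r[1] + a[1]*r[0] + a[2]*s[4] + a[3]*s[3] + a[4]*s[2]
      p[2] = a[0]*r[2] + a[1]*r[1] + a[2]*r[0] + a[3]*s[4] + a[4]*s[3]
      p[3] = a[0]*r[3] + a[1]*r[2] + a[2]*r[1] + a[3]*r[0] + a[4]*s[4]
      p[4] = a[0]*r[4] + a[1]*r[3] + a[2]*r[2] + a[3]*r[1] + a[4]*r[0]
      return p                 # un-carried; carries are propagated later
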
221 vmulouw 10, 5, 3
229 vmulouw 11, 6, 3
243 vmulouw 12, 7, 3
253 vmulouw 13, 8, 3
268 vmuleuw 9, 4, 26
269 vmuleuw 10, 5, 3
273 vaddudm 14, 14, 9
279 vmuleuw 9, 4, 27
281 vmuleuw 11, 6, 3
284 vaddudm 15, 15, 9
290 vmuleuw 9, 4, 28
293 vmuleuw 12, 7, 3
295 vaddudm 16, 16, 9
301 vmuleuw 9, 4, 29
305 vmuleuw 13, 8, 3
306 vaddudm 17, 17, 9
312 vmuleuw 9, 4, 30
317 vaddudm 18, 18, 9
327 # setup r^4, r^3, r^2, r vectors
328 # [r, r^3, r^2, r^4]
356 # [r, r^3, r^2, r^4]
375 vsld 9, 27, 13
379 vaddudm 0, 9, 27
382 vaddudm 3, 12, 30
384 bl do_mul # r^4 r^3
391 vsld 9, 27, 13
395 vaddudm 0, 9, 27
398 vaddudm 3, 12, 30
404 xxlor 3, 61, 61
411 vspltw 9, 26, 3
413 vmrgow 26, 10, 9
414 vspltw 9, 27, 3
416 vmrgow 27, 10, 9
417 vspltw 9, 28, 3
419 vmrgow 28, 10, 9
420 vspltw 9, 29, 3
422 vmrgow 29, 10, 9
423 vspltw 9, 30, 3
425 vmrgow 30, 10, 9
427 vsld 9, 27, 13
431 vaddudm 0, 9, 27
434 vaddudm 3, 12, 30
442 vspltisb 9, 2
455 vsld 10, 12, 9
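
My reading of the vspltisb 9, 2 / vsld pattern here: the carry out of the top limb is shifted left by 2 (carry*4) and then added back once more, yielding carry*5, which is the 2^130 ≡ 5 fold applied during carry propagation. A hedged Python sketch of that step:

  MASK26 = (1 << 26) - 1

  def carry_reduce(p):
      # propagate carries across the five 26-bit limbs; the carry out
      # of limb 4 re-enters limb 0 times 5, computed as (c << 2) + c
      for i in range(4):
          p[i + 1] += p[i] >> 26
          p[i] &= MASK26
      c = p[4] >> 26
      p[4] &= MASK26
      p[0] += (c << 2) + c     # c * 5
      p[1] += p[0] >> 26       # one extra lazy carry keeps limbs bounded
      p[0] &= MASK26
      return p
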
486 lvx 25, 0, 10 # v25 - mask
494 ld 9, 24(3)
495 ld 10, 32(3)
496 and. 9, 9, 11
500 extrdi 14, 9, 26, 38
501 extrdi 15, 9, 26, 12
502 extrdi 16, 9, 12, 0
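
The three extrdi above slice the first 64-bit word into radix-2^26 pieces: bits 0-25 (limb 0), bits 26-51 (limb 1), and the remaining top 12 bits, which form the low part of limb 2 once the second word is merged in. The full split, modeled in Python (illustrative helper, not from the source):

  MASK26 = (1 << 26) - 1

  def to_limbs(lo, hi):
      # split a 128-bit value held in two 64-bit words into five 26-bit
      # limbs; 5 * 26 = 130 bits leaves headroom in the top limb for the
      # per-block high bit and lazy carries
      v = lo | (hi << 64)
      return [(v >> (26 * i)) & MASK26 for i in range(5)]
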
513 li 9, 5
514 mtvsrdd 36, 0, 9
518 vmulouw 3, 30, 4 # v3 = rr3
543 ld 9, 0(3)
544 ld 10, 8(3)
545 ld 19, 16(3)
548 extrdi 14, 9, 26, 38
549 extrdi 15, 9, 26, 12
550 extrdi 16, 9, 12, 0
559 vor 8, 8, 9
570 vand 9, 14, 25 # a0
584 vaddudm 20, 4, 9
597 vand 9, 14, 25 # a0
612 vmrgow 4, 9, 20
619 addi 5, 5, -64 # len -= 64
622 li 9, 64
623 divdu 31, 5, 9
633 # h3 = (h1 + m3) * r^2, h4 = (h2 + m4) * r^2 --> (h0 + m1) r^4 + (h3 + m3) r^2, (h0 + m2) r^4 + (h…
635 # h5 = (h3 + m5) * r^2, h6 = (h4 + m6) * r^2 -->
636 # h7 = (h5 + m7) * r^2, h8 = (h6 + m8) * r^1 --> m5 * r^4 + m6 * r^3 + m7 * r^2 + m8 * r
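
One way to read these three comment lines: in steady state the accumulator lanes each advance by a power-of-r stride per 64-byte iteration, and the last iteration applies the weights r^4, r^3, r^2, r to stitch the lanes back into a single Horner sum. A Python model of that schedule with four lanes of stride r^4 (the lane structure is my simplification; it assumes the block count is a multiple of four):

  P = (1 << 130) - 5

  def poly1305_lanes(h, m, r):
      r2 = r * r % P
      r3 = r2 * r % P
      r4 = r3 * r % P
      lanes = [h, 0, 0, 0]             # running state rides in lane 0
      for i in range(0, len(m) - 4, 4):
          lanes = [(lanes[j] + m[i + j]) * r4 % P for j in range(4)]
      w = [r4, r3, r2, r]              # final per-lane weights
      return sum((lanes[j] + m[len(m) - 4 + j]) * w[j]
                 for j in range(4)) % P
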
644 vspltisb 9, 2
657 vsld 10, 12, 9
689 vand 9, 17, 25 # a0
722 vmrgow 4, 9, 4
729 addi 5, 5, -64 # len -= 64
738 xxlor 61, 3, 3
752 vaddudm 4, 14, 9
753 xxpermdi 36, 31, 36, 3
755 xxpermdi 37, 31, 37, 3
758 xxpermdi 38, 31, 38, 3
761 xxpermdi 39, 31, 39, 3
764 xxpermdi 40, 31, 40, 3
767 vspltisb 9, 2
780 vsld 10, 12, 9
821 std 17, 0(3)
822 std 19, 8(3)
823 stw 16, 16(3)
826 li 3, 0
833 li 3, 0
851 ld 9, 24(3)
852 ld 10, 32(3)
853 and. 9, 9, 11 # clamp mask r0
857 add 19, 21, 10 # s1: r19 = (r1 >> 2) * 5
861 mtvsrdd 32+0, 9, 19 # r0, s1
862 mtvsrdd 32+1, 10, 9 # r1, r0
864 mtvsrdd 32+3, 9, 25 # r0
878 vmsumudm 7, 6, 0, 9 # h0 * r0, h1 * s1
881 vmsumudm 11, 6, 1, 9 # h0 * r1, h1 * r0
885 vmsumudm 11, 8, 3, 9 # d2 = h2 * r0
907 add 23, 23, 22 # (h2 & 3) * 5
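
The vmsumudm comments above spell out the 64-bit-limb schoolbook multiply of the scalar path: with h = h0 + h1*2^64 + h2*2^128 and r = r0 + r1*2^64, clamping makes r1 a multiple of 4, so every term landing at 2^128 or above folds through s1 = (r1 >> 2) * 5. A Python model of one step, including the h2 fold (the register-to-variable mapping is my inference):

  def scalar_step(h0, h1, h2, r0, r1):
      M64 = (1 << 64) - 1
      s1 = r1 + (r1 >> 2)               # (r1 >> 2) * 5, as r1 % 4 == 0
      d0 = h0 * r0 + h1 * s1            # h1*r1 at 2^128 folds to 2^0
      d1 = h0 * r1 + h1 * r0 + h2 * s1  # h2*r1 at 2^192 folds to 2^64
      d2 = h2 * r0
      d1 += d0 >> 64                    # propagate 64-bit carries
      d2 += d1 >> 64
      h0, h1, h2 = d0 & M64, d1 & M64, d2
      h0 += (h2 >> 2) * 5               # fold bits 130 and up: 2^130 = 5
      h2 &= 3
      h1 += h0 >> 64
      h0 &= M64
      h2 += h1 >> 64
      h1 &= M64
      return h0, h1, h2
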
923 # - no highbit if final leftover block (highbit = 0)
931 stdu 1,-400(1)
961 ld 27, 0(3)
962 ld 28, 8(3)
963 lwz 29, 16(3)
973 vxor 9, 9, 9
993 std 27, 0(3)
994 std 28, 8(3)
995 stw 29, 16(3)
997 li 3, 0
1025 li 3, 0
1034 ld 10, 0(3)
1035 ld 11, 8(3)
1036 ld 12, 16(3)
1039 # h + 5 + (-p)
1046 srdi 9, 8, 2 # overflow?
1047 cmpdi 9, 0
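
The tail here is the standard freeze: h is only partially reduced, so the code adds 5 and tests for a carry out of bit 130 (the srdi ... 2 on the top limb); a carry means h >= p, and the folded value is kept. A Python sketch, with s the second key half (assumes h has already been carried down below 2*p):

  def poly1305_finish(h, s):
      g = h + 5                          # h + 5 >= 2^130 iff h >= p
      if g >> 130:
          h = g & ((1 << 130) - 1)       # equivalent to h - p
      return (h + s) & ((1 << 128) - 1)  # tag = low 128 bits of h + s
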