Lines Matching +full:1 +full:- +full:16
1 /* SPDX-License-Identifier: GPL-2.0-or-later */
3 * SM4-GCM AEAD Algorithm using ARMv8 Crypto Extensions
14 #include "sm4-ce-asm.h"
16 .arch armv8-a+crypto
18 .irp b, 0, 1, 2, 3, 24, 25, 26, 27, 28, 29, 30, 31
37 * output: r0:r1 (low 128-bits in r0, high in r1)
40 ext T0.16b, m1.16b, m1.16b, #8; \
41 pmull r0.1q, m0.1d, m1.1d; \
42 pmull T1.1q, m0.1d, T0.1d; \
43 pmull2 T0.1q, m0.2d, T0.2d; \
44 pmull2 r1.1q, m0.2d, m1.2d; \
45 eor T0.16b, T0.16b, T1.16b; \
46 ext T1.16b, RZERO.16b, T0.16b, #8; \
47 ext T0.16b, T0.16b, RZERO.16b, #8; \
48 eor r0.16b, r0.16b, T1.16b; \
49 eor r1.16b, r1.16b, T0.16b;
55 ext T0.16b, m1.16b, m1.16b, #8; \
56 ext T2.16b, m3.16b, m3.16b, #8; \
57 ext T4.16b, m5.16b, m5.16b, #8; \
58 ext T6.16b, m7.16b, m7.16b, #8; \
59 pmull r0.1q, m0.1d, m1.1d; \
60 pmull r2.1q, m2.1d, m3.1d; \
61 pmull r4.1q, m4.1d, m5.1d; \
62 pmull r6.1q, m6.1d, m7.1d; \
63 pmull T1.1q, m0.1d, T0.1d; \
64 pmull T3.1q, m2.1d, T2.1d; \
65 pmull T5.1q, m4.1d, T4.1d; \
66 pmull T7.1q, m6.1d, T6.1d; \
67 pmull2 T0.1q, m0.2d, T0.2d; \
68 pmull2 T2.1q, m2.2d, T2.2d; \
69 pmull2 T4.1q, m4.2d, T4.2d; \
70 pmull2 T6.1q, m6.2d, T6.2d; \
71 pmull2 r1.1q, m0.2d, m1.2d; \
72 pmull2 r3.1q, m2.2d, m3.2d; \
73 pmull2 r5.1q, m4.2d, m5.2d; \
74 pmull2 r7.1q, m6.2d, m7.2d; \
75 eor T0.16b, T0.16b, T1.16b; \
76 eor T2.16b, T2.16b, T3.16b; \
77 eor T4.16b, T4.16b, T5.16b; \
78 eor T6.16b, T6.16b, T7.16b; \
79 ext T1.16b, RZERO.16b, T0.16b, #8; \
80 ext T3.16b, RZERO.16b, T2.16b, #8; \
81 ext T5.16b, RZERO.16b, T4.16b, #8; \
82 ext T7.16b, RZERO.16b, T6.16b, #8; \
83 ext T0.16b, T0.16b, RZERO.16b, #8; \
84 ext T2.16b, T2.16b, RZERO.16b, #8; \
85 ext T4.16b, T4.16b, RZERO.16b, #8; \
86 ext T6.16b, T6.16b, RZERO.16b, #8; \
87 eor r0.16b, r0.16b, T1.16b; \
88 eor r2.16b, r2.16b, T3.16b; \
89 eor r4.16b, r4.16b, T5.16b; \
90 eor r6.16b, r6.16b, T7.16b; \
91 eor r1.16b, r1.16b, T0.16b; \
92 eor r3.16b, r3.16b, T2.16b; \
93 eor r5.16b, r5.16b, T4.16b; \
94 eor r7.16b, r7.16b, T6.16b;
97 * input: r0:r1 (low 128-bits in r0, high in r1)
101 pmull2 T0.1q, r1.2d, rconst.2d; \
102 ext T1.16b, T0.16b, RZERO.16b, #8; \
103 ext T0.16b, RZERO.16b, T0.16b, #8; \
104 eor r1.16b, r1.16b, T1.16b; \
105 eor r0.16b, r0.16b, T0.16b; \
106 pmull T0.1q, r1.1d, rconst.1d; \
107 eor a.16b, r0.16b, T0.16b;
110 rev32 b0.16b, b0.16b; \
111 ext T0.16b, m1.16b, m1.16b, #8; \
113 pmull r0.1q, m0.1d, m1.1d; \
115 pmull T1.1q, m0.1d, T0.1d; \
117 pmull2 T0.1q, m0.2d, T0.2d; \
119 pmull2 r1.1q, m0.2d, m1.2d; \
121 eor T0.16b, T0.16b, T1.16b; \
123 ext T1.16b, RZERO.16b, T0.16b, #8; \
125 ext T0.16b, T0.16b, RZERO.16b, #8; \
127 eor r0.16b, r0.16b, T1.16b; \
129 eor r1.16b, r1.16b, T0.16b; \
130 ext b0.16b, b0.16b, b0.16b, #8; \
131 rev32 b0.16b, b0.16b;
137 rev32 b0.16b, b0.16b; \
138 rev32 b1.16b, b1.16b; \
139 rev32 b2.16b, b2.16b; \
140 ext T0.16b, m1.16b, m1.16b, #8; \
141 ext T2.16b, m3.16b, m3.16b, #8; \
142 ext T4.16b, m5.16b, m5.16b, #8; \
146 pmull r0.1q, m0.1d, m1.1d; \
147 pmull r2.1q, m2.1d, m3.1d; \
148 pmull r4.1q, m4.1d, m5.1d; \
152 pmull T1.1q, m0.1d, T0.1d; \
153 pmull T3.1q, m2.1d, T2.1d; \
154 pmull T5.1q, m4.1d, T4.1d; \
158 pmull2 T0.1q, m0.2d, T0.2d; \
159 pmull2 T2.1q, m2.2d, T2.2d; \
160 pmull2 T4.1q, m4.2d, T4.2d; \
164 pmull2 r1.1q, m0.2d, m1.2d; \
165 pmull2 r3.1q, m2.2d, m3.2d; \
166 pmull2 r5.1q, m4.2d, m5.2d; \
170 eor T0.16b, T0.16b, T1.16b; \
171 eor T2.16b, T2.16b, T3.16b; \
172 eor T4.16b, T4.16b, T5.16b; \
176 ext T1.16b, RZERO.16b, T0.16b, #8; \
177 ext T3.16b, RZERO.16b, T2.16b, #8; \
178 ext T5.16b, RZERO.16b, T4.16b, #8; \
182 ext T0.16b, T0.16b, RZERO.16b, #8; \
183 ext T2.16b, T2.16b, RZERO.16b, #8; \
184 ext T4.16b, T4.16b, RZERO.16b, #8; \
188 eor r0.16b, r0.16b, T1.16b; \
189 eor r2.16b, r2.16b, T3.16b; \
190 eor r4.16b, r4.16b, T5.16b; \
194 eor r1.16b, r1.16b, T0.16b; \
195 eor r3.16b, r3.16b, T2.16b; \
196 eor r5.16b, r5.16b, T4.16b; \
197 ext b0.16b, b0.16b, b0.16b, #8; \
198 ext b1.16b, b1.16b, b1.16b, #8; \
199 ext b2.16b, b2.16b, b2.16b, #8; \
200 eor r0.16b, r0.16b, r2.16b; \
201 eor r1.16b, r1.16b, r3.16b; \
202 rev32 b0.16b, b0.16b; \
203 rev32 b1.16b, b1.16b; \
204 rev32 b2.16b, b2.16b; \
205 eor r0.16b, r0.16b, r4.16b; \
206 eor r1.16b, r1.16b, r5.16b;
209 mov vctr.d[1], x9; \
210 add w6, w9, #1; \
213 rev64 vctr.16b, vctr.16b;
216 ld1 {vlen.16b}, [x7]; \
218 /* the lower 32 bits of the initial IV are always be32(1) */ \
222 mov vctr0.d[1], x9; \
223 rbit vlen.16b, vlen.16b; \
224 rev64 vctr0.16b, vctr0.16b; \
226 eor RHASH.16b, RHASH.16b, vlen.16b; \
230 rbit RHASH.16b, RHASH.16b; \
231 eor RHASH.16b, RHASH.16b, vctr0.16b;
236 /* can be the same as input v0-v3 */
272 eor RZERO.16b, RZERO.16b, RZERO.16b
275 rev32 v0.16b, RZERO.16b
278 /* H ^ 1 */
279 rbit RH1.16b, v0.16b
293 st1 {RH1.16b-RH4.16b}, [x1]
306 ld1 {RH1.16b-RH4.16b}, [x0]
308 ld1 {RHASH.16b}, [x1]
309 rbit RHASH.16b, RHASH.16b
314 eor RZERO.16b, RZERO.16b, RZERO.16b
322 ld1 {v0.16b-v3.16b}, [x2], #64
324 rbit v0.16b, v0.16b
325 rbit v1.16b, v1.16b
326 rbit v2.16b, v2.16b
327 rbit v3.16b, v3.16b
333 * (in3) * H^1 => rr6:rr7
335 eor RHASH.16b, RHASH.16b, v0.16b
342 eor RR0.16b, RR0.16b, RR2.16b
343 eor RR1.16b, RR1.16b, RR3.16b
344 eor RR0.16b, RR0.16b, RR4.16b
345 eor RR1.16b, RR1.16b, RR5.16b
346 eor RR0.16b, RR0.16b, RR6.16b
347 eor RR1.16b, RR1.16b, RR7.16b
355 sub w3, w3, #1
357 ld1 {v0.16b}, [x2], #16
358 rbit v0.16b, v0.16b
359 eor RHASH.16b, RHASH.16b, v0.16b
367 rbit RHASH.16b, RHASH.16b
391 ld1 {RH1.16b-RH4.16b}, [x6]
393 ld1 {RHASH.16b}, [x5]
394 rbit RHASH.16b, RHASH.16b
399 eor RZERO.16b, RZERO.16b, RZERO.16b
404 cmp w4, #(4 * 16)
407 sub w4, w4, #(4 * 16)
411 inc32_le128(v1) /* +1 */
415 ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64
419 eor v0.16b, v0.16b, RTMP0.16b
420 eor v1.16b, v1.16b, RTMP1.16b
421 eor v2.16b, v2.16b, RTMP2.16b
422 eor v3.16b, v3.16b, RTMP3.16b
423 st1 {v0.16b-v3.16b}, [x1], #64
427 rbit v0.16b, v0.16b
428 rbit v1.16b, v1.16b
429 rbit v2.16b, v2.16b
430 rbit v3.16b, v3.16b
436 * (in3) * H^1 => rr6:rr7
438 eor RHASH.16b, RHASH.16b, v0.16b
445 eor RR0.16b, RR0.16b, RR2.16b
446 eor RR1.16b, RR1.16b, RR3.16b
447 eor RR0.16b, RR0.16b, RR4.16b
448 eor RR1.16b, RR1.16b, RR5.16b
449 eor RR0.16b, RR0.16b, RR6.16b
450 eor RR1.16b, RR1.16b, RR7.16b
458 cmp w4, #16
461 sub w4, w4, #16
466 ld1 {RTMP0.16b}, [x2], #16
470 eor v0.16b, v0.16b, RTMP0.16b
471 st1 {v0.16b}, [x1], #16
474 rbit v0.16b, v0.16b
475 eor RHASH.16b, RHASH.16b, v0.16b
491 ld1 {v3.16b}, [x0]
495 ldrb w0, [x2], #1 /* get 1 byte from input */
498 strb w6, [x1], #1 /* store out byte */
501 ext v0.16b, v0.16b, v0.16b, #1
505 subs w4, w4, #1
509 tbl v0.16b, {v0.16b}, v3.16b
512 rbit v0.16b, v0.16b
513 eor RHASH.16b, RHASH.16b, v0.16b
530 rbit RHASH.16b, RHASH.16b
563 /* v0-v2 for building CTRs, v3-v5 for saving inputs */
602 ld1 {RH1.16b-RH3.16b}, [x6]
604 ld1 {RHASH.16b}, [x5]
605 rbit RHASH.16b, RHASH.16b
610 eor RZERO.16b, RZERO.16b, RZERO.16b
615 cmp w4, #(3 * 16)
618 sub w4, w4, #(3 * 16)
620 ld1 {v3.16b-v5.16b}, [x2], #(3 * 16)
624 rbit v6.16b, v3.16b
625 inc32_le128(v1) /* +1 */
626 rbit v7.16b, v4.16b
628 rbit v8.16b, v5.16b
630 eor RHASH.16b, RHASH.16b, v6.16b
638 eor v0.16b, v0.16b, v3.16b
639 eor v1.16b, v1.16b, v4.16b
640 eor v2.16b, v2.16b, v5.16b
644 st1 {v0.16b-v2.16b}, [x1], #(3 * 16)
650 cmp w4, #16
653 sub w4, w4, #16
655 ld1 {v3.16b}, [x2], #16
659 rbit v6.16b, v3.16b
661 eor RHASH.16b, RHASH.16b, v6.16b
665 eor v0.16b, v0.16b, v3.16b
669 st1 {v0.16b}, [x1], #16
683 ld1 {v3.16b}, [x0]
687 ldrb w0, [x2], #1 /* get 1 byte from input */
690 strb w6, [x1], #1 /* store out byte */
693 ext v0.16b, v0.16b, v0.16b, #1
697 subs w4, w4, #1
701 tbl v0.16b, {v0.16b}, v3.16b
704 rbit v0.16b, v0.16b
705 eor RHASH.16b, RHASH.16b, v0.16b
722 rbit RHASH.16b, RHASH.16b