Lines Matching +full:1 +full:- +full:16
1 /* SPDX-License-Identifier: GPL-2.0 */
10 * ..., h^1 in the POLYVAL finite field. This precomputation allows us to split
14 * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
18 * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
20 * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
22 * two-step process only requires 1 finite field reduction for every 8
65 .arch armv8-a+crypto
72 * Computes the product of two 128-bit polynomials in X and Y and XORs the
73 * components of the 256-bit product into LO, MI, HI.
84 * Later, the 256-bit result can be extracted as:
96 ext v25.16b, X.16b, X.16b, #8
97 ext v26.16b, Y.16b, Y.16b, #8
98 eor v25.16b, v25.16b, X.16b
99 eor v26.16b, v26.16b, Y.16b
100 pmull2 v28.1q, X.2d, Y.2d
101 pmull v29.1q, X.1d, Y.1d
102 pmull v27.1q, v25.1d, v26.1d
103 eor HI.16b, HI.16b, v28.16b
104 eor LO.16b, LO.16b, v29.16b
105 eor MI.16b, MI.16b, v27.16b
117 ext v25.16b, X.16b, X.16b, #8
118 ext v26.16b, Y.16b, Y.16b, #8
119 eor v25.16b, v25.16b, X.16b
120 eor v26.16b, v26.16b, Y.16b
121 pmull2 HI.1q, X.2d, Y.2d
122 pmull LO.1q, X.1d, Y.1d
123 pmull MI.1q, v25.1d, v26.1d
129 * Computes the 256-bit polynomial represented by LO, HI, MI. Stores
136 eor v4.16b, HI.16b, MI.16b
138 eor v4.16b, v4.16b, LO.16b
140 ext v5.16b, LO.16b, HI.16b, #8
142 eor v4.16b, v4.16b, v5.16b
144 ext HI.16b, HI.16b, HI.16b, #8
146 ext LO.16b, LO.16b, LO.16b, #8
148 ext PH.16b, v4.16b, HI.16b, #8
150 ext PL.16b, LO.16b, v4.16b, #8
154 * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
157 * x^128 + x^127 + x^126 + x^121 + 1.
159 * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
160 * product of two 128-bit polynomials in Montgomery form. We need to reduce it
173 * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to
175 * = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191.
177 * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
178 * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
179 * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
195 pmull TMP_V.1q, PL.1d, GSTAR.1d
197 ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
199 eor TMP_V.16b, PL.16b, TMP_V.16b
201 eor PH.16b, PH.16b, TMP_V.16b
203 pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d
204 eor DEST.16b, PH.16b, TMP_V.16b
213 * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
219 eor LO.16b, LO.16b, LO.16b
220 eor MI.16b, MI.16b, MI.16b
221 eor HI.16b, HI.16b, HI.16b
223 ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
224 ld1 {M4.16b, M5.16b, M6.16b, M7.16b}, [MSG], #64
228 pmull TMP_V.1q, PL.1d, GSTAR.1d
233 ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
238 eor TMP_V.16b, PL.16b, TMP_V.16b
243 eor PH.16b, PH.16b, TMP_V.16b
248 pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d
253 eor SUM.16b, PH.16b, TMP_V.16b
257 eor M0.16b, M0.16b, SUM.16b
269 ld1 {KEY1.16b}, [KEY_POWERS], #16
271 ld1 {TMP_V.16b}, [MSG], #16
272 eor SUM.16b, SUM.16b, TMP_V.16b
274 sub BLOCKS_LEFT, BLOCKS_LEFT, #1
278 ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
279 ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
287 ld1 {M0.16b, M1.16b}, [MSG], #32
288 ld1 {KEY8.16b, KEY7.16b}, [KEY_POWERS], #32
292 tst BLOCKS_LEFT, #1
294 ld1 {M0.16b}, [MSG], #16
295 ld1 {KEY8.16b}, [KEY_POWERS], #16
305 * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1
314 ld1 {v0.16b}, [x0]
315 ld1 {v1.16b}, [x1]
319 st1 {SUM.16b}, [x0]
325 * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
328 * x0 - pointer to precomputed key powers h^8 ... h^1
329 * x1 - pointer to message blocks
330 * x2 - number of blocks to hash
331 * x3 - pointer to accumulator
340 ld1 {SUM.16b}, [ACCUMULATOR]
343 ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
344 ld1 {KEY4.16b, KEY3.16b, KEY2.16b, KEY1.16b}, [KEY_POWERS], #64
349 full_stride 1
359 st1 {SUM.16b}, [ACCUMULATOR]