arm64/crypto/polyval-ce-core.S

1 /* SPDX-License-Identifier: GPL-2.0 */
10  * ..., h^1 in the POLYVAL finite field. This precomputation allows us to split
14  * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
18  * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
20  * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
22  * two-step process only requires 1 finite field reduction for every 8
// Let the assembler accept ARMv8-A Crypto Extensions instructions; the
// pmull/pmull2 forms producing a .1q result used below depend on them.
65 	.arch	armv8-a+crypto
72  * Computes the product of two 128-bit polynomials in X and Y and XORs the
73  * components of the 256-bit product into LO, MI, HI.
84  * Later, the 256-bit result can be extracted as:
// Accumulating 128x128-bit carry-less multiply using three 64x64 multiplies.
// X, Y, LO, MI and HI are symbolic register names defined outside this
// excerpt.  Scratch registers written here: v25, v26, v27, v28, v29.
//
// ext #8 of a register with itself swaps its two 64-bit halves, so after the
// eor that follows, both halves of v25 hold X_hi ^ X_lo (likewise v26 for Y).
96 	ext	v25.16b, X.16b, X.16b, #8
97 	ext	v26.16b, Y.16b, Y.16b, #8
98 	eor	v25.16b, v25.16b, X.16b
99 	eor	v26.16b, v26.16b, Y.16b
// v28 = X_hi * Y_hi (pmull2 multiplies the upper 64-bit lanes)
100 	pmull2	v28.1q, X.2d, Y.2d
// v29 = X_lo * Y_lo (pmull multiplies the lower 64-bit lanes)
101 	pmull	v29.1q, X.1d, Y.1d
// v27 = (X_hi ^ X_lo) * (Y_hi ^ Y_lo), the Karatsuba-style middle product
102 	pmull	v27.1q, v25.1d, v26.1d
// XOR the three partial products into the running accumulators.
103 	eor	HI.16b, HI.16b, v28.16b
104 	eor	LO.16b, LO.16b, v29.16b
105 	eor	MI.16b, MI.16b, v27.16b
// Same three-multiply product as the accumulating sequence earlier in the
// file, but the results overwrite HI, LO and MI instead of being XORed in,
// so the accumulators need no prior zeroing.  Scratch registers: v25, v26.
//
// v25 = X with both halves set to X_hi ^ X_lo; v26 likewise for Y.
117 	ext	v25.16b, X.16b, X.16b, #8
118 	ext	v26.16b, Y.16b, Y.16b, #8
119 	eor	v25.16b, v25.16b, X.16b
120 	eor	v26.16b, v26.16b, Y.16b
// HI = X_hi * Y_hi
121 	pmull2	HI.1q, X.2d, Y.2d
// LO = X_lo * Y_lo
122 	pmull	LO.1q, X.1d, Y.1d
// MI = (X_hi ^ X_lo) * (Y_hi ^ Y_lo)
123 	pmull	MI.1q, v25.1d, v26.1d
129  * Computes the 256-bit polynomial represented by LO, HI, MI. Stores
// Recombine the three partial products into the 256-bit value PH : PL.
// Notation: [a : b] is a 128-bit register with a in the upper 64 bits and b
// in the lower 64 bits; R_1 / R_0 are the upper / lower halves of R; "+" is
// XOR.  Scratch registers: v4, v5.  HI and LO are modified (halves swapped).
//
// v4 = [HI_1 + MI_1 : HI_0 + MI_0]
136 	eor	v4.16b, HI.16b, MI.16b
// v4 = [HI_1 + MI_1 + LO_1 : HI_0 + MI_0 + LO_0]
138 	eor	v4.16b, v4.16b, LO.16b
// v5 = [HI_0 : LO_1]
140 	ext	v5.16b, LO.16b, HI.16b, #8
// v4 = [HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0]
142 	eor	v4.16b, v4.16b, v5.16b
// HI = [HI_0 : HI_1] (halves swapped)
144 	ext	HI.16b, HI.16b, HI.16b, #8
// LO = [LO_0 : LO_1] (halves swapped)
146 	ext	LO.16b, LO.16b, LO.16b, #8
// PH = [HI_1 : HI_1 + HI_0 + MI_1 + LO_1]
148 	ext	PH.16b, v4.16b, HI.16b, #8
// PL = [HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
150 	ext	PL.16b, LO.16b, v4.16b, #8
154  * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
157  * x^128 + x^127 + x^126 + x^121 + 1.
159  * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
160  * product of two 128-bit polynomials in Montgomery form.  We need to reduce it
173  * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x).  Adding this to
175  * = T_1 : T_0 = g*(x) * P_0.  Thus, bits 0-63 got "folded" into bits 64-191.
177  * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
178  * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
179  * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
// Two-fold reduction of PH : PL = P_3 : P_2 : P_1 : P_0 into DEST, following
// the folding scheme described in the comment above.  GSTAR presumably holds
// g*(x) in both 64-bit halves (it is defined outside this excerpt -- confirm).
// PH and TMP_V are overwritten.
//
// TMP_V = T_1 : T_0 = P_0 * g*(x)
195 	pmull	TMP_V.1q, PL.1d, GSTAR.1d
// TMP_V = T_0 : T_1 (halves swapped)
197 	ext	TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
// TMP_V = P_1 + T_0 : P_0 + T_1
199 	eor	TMP_V.16b, PL.16b, TMP_V.16b
// PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
201 	eor	PH.16b, PH.16b, TMP_V.16b
// TMP_V = (P_1 + T_0) * g*(x), taken from the upper lanes by pmull2
203 	pmull2	TMP_V.1q, TMP_V.2d, GSTAR.2d
// DEST = PH + TMP_V
204 	eor	DEST.16b, PH.16b, TMP_V.16b
213  * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
// Clear the three partial-product accumulators (r ^ r = 0).
219 	eor		LO.16b, LO.16b, LO.16b
220 	eor		MI.16b, MI.16b, MI.16b
221 	eor		HI.16b, HI.16b, HI.16b
// Fetch eight consecutive 16-byte message blocks into M0..M7; MSG is
// post-incremented by 128 bytes in total.
223 	ld1		{M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
224 	ld1		{M4.16b, M5.16b, M6.16b, M7.16b}, [MSG], #64
// The next six instructions are the same reduction sequence as the
// stand-alone reduction earlier in the file, with SUM as the destination.
// NOTE(review): the embedded listing numbers step by five here, so other
// statements that are not visible in this excerpt appear to sit in between.
//
// TMP_V = low64(PL) * low64(GSTAR)
228 	pmull	TMP_V.1q, PL.1d, GSTAR.1d
// swap the 64-bit halves of TMP_V
233 	ext	TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
// TMP_V ^= PL
238 	eor	TMP_V.16b, PL.16b, TMP_V.16b
// PH ^= TMP_V
243 	eor	PH.16b, PH.16b, TMP_V.16b
// TMP_V = high64(TMP_V) * high64(GSTAR)
248 	pmull2	TMP_V.1q, TMP_V.2d, GSTAR.2d
// SUM = PH ^ TMP_V, the reduced value of the previous PH : PL
253 	eor	SUM.16b, PH.16b, TMP_V.16b
// M0 ^= SUM: folds the reduced previous result into the first message block,
// forming the (m_0 + REDUCE(PL, PH)) term of the expression documented above.
257 	eor	M0.16b, M0.16b, SUM.16b
// NOTE(review): the embedded listing numbers jump between these lines, so the
// labels, branches and multiply steps that belong here are not visible in
// this excerpt; only the loads and bookkeeping are documented.
//
// Load one 16-byte key power and one 16-byte message block, advancing both
// pointers by 16 bytes.
269 	ld1	{KEY1.16b}, [KEY_POWERS], #16
271 	ld1	{TMP_V.16b}, [MSG], #16
// XOR the message block into the running value in SUM.
272 	eor	SUM.16b, SUM.16b, TMP_V.16b
// One block has been consumed.
274 	sub	BLOCKS_LEFT, BLOCKS_LEFT, #1
// Four message blocks and four key registers (64 bytes each, post-incremented).
278 	ld1	{M0.16b, M1.16b,  M2.16b, M3.16b}, [MSG], #64
279 	ld1	{KEY8.16b, KEY7.16b, KEY6.16b,	KEY5.16b}, [KEY_POWERS], #64
// Two message blocks and two key registers (32 bytes each).
287 	ld1	{M0.16b, M1.16b}, [MSG], #32
288 	ld1	{KEY8.16b, KEY7.16b}, [KEY_POWERS], #32
// Sets the flags from BLOCKS_LEFT & 1.  The consumer of the flags is not
// visible here; presumably a branch skips the single-block load below when
// the bit is clear -- confirm against the full file.
292 	tst	BLOCKS_LEFT, #1
// One message block and one key register (16 bytes each).
294 	ld1	{M0.16b}, [MSG], #16
295 	ld1	{KEY8.16b}, [KEY_POWERS], #16
305  * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1
// Load two 16-byte operands: v0 from the address in x0 and v1 from the
// address in x1 (presumably op1 and op2 of the product described in the
// comment above -- confirm against the C prototype).
314 	ld1	{v0.16b}, [x0]
315 	ld1	{v1.16b}, [x1]
// Store the 16-byte value in SUM back to the buffer x0 points at, i.e. the
// first operand is overwritten.  The instructions that compute SUM are not
// visible in this excerpt.
319 	st1	{SUM.16b}, [x0]
325  *	h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
328  * x0 - pointer to precomputed key powers h^8 ... h^1
329  * x1 - pointer to message blocks
330  * x2 - number of blocks to hash
331  * x3 - pointer to accumulator
// Start from the 16-byte accumulator value stored at ACCUMULATOR
// (presumably an alias for x3, per the argument list above -- confirm).
340 	ld1	{SUM.16b}, [ACCUMULATOR]
// Load eight 16-byte key registers (128 bytes in total, KEY_POWERS is
// post-incremented).  The argument list above gives the table layout as
// h^8 ... h^1, so presumably KEYn receives h^n.
343 	ld1	{KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
344 	ld1	{KEY4.16b, KEY3.16b, KEY2.16b, KEY1.16b}, [KEY_POWERS], #64
// Expand full_stride with argument 1.  The macro is defined outside this
// excerpt.  NOTE(review): the argument presumably selects the variant that
// also reduces the previous stride's product -- confirm at the definition.
349 	full_stride 1
// Write the updated accumulator back for the caller.
359 	st1	{SUM.16b}, [ACCUMULATOR]