1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2024 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved.
4 */
5
6#include <asm/asm.h>
7#include <asm/regdef.h>
8#include <linux/linkage.h>
9
10.text
11
/*
 * OP_4REG - emit one three-operand instruction four times, once per lane.
 *
 *	op:		instruction mnemonic
 *	dst0..dst3:	registers used as both first source and destination
 *	src0..src3:	second operand of each lane
 *
 * Expands to "op dstN, dstN, srcN" for N = 0, 1, 2, 3, in that order.
 * Arguments are taken positionally.
 */
12.macro	OP_4REG	op, dst0, dst1, dst2, dst3, src0, src1, src2, src3
13	\op	\dst0, \dst0, \src0
14	\op	\dst1, \dst1, \src1
15	\op	\dst2, \dst2, \src2
16	\op	\dst3, \dst3, \src3
17.endm
18
19/*
20 * Very basic LoongArch implementation of ChaCha20. Produces a given positive
21 * number of blocks of output with a nonce of 0, taking an input key and
22 * 8-byte counter. Importantly does not spill to the stack. Its arguments
23 * are:
24 *
25 *	a0: output bytes
26 *	a1: 32-byte key input
27 *	a2: 8-byte counter input/output
28 *	a3: number of 64-byte blocks to write to output
 *
 * The counter is read once on entry as two 32-bit words, incremented by one
 * for every block produced, and written back once after the last block.
 *
 * Besides the saved-and-restored s0-s9 (s9 being fp), the code writes
 * a4-a7 and t0-t7. a0 is advanced past the bytes written and a3 is counted
 * down to zero; a1 and a2 are only read.
29 */
30SYM_FUNC_START(__arch_chacha20_blocks_nostack)
31
32/* We don't need a frame pointer, so fp is used as a tenth saved register */
33#define s9		fp
34
/*
 * Symbolic register names. state0-state15 hold the sixteen 32-bit words of
 * the block being computed, cnt_lo/cnt_hi the two counter words, and
 * copy0-copy3 the four constant words that are added back after the rounds.
 */
35#define output		a0
36#define key		a1
37#define counter		a2
38#define nblocks		a3
39#define i		a4
40#define state0		s0
41#define state1		s1
42#define state2		s2
43#define state3		s3
44#define state4		s4
45#define state5		s5
46#define state6		s6
47#define state7		s7
48#define state8		s8
49#define state9		s9
50#define state10		a5
51#define state11		a6
52#define state12		a7
53#define state13		t0
54#define state14		t1
55#define state15		t2
56#define cnt_lo		t3
57#define cnt_hi		t4
58#define copy0		t5
59#define copy1		t6
60#define copy2		t7
61
62/* Reuse i as copy3: copy3 is only needed once the round loop is done with i */
63#define copy3		i
64
65/* Packs to be used with OP_4REG: one row of four state words each */
66#define line0		state0, state1, state2, state3
67#define line1		state4, state5, state6, state7
68#define line2		state8, state9, state10, state11
69#define line3		state12, state13, state14, state15
70
/*
 * Rows 1, 2 and 3 rotated left by 1, 2 and 3 words. Used lane by lane
 * together with line0, they pair up the words (0,5,10,15), (1,6,11,12),
 * (2,7,8,13) and (3,4,9,14).
 */
71#define line1_perm	state5, state6, state7, state4
72#define line2_perm	state10, state11, state8, state9
73#define line3_perm	state15, state12, state13, state14
74
75#define copy		copy0, copy1, copy2, copy3
76
/*
 * Per-lane immediates for rotri.w (rotate right). On 32-bit words these
 * amounts are the same as rotating left by 16, 12, 8 and 7.
 */
77#define _16		16, 16, 16, 16
78#define _20		20, 20, 20, 20
79#define _24		24, 24, 24, 24
80#define _25		25, 25, 25, 25
81
82	/*
83	 * The ABI requires s0-s9 to be saved, and sp to be aligned to 16 bytes.
84	 * This does not violate the stack-less requirement: no sensitive data
85	 * is spilled onto the stack.
	 *
	 * The frame size is -SZREG * 10 masked with STACK_ALIGN. STACK_ALIGN
	 * is defined outside this file; presumably it is the mask that rounds
	 * the size to the stack alignment -- confirm against asm/asm.h.
86	 */
87	PTR_ADDI	sp, sp, (-SZREG * 10) & STACK_ALIGN
88	REG_S		s0, sp, 0
89	REG_S		s1, sp, SZREG
90	REG_S		s2, sp, SZREG * 2
91	REG_S		s3, sp, SZREG * 3
92	REG_S		s4, sp, SZREG * 4
93	REG_S		s5, sp, SZREG * 5
94	REG_S		s6, sp, SZREG * 6
95	REG_S		s7, sp, SZREG * 7
96	REG_S		s8, sp, SZREG * 8
97	REG_S		s9, sp, SZREG * 9
98
	/*
	 * copy[0,1,2] = "expa", "nd 3", "2-by". copy[3] cannot be set up yet
	 * because its register is about to be used as the loop counter i.
	 */
99	li.w		copy0, 0x61707865
100	li.w		copy1, 0x3320646e
101	li.w		copy2, 0x79622d32
102
	/* Load the 8-byte counter once, as two 32-bit words */
103	ld.w		cnt_lo, counter, 0
104	ld.w		cnt_hi, counter, 4
105
	/* One pass through .Lblock produces one 64-byte block of output */
106.Lblock:
107	/* state[0,1,2,3] = "expand 32-byte k" */
108	move		state0, copy0
109	move		state1, copy1
110	move		state2, copy2
	/* Immediate instead of copy3, which is not valid at this point */
111	li.w		state3, 0x6b206574
112
113	/* state[4,5,..,11] = key */
114	ld.w		state4, key, 0
115	ld.w		state5, key, 4
116	ld.w		state6, key, 8
117	ld.w		state7, key, 12
118	ld.w		state8, key, 16
119	ld.w		state9, key, 20
120	ld.w		state10, key, 24
121	ld.w		state11, key, 28
122
123	/* state[12,13] = counter */
124	move		state12, cnt_lo
125	move		state13, cnt_hi
126
127	/* state[14,15] = 0 */
128	move		state14, zero
129	move		state15, zero
130
	/* 10 iterations of (odd round + even round) = 20 rounds */
131	li.w		i, 10
132.Lpermute:
133	/* odd round: lane k works on words k, 4+k, 8+k and 12+k */
134	OP_4REG	add.w	line0, line1
135	OP_4REG	xor	line3, line0
136	OP_4REG	rotri.w	line3, _16
137
138	OP_4REG	add.w	line2, line3
139	OP_4REG	xor	line1, line2
140	OP_4REG	rotri.w	line1, _20
141
142	OP_4REG	add.w	line0, line1
143	OP_4REG	xor	line3, line0
144	OP_4REG	rotri.w	line3, _24
145
146	OP_4REG	add.w	line2, line3
147	OP_4REG	xor	line1, line2
148	OP_4REG	rotri.w	line1, _25
149
150	/* even round: the same sequence, applied through the _perm packs */
151	OP_4REG	add.w	line0, line1_perm
152	OP_4REG	xor	line3_perm, line0
153	OP_4REG	rotri.w	line3_perm, _16
154
155	OP_4REG	add.w	line2_perm, line3_perm
156	OP_4REG	xor	line1_perm, line2_perm
157	OP_4REG	rotri.w	line1_perm, _20
158
159	OP_4REG	add.w	line0, line1_perm
160	OP_4REG	xor	line3_perm, line0
161	OP_4REG	rotri.w	line3_perm, _24
162
163	OP_4REG	add.w	line2_perm, line3_perm
164	OP_4REG	xor	line1_perm, line2_perm
165	OP_4REG	rotri.w	line1_perm, _25
166
167	addi.w		i, i, -1
168	bnez		i, .Lpermute
169
170	/*
171	 * copy[3] = "te k", materialize it here because copy[3] shares the
172	 * same register with i which just became dead.
173	 */
174	li.w		copy3, 0x6b206574
175
176	/* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */
177	OP_4REG	add.w	line0, copy
178	st.w		state0, output, 0
179	st.w		state1, output, 4
180	st.w		state2, output, 8
181	st.w		state3, output, 12
182
183	/* from now on state[0,1,2,3] are scratch registers */
184
185	/* state[0,1,2,3] = key words 0-3 (key bytes 0-15), reloaded from memory */
186	ld.w		state0, key, 0
187	ld.w		state1, key, 4
188	ld.w		state2, key, 8
189	ld.w		state3, key, 12
190
191	/* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */
192	OP_4REG	add.w	line1, line0
193	st.w		state4, output, 16
194	st.w		state5, output, 20
195	st.w		state6, output, 24
196	st.w		state7, output, 28
197
198	/* state[0,1,2,3] = key words 4-7 (key bytes 16-31), reloaded from memory */
199	ld.w		state0, key, 16
200	ld.w		state1, key, 20
201	ld.w		state2, key, 24
202	ld.w		state3, key, 28
203
204	/* output[8,9,10,11] = state[0,1,2,3] + state[8,9,10,11] */
205	OP_4REG	add.w	line2, line0
206	st.w		state8, output, 32
207	st.w		state9, output, 36
208	st.w		state10, output, 40
209	st.w		state11, output, 44
210
211	/* output[12,13,14,15] = state[12,13,14,15] + [cnt_lo, cnt_hi, 0, 0] */
	/* state[14,15] started as 0, so they are stored without an addition */
212	add.w		state12, state12, cnt_lo
213	add.w		state13, state13, cnt_hi
214	st.w		state12, output, 48
215	st.w		state13, output, 52
216	st.w		state14, output, 56
217	st.w		state15, output, 60
218
219	/*
	 * ++counter, as a 64-bit value held in two 32-bit halves: state0
	 * (scratch here) becomes 1 only when cnt_lo is 0 after the add,
	 * i.e. when the low word wrapped, and is then added to cnt_hi.
	 */
220	addi.w		cnt_lo, cnt_lo, 1
221	sltui		state0, cnt_lo, 1
222	add.w		cnt_hi, cnt_hi, state0
223
224	/* output += 64 */
225	PTR_ADDI	output, output, 64
226	/*
	 * --nblocks. The test comes after a block has been written, so the
	 * caller has to pass nblocks >= 1.
	 */
227	PTR_ADDI	nblocks, nblocks, -1
228	bnez		nblocks, .Lblock
229
230	/* counter = [cnt_lo, cnt_hi] */
231	st.w		cnt_lo, counter, 0
232	st.w		cnt_hi, counter, 4
233
234	/*
235	 * Zero out the potentially sensitive regs, in case nothing uses these
236	 * again. At this point copy[0,1,2,3] just contains "expand 32-byte k"
237	 * and state[0,...,9] are s0-s9, which we'll restore in the epilogue,
238	 * so we only need to zero state[10,...,15].
239	 */
240	move		state10, zero
241	move		state11, zero
242	move		state12, zero
243	move		state13, zero
244	move		state14, zero
245	move		state15, zero
246
	/* Restore s0-s9 and release the frame allocated in the prologue */
247	REG_L		s0, sp, 0
248	REG_L		s1, sp, SZREG
249	REG_L		s2, sp, SZREG * 2
250	REG_L		s3, sp, SZREG * 3
251	REG_L		s4, sp, SZREG * 4
252	REG_L		s5, sp, SZREG * 5
253	REG_L		s6, sp, SZREG * 6
254	REG_L		s7, sp, SZREG * 7
255	REG_L		s8, sp, SZREG * 8
256	REG_L		s9, sp, SZREG * 9
257	PTR_ADDI	sp, sp, -((-SZREG * 10) & STACK_ALIGN)
258
259	jr		ra
260SYM_FUNC_END(__arch_chacha20_blocks_nostack)
261