1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
4 */
5
6#include <linux/linkage.h>
7#include <asm/frame.h>
8
/*
 * First row of the ChaCha state: the ASCII string "expand 32-byte k" laid
 * out as four little-endian 32-bit words. Kept 16-byte aligned because it
 * is fetched with an aligned 128-bit load.
 */
.section	.rodata, "a"
.p2align	4
CONSTANTS:	.long 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
.text
13
/*
 * Minimal SSE2 ChaCha20 block generator (20 rounds, all-zero nonce). The
 * complete working state is held in registers: the routine performs no stack
 * stores, so key material is never spilled to the stack here. Arguments:
 *
 *	rdi: destination; 64 bytes are written per block
 *	rsi: 32-byte key (read with unaligned loads)
 *	rdx: pointer to the 8-byte block counter; read on entry, and on exit
 *	     overwritten with the counter advanced by one per block produced
 *	rcx: how many 64-byte blocks to produce; must be at least 1, because
 *	     the block loop only tests the count after decrementing it
 *
 * Clobbers rax, rcx, rdi, xmm0-xmm9 and the flags. The xmm registers that
 * held the key, the working state and the scratch value are zeroed before
 * returning.
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)

/* Argument registers. */
.set	dst,		%rdi
.set	key,		%rsi
.set	ctr,		%rdx
.set	nblk,		%rcx
/* Double-round countdown, kept in the low byte of rax. */
.set	rounds,		%al
/* xmm registers are caller-saved, so none of them needs preserving. */
.set	tmp,		%xmm0
/* Working state, one 4x32-bit row per register. */
.set	row0,		%xmm1
.set	row1,		%xmm2
.set	row2,		%xmm3
.set	row3,		%xmm4
/* Input state, kept intact for the per-block reload and final addition. */
.set	init0,		%xmm5
.set	init1,		%xmm6
.set	init2,		%xmm7
.set	init3,		%xmm8
/* Counter increment: 1 in the low qword, 0 in the high qword. */
.set	incr,		%xmm9

/*
 * sum += addend; mix = rotl32(mix ^ sum, bits), on all four 32-bit lanes.
 * The rotate is assembled from a left shift, a right shift and an OR, with
 * tmp as scratch.
 */
.macro	add_xor_rotl	sum, addend, mix, bits
	paddd		\addend,\sum
	pxor		\sum,\mix
	movdqa		\mix,tmp
	pslld		$\bits,tmp
	psrld		$(32-\bits),\mix
	por		tmp,\mix
.endm

/* The 16/12/8/7 add-xor-rotate sequence applied across row0..row3. */
.macro	quarter_rounds
	add_xor_rotl	row0, row1, row3, 16
	add_xor_rotl	row2, row3, row1, 12
	add_xor_rotl	row0, row1, row3, 8
	add_xor_rotl	row2, row3, row1, 7
.endm

	/* incr = 1 || 0 */
	movq		$1,%rax
	movq		%rax,incr
	/* init0 = "expand 32-byte k" */
	movaps		CONSTANTS(%rip),init0
	/* init1,init2 = the two halves of the key */
	movups		0x00(key),init1
	movups		0x10(key),init2
	/* init3 = counter || zero nonce (movq clears the upper qword) */
	movq		0x00(ctr),init3

.Lnext_block:
	/* Every block starts from the saved input state. */
	movdqa		init0,row0
	movdqa		init1,row1
	movdqa		init2,row2
	movdqa		init3,row3

	/* 10 iterations of two rounds each = 20 rounds. */
	movb		$10,rounds
.Ldouble_round:
	/* First round of the pair, with the rows in their natural layout. */
	quarter_rounds

	/* row1[0,1,2,3] = row1[1,2,3,0] */
	pshufd		$0x39,row1,row1
	/* row2[0,1,2,3] = row2[2,3,0,1] */
	pshufd		$0x4e,row2,row2
	/* row3[0,1,2,3] = row3[3,0,1,2] */
	pshufd		$0x93,row3,row3

	/* Second round of the pair, on the lane-rotated rows. */
	quarter_rounds

	/* Undo the lane rotation: row1[0,1,2,3] = row1[3,0,1,2] */
	pshufd		$0x93,row1,row1
	/* row2[0,1,2,3] = row2[2,3,0,1] */
	pshufd		$0x4e,row2,row2
	/* row3[0,1,2,3] = row3[1,2,3,0] */
	pshufd		$0x39,row3,row3

	decb		rounds
	jnz		.Ldouble_round

	/* Add the input state back into the permuted state. */
	paddd		init0,row0
	paddd		init1,row1
	paddd		init2,row2
	paddd		init3,row3

	/* Write the 64-byte block with unaligned stores. */
	movups		row0,0x00(dst)
	movups		row1,0x10(dst)
	movups		row2,0x20(dst)
	movups		row3,0x30(dst)

	/* 64-bit add of 1 to the counter; the nonce qword has 0 added. */
	paddq		incr,init3

	/* Advance to the next output block; loop until the count reaches 0. */
	addq		$64,dst
	decq		nblk
	jnz		.Lnext_block

	/* Hand the advanced counter back through the caller's pointer. */
	movq		init3,0x00(ctr)

	/*
	 * Scrub the registers that carried the key, the working state or the
	 * scratch value, in case nothing else overwrites them afterwards.
	 */
	pxor		tmp,tmp
	pxor		row0,row0
	pxor		row1,row1
	pxor		row2,row2
	pxor		row3,row3
	pxor		init1,init1
	pxor		init2,init2

	ret
SYM_FUNC_END(__arch_chacha20_blocks_nostack)
179