Lines Matching +full:4 +full:- +full:16

1 // SPDX-License-Identifier: GPL-2.0
27 * number of blocks of output with nonce 0, taking an input key and 8-bytes
30 * This implementation avoids d8-d15 because they are callee-save in user
39 * x1: 32-byte key input
40 * x2: 8-byte counter input/output
41 * x3: number of 64-byte blocks to write to output
45 /* copy0 = "expand 32-byte k" */
52 ld1 { copy1.4s, copy2.4s }, [x1]
57 uzp1 one_v.4s, one_v.4s, one_v.4s
61 mov state0.16b, copy0.16b
62 mov state1.16b, copy1.16b
63 mov state2.16b, copy2.16b
64 mov state3.16b, copy3.16b
69 * Permute one 64-byte block where the state matrix is stored in the four NEON
70 * registers state0-state3. It performs matrix operations on four words in parallel,
75 /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
76 add state0.4s, state0.4s, state1.4s
77 eor state3.16b, state3.16b, state0.16b
81 add state2.4s, state2.4s, state3.4s
82 eor tmp.16b, state1.16b, state2.16b
83 shl state1.4s, tmp.4s, #12
84 sri state1.4s, tmp.4s, #20
87 add state0.4s, state0.4s, state1.4s
88 eor tmp.16b, state3.16b, state0.16b
89 shl state3.4s, tmp.4s, #8
90 sri state3.4s, tmp.4s, #24
93 add state2.4s, state2.4s, state3.4s
94 eor tmp.16b, state1.16b, state2.16b
95 shl state1.4s, tmp.4s, #7
96 sri state1.4s, tmp.4s, #25
99 ext state1.16b, state1.16b, state1.16b, #4
101 ext state2.16b, state2.16b, state2.16b, #8
103 ext state3.16b, state3.16b, state3.16b, #12
105 /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
106 add state0.4s, state0.4s, state1.4s
107 eor state3.16b, state3.16b, state0.16b
111 add state2.4s, state2.4s, state3.4s
112 eor tmp.16b, state1.16b, state2.16b
113 shl state1.4s, tmp.4s, #12
114 sri state1.4s, tmp.4s, #20
117 add state0.4s, state0.4s, state1.4s
118 eor tmp.16b, state3.16b, state0.16b
119 shl state3.4s, tmp.4s, #8
120 sri state3.4s, tmp.4s, #24
123 add state2.4s, state2.4s, state3.4s
124 eor tmp.16b, state1.16b, state2.16b
125 shl state1.4s, tmp.4s, #7
126 sri state1.4s, tmp.4s, #25
129 ext state1.16b, state1.16b, state1.16b, #12
131 ext state2.16b, state2.16b, state2.16b, #8
133 ext state3.16b, state3.16b, state3.16b, #4
139 add state0.4s, state0.4s, copy0.4s
141 add state1.4s, state1.4s, copy1.4s
143 add state2.4s, state2.4s, copy2.4s
145 add state3.4s, state3.4s, copy3.4s
146 st1 { state0.16b - state3.16b }, [x0]
154 /* output += 64, --nblocks */
163 movi state0.16b, #0
164 movi state1.16b, #0
165 movi state2.16b, #0
166 movi state3.16b, #0
167 movi copy1.16b, #0
168 movi copy2.16b, #0