alpha/lib/ev6-memcpy.S

1 /* SPDX-License-Identifier: GPL-2.0 */
3  * arch/alpha/lib/ev6-memcpy.S
4  * 21264 version by Rick Gorton <rick.gorton@alpha-processor.com>
8  *	- memory accessed as aligned quadwords only
9  *	- uses bcmpge to compare 8 bytes in parallel
14  *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
16  *	E	- either cluster
17  *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
18  *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
21  *	$1,$2,		- scratch
34 	mov	$16, $0			# E : copy dest to return
36 	xor	$16, $17, $1		# E : are source and dest alignments the same?
37 	and	$1, 7, $1		# E : are they the same mod 8?
39 	bne	$1, $misaligned		# U : Nope - gotta do this the slow way
41 	and	$16, 7, $1		# E : Are both 0mod8?
42 	beq	$1, $both_0mod8		# U : Yes
52 	ldbu	$1, 0($17)		# L : grab a byte
53 	subq	$18, 1, $18		# E : count--
54 	addq	$17, 1, $17		# E : src++
55 	stb	$1, 0($16)		# L :
56 	addq	$16, 1, $16		# E : dest++
57 	and	$16, 7, $1		# E : Are we at 0mod8 yet?
59 	bne	$1, $head_align		# U :
62 	cmple	$18, 127, $1		# E : Can we unroll the loop?
63 	bne	$1, $no_unroll		# U :
64 	and	$16, 63, $1		# E : get mod64 alignment
65 	beq	$1, $do_unroll		# U : no single quads to fiddle
68 	ldq	$1, 0($17)		# L : get 8 bytes
69 	subq	$18, 8, $18		# E : count -= 8
73 	stq	$1, 0($16)		# L : store
74 	addq	$16, 8, $16		# E : dest += 8
75 	and	$16, 63, $1		# E : get mod64 alignment
76 	bne	$1, $single_head_quad	# U : still not fully aligned
79 	addq	$16, 64, $7		# E : Initial (+1 trip) wh64 address
80 	cmple	$18, 127, $1		# E : Can we go through the unrolled loop?
81 	bne	$1, $tail_quads		# U : Nope
86 					# ($7) are about to be over-written
92 	ldq	$5, 16($17)		# L : bytes 16..23
97 	addq	$16, 64, $1		# E : fallback value for wh64
102 	stq	$6, 0($16)		# L : bytes 0..7
106 	stq	$4, 8($16)		# L : bytes 8..15
107 	stq	$5, 16($16)		# L : bytes 16..23
111 	stq	$3, 24($16)		# L : bytes 24..31
112 	addq	$16, 32, $16		# E : dest += 32 bytes
118 	cmovlt	$2, $1, $7		# E : Latency 2, extra map slot - Use
122 	ldq	$5, 16($17)		# L : bytes 16..23
124 	addq	$16, 32, $16		# E : dest += 32
125 	subq	$18, 64, $18		# E : count -= 64
128 	stq	$6, -32($16)		# L : bytes 0..7
129 	stq	$4, -24($16)		# L : bytes 8..15
130 	cmple	$18, 63, $1		# E : At least one more trip?
132 	stq	$5, -16($16)		# L : bytes 16..23
133 	stq	$3, -8($16)		# L : bytes 24..31
135 	beq	$1, $unroll_body
146 	ldq	$1, 0($17)		# L : fetch 8
147 	subq	$18, 8, $18		# E : count -= 8
151 	stq	$1, 0($16)		# L : store 8
152 	addq	$16, 8, $16		# E : dest += 8
159 	ble	$18, $nomoredata	# U : All-done
165 	subq	$18, 1, $18		# E : count--
166 	ldbu	$1, 0($17)		# L : fetch a byte
167 	addq	$17, 1, $17		# E : src++
170 	stb	$1, 0($16)		# L : store a byte
171 	addq	$16, 1, $16		# E : dest++
176 	ret	$31, ($26), 1		# L0 :
183 	and	$0, 7, $1		# E : dest alignment mod8
184 	beq	$1, $dest_0mod8		# U : life doesnt totally suck
189 	ldbu	$1, 0($17)		# L : fetch a byte
190 	subq	$18, 1, $18		# E : count--
191 	addq	$17, 1, $17		# E : src++
193 	stb	$1, 0($4)		# L : store it
194 	addq	$4, 1, $4		# E : dest++
195 	and	$4, 7, $1		# E : dest 0mod8 yet?
196 	bne	$1, $aligndest		# U : go until we are aligned.
206 	ldq_u	$16, 8($17)		# L : Fetch next 8
208 	extqh	$16, $17, $1		# U : masking
209 	bis	$3, $1, $1		# E : merged bytes to store
211 	subq	$18, 8, $18		# E : count -= 8
213 	stq	$1, 0($4)		# L : store 8 (aligned)
214 	mov	$16, $3			# E : "rotate" source data
228 	ldbu	$1, 0($17)		# L : fetch 1
229 	subq	$18, 1, $18		# E : count--
230 	addq	$17, 1, $17		# E : src++
233 	stb	$1, 0($4)		# L : store
234 	addq	$4, 1, $4		# E : dest++
240 	ret	$31, ($26), 1		# L0 :