alpha/lib/ev6-memset.S

1 /* SPDX-License-Identifier: GPL-2.0 */
3  * arch/alpha/lib/ev6-memset.S
5  * This is an efficient (and relatively small) implementation of the C library
6  * "memset()" function for the 21264 implementation of Alpha.
8  * 21264 version  contributed by Rick Gorton <rick.gorton@alpha-processor.com>
10  * Much of the information about 21264 scheduling/coding comes from:
13  *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
15  *	E	- either cluster
16  *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
17  *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
20  * and the code has been replicated for each of the entry points: __memset
48 	 * undertake a major re-write to interleave the constant materialization
49 	 * with other parts of the fall-through code.  This is important, even
/*
 * Byte-pattern fill path (branch targets in this section carry a _b suffix).
 * NOTE(review): this listing omits lines - the entry label and a number of
 * instructions are not visible - so the notes below describe only what the
 * visible instructions establish and mark everything else as an assumption.
 *
 * Visible register roles:
 *   $16 - destination address used for the partial-quad loads and stores
 *   $17 - fill value; its low byte is replicated into a wider pattern
 *   $18 - byte count, later split into $3 (whole quads) and $18 (0..7 tail)
 *   $6  - compared against $16 for the single-quadword test; presumably the
 *         end address of the region - TODO confirm against the elided setup
 */
	/* Isolate the low byte of $17 and start replicating it across lanes. */
53 	and $17,255,$1		# E : 00000000000000ch
54 	insbl $17,1,$2		# U : 000000000000ch00
59 	bis	$1,$2,$17	# E : 000000000000chch
60 	insbl	$1,2,$3		# U : 0000000000ch0000
61 	insbl	$1,3,$4		# U : 00000000ch000000
63 	or	$3,$4,$3	# E : 00000000chch0000
64 	inswl	$17,4,$5	# U : 0000chch00000000
	/*
	 * $16 ^ $6 with its low three bits cleared is zero only when both
	 * addresses fall in the same aligned quadword; that case is handled
	 * at within_quad_b.
	 */
65 	xor	$16,$6,$1	# E : will complete write be within one quadword?
70 	bic	$1,7,$1		# E : fit within a single quadword?
74 	beq	$1,within_quad_b # U :
	/*
	 * Leading partial quadword: load the unaligned quad at $16, let mskql
	 * clear its bytes from the byte offset of $16 upward, OR in $2 and
	 * store through $5.  $2 and $5 are written by instructions that are
	 * not shown here; presumably $2 holds the positioned new bytes and $5
	 * the saved destination - TODO confirm.
	 */
81 	ldq_u $4,0($16)		# L : Fetch first partial
87 	mskql $4,$16,$4		# U : clear relevant parts of the quad
89 	bis $2,$4,$1		# E : Final bytes
92 	stq_u $1,0($5)		# L : Store result
96 .align 4
	/* Split the remaining count: $3 = $18 / 8 quads, $18 = $18 % 8 bytes. */
103 	sra $18,3,$3		# U : Number of remaining quads to write
104 	and $18,7,$18		# E : Number of trailing bytes to write
110 	 * Lifted a bunch of code from clear_user.S
113 	 * $5	A copy of $16
	/*
	 * NOTE(review): blt is taken only when $3 - 16 is negative, so the
	 * unrolled path is entered with 16 or more quads, i.e. at >= 128
	 * bytes rather than strictly > 128 as the trailing comment says.
	 * $2 is set by an instruction not shown in this view.
	 */
120 	subq	$3, 16, $4	# E : Only try to unroll if > 128 bytes
121 	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
122 	blt	$4, loop_b	# U :
125 	 * We know we've got at least 16 quads, minimum of one trip
	/*
	 * $1 == 0 branches straight to $bigalign_b.  Otherwise each pass
	 * below consumes one quad ($3 -= 1) and moves $1 up by 8, repeating
	 * while $1 is still negative.  The store performed on each pass is
	 * not visible in this listing.
	 */
133 	beq	$1, $bigalign_b	# U :
137 	subq	$3, 1, $3	# E : For consistency later
138 	addq	$1, 8, $1	# E : Increment towards zero for alignment
139 	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)
144 	blt	$1, $alignmod64_b # U :
148 	 * $3 - number quads left to go
149 	 * $5 - target address (aligned 0mod64)
150 	 * $17 - mask of stuff to store
151 	 * Scratch registers available: $7, $2, $4, $1
152 	 * we know that we'll be taking a minimum of one trip through
153  	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	/*
	 * The write hint is issued for the address in $4.  The next hint
	 * address is then set to $5 + 128 and replaced by $7 when $2 is
	 * negative; $2 and $7 are computed by instructions not shown here.
	 */
161 	wh64	($4)		# L1 : memory subsystem write hint
166 	addq	$5, 128, $4	# E : speculative target of next wh64
173 	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
191 .align 4
194 	 * of data (where we can't use an unrolled loop and wh64)
198 	subq $3,1,$3		# E : Decrement number quads left
	/*
	 * Trailing partial quadword: the new bits in $4 are OR-ed with $2
	 * and stored at ($5).  $2 comes from an instruction not shown here;
	 * presumably it holds the old bytes that must be preserved.
	 */
211 	insqh $17,$6,$4		# U : New bits
212 	bis $2,$4,$1		# E : Put it all together
213 	stq $1,0($5)		# L : And back to memory
214 	ret $31,($26),1		# L0 :
	/*
	 * Single-quadword case (presumably the within_quad_b target; the
	 * label itself is not visible).  The old quad is loaded, its bytes
	 * from the offset of $16 upward are cleared and merged with $2, then
	 * mskql/mskqh on $6 combine the merged bytes below the offset of $6
	 * with the original bytes at and above it before the store.
	 * $2 is produced by an instruction not shown in this listing.
	 */
217 	ldq_u $1,0($16)		# L :
219 	mskql $1,$16,$4		# U : Clear old
220 	bis $2,$4,$2		# E : New result
222 	mskql $2,$6,$4		# U :
223 	mskqh $1,$6,$2		# U :
224 	bis $2,$4,$1		# E :
225 	stq_u $1,0($16)		# L :
231 	ret $31,($26),1		# L0 :
236 	 * This is the original body of code, prior to replication and
/*
 * Unsuffixed copy of the fill path (branch targets: within_one_quad, loop,
 * $bigalign, $alignmod64).
 * NOTE(review): this listing omits lines - the entry label and a number of
 * instructions are not visible - so the notes below describe only what the
 * visible instructions establish and mark everything else as an assumption.
 * No pattern-building instructions are visible ahead of the single-quadword
 * test in this section; presumably $17 already holds the full 64-bit store
 * pattern on entry - TODO confirm.
 *
 * Visible register roles:
 *   $16 - destination address used for the partial-quad loads and stores
 *   $17 - pattern to store
 *   $18 - byte count, later split into $3 (whole quads) and $18 (0..7 tail)
 *   $6  - compared against $16 for the single-quadword test; presumably the
 *         end address of the region - TODO confirm against the elided setup
 */
240 .align 4
	/*
	 * $16 ^ $6 with its low three bits cleared is zero only when both
	 * addresses fall in the same aligned quadword; that case is handled
	 * at within_one_quad.
	 */
248 	xor $16,$6,$1		# E : will complete write be within one quadword?
251 	bic $1,7,$1		# E : fit within a single quadword
252 	beq $1,within_one_quad	# U :
	/*
	 * Leading partial quadword: load the unaligned quad at $16, let mskql
	 * clear its bytes from the byte offset of $16 upward, OR in $2 and
	 * store through $5.  $2 and $5 are written by instructions that are
	 * not shown here; presumably $2 holds the positioned new bytes and $5
	 * the saved destination - TODO confirm.
	 */
259 	ldq_u $4,0($16)		# L : Fetch first partial
265 	mskql $4,$16,$4		# U : clear relevant parts of the quad
267 	bis $2,$4,$1		# E : Final bytes
270 	stq_u $1,0($5)		# L : Store result
274 .align 4
	/* Split the remaining count: $3 = $18 / 8 quads, $18 = $18 % 8 bytes. */
281 	sra $18,3,$3		# U : Number of remaining quads to write
282 	and $18,7,$18		# E : Number of trailing bytes to write
288 	 * Lifted a bunch of code from clear_user.S
291 	 * $5	A copy of $16
	/*
	 * NOTE(review): blt is taken only when $3 - 16 is negative, so the
	 * unrolled path is entered with 16 or more quads, i.e. at >= 128
	 * bytes rather than strictly > 128 as the trailing comment says.
	 * $2 is set by an instruction not shown in this view.
	 */
298 	subq	$3, 16, $4	# E : Only try to unroll if > 128 bytes
299 	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
300 	blt	$4, loop	# U :
303 	 * We know we've got at least 16 quads, minimum of one trip
	/*
	 * $1 == 0 branches straight to $bigalign.  Otherwise each pass below
	 * consumes one quad ($3 -= 1) and moves $1 up by 8, repeating while
	 * $1 is still negative.  The store performed on each pass is not
	 * visible in this listing.
	 */
311 	beq	$1, $bigalign	# U :
315 	subq	$3, 1, $3	# E : For consistency later
316 	addq	$1, 8, $1	# E : Increment towards zero for alignment
317 	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)
322 	blt	$1, $alignmod64	# U :
326 	 * $3 - number quads left to go
327 	 * $5 - target address (aligned 0mod64)
328 	 * $17 - mask of stuff to store
329 	 * Scratch registers available: $7, $2, $4, $1
330 	 * we know that we'll be taking a minimum of one trip through
331  	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	/*
	 * The write hint is issued for the address in $4.  The next hint
	 * address is then set to $5 + 128 and replaced by $7 when $2 is
	 * negative; $2 and $7 are computed by instructions not shown here.
	 */
339 	wh64	($4)		# L1 : memory subsystem write hint
344 	addq	$5, 128, $4	# E : speculative target of next wh64
351 	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
369 .align 4
372 	 * of data (where we can't use an unrolled loop and wh64)
376 	subq $3,1,$3		# E : Decrement number quads left
	/*
	 * Trailing partial quadword: the new bits in $4 are OR-ed with $2
	 * and stored at ($5).  $2 comes from an instruction not shown here;
	 * presumably it holds the old bytes that must be preserved.
	 */
389 	insqh $17,$6,$4		# U : New bits
390 	bis $2,$4,$1		# E : Put it all together
391 	stq $1,0($5)		# L : And back to memory
392 	ret $31,($26),1		# L0 :
	/*
	 * Single-quadword case (presumably the within_one_quad target; the
	 * label itself is not visible).  The old quad is loaded, its bytes
	 * from the offset of $16 upward are cleared and merged with $2, then
	 * mskql/mskqh on $6 combine the merged bytes below the offset of $6
	 * with the original bytes at and above it before the store.
	 * $2 is produced by an instruction not shown in this listing.
	 */
395 	ldq_u $1,0($16)		# L :
397 	mskql $1,$16,$4		# U : Clear old
398 	bis $2,$4,$2		# E : New result
400 	mskql $2,$6,$4		# U :
401 	mskqh $1,$6,$2		# U :
402 	bis $2,$4,$1		# E :
403 	stq_u $1,0($16)		# L :
409 	ret $31,($26),1		# L0 :
414 	 * This is a replicant of the __constant_c_memset code, rescheduled
/*
 * 16-bit-pattern fill path (branch targets in this section carry a _w
 * suffix).
 * NOTE(review): this listing omits lines - the entry label and a number of
 * instructions are not visible - so the notes below describe only what the
 * visible instructions establish and mark everything else as an assumption.
 *
 * Visible register roles:
 *   $16 - destination address used for the partial-quad loads and stores
 *   $17 - fill value; its low 16 bits are replicated into a wider pattern
 *   $18 - byte count, later split into $3 (whole quads) and $18 (0..7 tail)
 *   $6  - compared against $16 for the single-quadword test; presumably the
 *         end address of the region - TODO confirm against the elided setup
 */
	/*
	 * Place the low word of $17 in the two upper word lanes and combine
	 * them into $17.  The lower two lanes are presumably built and merged
	 * by instructions not shown here - TODO confirm.
	 */
430 	inswl	$17,4,$3	# U : 0000c1c200000000
431 	inswl	$17,6,$4	# U : c1c2000000000000
432 	xor	$16,$6,$1	# E : will complete write be within one quadword?
435 	or	$3,$4,$17	# E : c1c2c1c200000000
	/*
	 * The xor above, with its low three bits cleared here, is zero only
	 * when $16 and $6 fall in the same aligned quadword; that case is
	 * handled at within_quad_w.
	 */
436 	bic	$1,7,$1		# E : fit within a single quadword
440 	beq $1,within_quad_w	# U :
	/*
	 * Leading partial quadword: load the unaligned quad at $16, let mskql
	 * clear its bytes from the byte offset of $16 upward, OR in $2 and
	 * store through $5.  $2 and $5 are written by instructions that are
	 * not shown here; presumably $2 holds the positioned new bytes and $5
	 * the saved destination - TODO confirm.
	 */
447 	ldq_u $4,0($16)		# L : Fetch first partial
453 	mskql $4,$16,$4		# U : clear relevant parts of the quad
455 	bis $2,$4,$1		# E : Final bytes
458 	stq_u $1,0($5)		# L : Store result
462 .align 4
	/* Split the remaining count: $3 = $18 / 8 quads, $18 = $18 % 8 bytes. */
469 	sra $18,3,$3		# U : Number of remaining quads to write
470 	and $18,7,$18		# E : Number of trailing bytes to write
476 	 * Lifted a bunch of code from clear_user.S
479 	 * $5	A copy of $16
	/*
	 * NOTE(review): blt is taken only when $3 - 16 is negative, so the
	 * unrolled path is entered with 16 or more quads, i.e. at >= 128
	 * bytes rather than strictly > 128 as the trailing comment says.
	 * $2 is set by an instruction not shown in this view.
	 */
486 	subq	$3, 16, $4	# E : Only try to unroll if > 128 bytes
487 	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
488 	blt	$4, loop_w	# U :
491 	 * We know we've got at least 16 quads, minimum of one trip
	/*
	 * $1 == 0 branches straight to $bigalign_w.  Otherwise each pass
	 * below consumes one quad ($3 -= 1) and moves $1 up by 8, repeating
	 * while $1 is still negative.  The store performed on each pass is
	 * not visible in this listing.
	 */
499 	beq	$1, $bigalign_w	# U :
503 	subq	$3, 1, $3	# E : For consistency later
504 	addq	$1, 8, $1	# E : Increment towards zero for alignment
505 	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)
510 	blt	$1, $alignmod64_w	# U :
514 	 * $3 - number quads left to go
515 	 * $5 - target address (aligned 0mod64)
516 	 * $17 - mask of stuff to store
517 	 * Scratch registers available: $7, $2, $4, $1
518 	 * we know that we'll be taking a minimum of one trip through
519  	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	/*
	 * The write hint is issued for the address in $4.  The next hint
	 * address is then set to $5 + 128 and replaced by $7 when $2 is
	 * negative; $2 and $7 are computed by instructions not shown here.
	 */
527 	wh64	($4)		# L1 : memory subsystem write hint
532 	addq	$5, 128, $4	# E : speculative target of next wh64
539 	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
557 .align 4
560 	 * of data (where we can't use an unrolled loop and wh64)
564 	subq $3,1,$3		# E : Decrement number quads left
	/*
	 * Trailing partial quadword: the new bits in $4 are OR-ed with $2
	 * and stored at ($5).  $2 comes from an instruction not shown here;
	 * presumably it holds the old bytes that must be preserved.
	 */
577 	insqh $17,$6,$4		# U : New bits
578 	bis $2,$4,$1		# E : Put it all together
579 	stq $1,0($5)		# L : And back to memory
580 	ret $31,($26),1		# L0 :
	/*
	 * Single-quadword case (presumably the within_quad_w target; the
	 * label itself is not visible).  The old quad is loaded, its bytes
	 * from the offset of $16 upward are cleared and merged with $2, then
	 * mskql/mskqh on $6 combine the merged bytes below the offset of $6
	 * with the original bytes at and above it before the store.
	 * $2 is produced by an instruction not shown in this listing.
	 */
583 	ldq_u $1,0($16)		# L :
585 	mskql $1,$16,$4		# U : Clear old
586 	bis $2,$4,$2		# E : New result
588 	mskql $2,$6,$4		# U :
589 	mskqh $1,$6,$2		# U :
590 	bis $2,$4,$1		# E :
591 	stq_u $1,0($16)		# L :
597 	ret $31,($26),1		# L0 :