xtensa/lib/memcopy.S

2  * arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions
9  * Copyright (C) 2002 - 2012 Tensilica Inc.
24  * 32-bit load and store instructions (as required for these
39  *   This code tries to use fall-through branches for the common
64 	.byte	0		# 1 mod 4 alignment for LOOPNEZ
65 				# (0 mod 4 alignment for LBEG)
95 	addi	a4, a4, -1
100 .Ldst2mod4:	# dst 16-bit aligned
106 	addi	a4, a4, -2
119 	_bbsi.l	a2, 0, .Ldst1mod2	# if dst is 1 mod 2
120 	_bbsi.l	a2, 1, .Ldst2mod4	# if dst is 2 mod 4
121 .Ldstaligned:	# return here from .Ldst?mod? once dst is aligned
122 	srli	a7, a4, 4	# number of loop iterations with 16B
127 	 * Destination and source are word-aligned, use word copy.
129 	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
135 	add	a8, a8, a3	# a8 = end of last 16B source chunk
143 	l32i	a7, a3, 12
146 	s32i	a7, a5, 12
194 	_beqz	a4, .Ldone	# avoid loading anything for zero-length copies
195 	# copy 16 bytes per iteration for word-aligned dst and unaligned src
211 	add	a10, a10, a3	# a10 = end of last 16B source chunk
218 	l32i	a9, a3, 12
226 	s32i	a9, a5, 12
287  * 32-bit load and store instructions (as required for these
317 	.byte	0		# 1 mod 4 alignment for LOOPNEZ
318 				# (0 mod 4 alignment for LBEG)
327 	addi	a3, a3, -1
329 	addi	a5, a5, -1
347 	addi	a3, a3, -1
349 	addi	a5, a5, -1
351 	addi	a4, a4, -1
354 .Lbackdst2mod4:	# dst 16-bit aligned
357 	addi	a3, a3, -2
360 	addi	a5, a5, -2
363 	addi	a4, a4, -2
380 	_bbsi.l	a5, 0, .Lbackdst1mod2	# if dst is 1 mod 2
381 	_bbsi.l	a5, 1, .Lbackdst2mod4	# if dst is 2 mod 4
382 .Lbackdstaligned:	# return here from .Lbackdst?mod? once dst is aligned
383 	srli	a7, a4, 4	# number of loop iterations with 16B
388 	 * Destination and source are word-aligned, use word copy.
390 	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
396 	sub	a8, a3, a8	# a8 = start of first 16B source chunk
399 	addi	a3, a3, -16
400 	l32i	a7, a3, 12
402 	addi	a5, a5, -16
403 	s32i	a7, a5, 12
415 	addi	a3, a3, -8
418 	addi	a5, a5, -8
428 	addi	a3, a3, -4
430 	addi	a5, a5, -4
437 	addi	a3, a3, -2
439 	addi	a5, a5, -2
445 	addi	a3, a3, -1
447 	addi	a5, a5, -1
457 	_beqz	a4, .Lbackdone	# avoid loading anything for zero-length copies
458 	# copy 16 bytes per iteration for word-aligned dst and unaligned src
473 	sub	a10, a3, a10	# a10 = start of first 16B source chunk
476 	addi	a3, a3, -16
477 	l32i	a7, a3, 12
479 	addi	a5, a5, -16
481 	s32i	a6, a5, 12
496 	addi	a3, a3, -8
499 	addi	a5, a5, -8
508 	addi	a3, a3, -4
510 	addi	a5, a5, -4
524 	addi	a3, a3, -2
527 	addi	a5, a5, -2
534 	addi	a3, a3, -1
535 	addi	a5, a5, -1