/* libc/arch-mips/string/memcpy.S - platform/bionic - Git at Google */

 /*
  * Copyright (c) 2009
  *      MIPS Technologies, Inc., California.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
  *    contributors may be used to endorse or promote products derived from
  *    this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */

 /************************************************************************
  *
  *  memcpy.S
  *  Version: "043009"
  *
  ************************************************************************/


 /************************************************************************
  *  Include files
  ************************************************************************/

 #include "machine/asm.h"


 /*
  * This routine could be optimized for MIPS64. The current code only
  * uses MIPS32 instructions.
  */
 /*
  * LWHI/SWHI and LWLO/SWLO name the lwl/lwr and swl/swr partial-word
  * instructions by role, so the copy code below is written once and the
  * left/right variant is selected by the target endianness.
  */
 #if defined(__MIPSEB__)
 #  define LWHI	lwl		/* high part is left in big-endian	*/
 #  define SWHI	swl		/* high part is left in big-endian	*/
 #  define LWLO	lwr		/* low part is right in big-endian	*/
 #  define SWLO	swr		/* low part is right in big-endian	*/
 #endif

 #if defined(__MIPSEL__)
 #  define LWHI	lwr		/* high part is right in little-endian	*/
 #  define SWHI	swr		/* high part is right in little-endian	*/
 #  define LWLO	lwl		/* low part is left in little-endian	*/
 #  define SWLO	swl		/* low part is left in little-endian	*/
 #endif

 /*
  * memcpy - copy a2 bytes from a1 (src) to a0 (dst).
  *
  * In:    a0 = dst, a1 = src, a2 = byte count
  * Out:   v0 = original dst (set at .Lmemcpy).  When the two regions
  *        overlap, control is handed to memmove instead and v0 is whatever
  *        memmove leaves there.
  * Uses:  AT, v1, a3 and t0-t9 as scratch; a0, a1 and a2 are advanced or
  *        consumed while copying.
  *
  * Layout:
  *   - overlap check             -> jump to memmove
  *   - fewer than 8 bytes        -> byte loop at .Llast8
  *   - src and dst have the same low two address bits
  *                               -> aligned path (.Lchk16w, .Lchk8w,
  *                                  .Lchk1w) using lw/sw
  *   - otherwise                 -> .Lunaligned path: dst is word-aligned
  *                                  first, src is read with LWHI/LWLO pairs
  *
  * The body is assembled under ".set noreorder", so the instruction that
  * follows each branch or jump is its delay slot and is executed whether or
  * not the branch is taken.  Delay slots are normally marked here by one
  * extra leading space.
  */
 LEAF(memcpy,0)

 	.set	noreorder
 	.set	noat
 /*
  * Below we handle the case where memcpy is called with overlapping src and dst.
  * Although memcpy is not required to handle this case, some parts of Android like Skia
  * rely on such usage. We call memmove to handle such cases.
  */
 	subu	t0,a0,a1	# t0 = dst - src
 	sra	AT,t0,31	# AT = 0 when t0 >= 0, -1 when t0 < 0 (sign mask)
 	xor	t1,t0,AT
 	subu	t0,t1,AT	# t0 = |dst - src|
 	sltu	AT,t0,a2	# AT = 1 when |dst - src| < byte count (overlap)
 	beq	AT,zero,.Lmemcpy
 	 la	t9,memmove	# delay slot. NOTE(review): la is an assembler macro;
 				# confirm it expands to a single instruction here
 	jr	t9		# overlap: jump to memmove, a0-a2 are still untouched
 	 nop
 .Lmemcpy:
 	slti	AT,a2,8		# signed compare: AT = 1 when fewer than 8 bytes
 	bne	AT,zero,.Llast8
 	 move	v0,a0	# memcpy returns the dst pointer

 # Test if the src and dst are word-aligned, or can be made word-aligned
 	xor	t8,a1,a0
 	andi	t8,t8,0x3		# t8 is a0/a1 word-displacement

 	bne	t8,zero,.Lunaligned
 	 negu	a3,a0		# delay slot: a3 = -dst, consumed on both paths

 	andi	a3,a3,0x3	# we need to copy a3 bytes to make a0/a1 aligned
 	beq	a3,zero,.Lchk16w # when a3=0 then the dst (a0) is word-aligned
 	 subu	a2,a2,a3	# now a2 is the remaining bytes count

 	LWHI	t8,0(a1)	# fetch the src bytes up to the next word boundary
 	addu	a1,a1,a3
 	SWHI	t8,0(a0)	# store those a3 bytes to dst
 	addu	a0,a0,a3

 # Now the dst/src are mutually word-aligned with word-aligned addresses
 .Lchk16w:
 	andi	t8,a2,0x3f	# any whole 64-byte chunks?
 				# t8 is the byte count after 64-byte chunks

 	beq	a2,t8,.Lchk8w	# if a2==t8, no 64-byte chunks
 				# There will be at most 1 32-byte chunk after it
 	 subu	a3,a2,t8	# subtract from a2 the remainder
                                 # Here a3 counts bytes in 16w chunks
 	addu	a3,a0,a3	# Now a3 is the final dst after 64-byte chunks

 	addu	t0,a0,a2	# t0 is the "past the end" address

 # When in the loop we exercise "pref 30,x(a0)", the a0+x should not be past
 # the "t0-32" address
 # This means: for x=128 the last "safe" a0 address is "t0-160"
 # Alternatively, for x=64 the last "safe" a0 address is "t0-96"
 # In the current version we will use "pref 30,128(a0)", so "t0-160" is the limit
 	subu	t9,t0,160	# t9 is the "last safe pref 30,128(a0)" address

 	pref    0,0(a1)		# bring the first line of src, addr 0
 	pref    0,32(a1)	# bring the second line of src, addr 32
 	pref    0,64(a1)	# bring the third line of src, addr 64
 	pref	30,32(a0)	# safe, as we have at least 64 bytes ahead
 # In case the a0 > t9 don't use "pref 30" at all
 	sgtu	v1,a0,t9	# v1 = 1 when dst is already past the safe limit
 	bgtz	v1,.Lloop16w	# skip "pref 30,64(a0)" for too short arrays
 	 nop
 # otherwise, start with using pref30
 	pref	30,64(a0)
 /* Main aligned loop: one pass moves 64 bytes as two 8-word halves. */
 .Lloop16w:
 	pref	0,96(a1)
 	lw	t0,0(a1)
 	bgtz	v1,.Lskip_pref30_96	# skip "pref 30,96(a0)"
 	 lw	t1,4(a1)
 	pref    30,96(a0)   # continue setting up the dest, addr 96
 .Lskip_pref30_96:
 	lw	t2,8(a1)
 	lw	t3,12(a1)
 	lw	t4,16(a1)
 	lw	t5,20(a1)
 	lw	t6,24(a1)
 	lw	t7,28(a1)
         pref    0,128(a1)    # bring the next lines of src, addr 128

 	sw	t0,0(a0)
 	sw	t1,4(a0)
 	sw	t2,8(a0)
 	sw	t3,12(a0)
 	sw	t4,16(a0)
 	sw	t5,20(a0)
 	sw	t6,24(a0)
 	sw	t7,28(a0)

 	lw	t0,32(a1)
 	bgtz	v1,.Lskip_pref30_128	# skip "pref 30,128(a0)"
 	 lw	t1,36(a1)
 	pref    30,128(a0)   # continue setting up the dest, addr 128
 .Lskip_pref30_128:
 	lw	t2,40(a1)
 	lw	t3,44(a1)
 	lw	t4,48(a1)
 	lw	t5,52(a1)
 	lw	t6,56(a1)
 	lw	t7,60(a1)
         pref    0, 160(a1)    # bring the next lines of src, addr 160

 	sw	t0,32(a0)
 	sw	t1,36(a0)
 	sw	t2,40(a0)
 	sw	t3,44(a0)
 	sw	t4,48(a0)
 	sw	t5,52(a0)
 	sw	t6,56(a0)
 	sw	t7,60(a0)

 	addiu	a0,a0,64	# adding 64 to dest
 	sgtu	v1,a0,t9	# refresh the "past the safe pref 30 limit" flag
 	bne	a0,a3,.Lloop16w
 	 addiu	a1,a1,64	# adding 64 to src
 	move	a2,t8		# a2 = bytes left after the 64-byte chunks

 # Here we have src and dest word-aligned but less than 64-bytes to go

 .Lchk8w:
 	pref 0, 0x0(a1)
 	andi	t8,a2,0x1f	# is there a 32-byte chunk?
 				# the t8 is the remainder count past 32-bytes
 	beq	a2,t8,.Lchk1w	# when a2=t8, no 32-byte chunk
 	 nop

 	lw	t0,0(a1)
 	lw	t1,4(a1)
 	lw	t2,8(a1)
 	lw	t3,12(a1)
 	lw	t4,16(a1)
 	lw	t5,20(a1)
 	lw	t6,24(a1)
 	lw	t7,28(a1)
 	addiu	a1,a1,32

 	sw	t0,0(a0)
 	sw	t1,4(a0)
 	sw	t2,8(a0)
 	sw	t3,12(a0)
 	sw	t4,16(a0)
 	sw	t5,20(a0)
 	sw	t6,24(a0)
 	sw	t7,28(a0)
 	addiu	a0,a0,32

 .Lchk1w:
 	andi	a2,t8,0x3	# now a2 is the remainder past 1w chunks
 	beq	a2,t8,.Llast8	# no whole word left: finish byte by byte
 	 subu	a3,t8,a2	# a3 is count of bytes in 1w chunks
 	addu	a3,a0,a3	# now a3 is the dst address past the 1w chunks

 # copying in words (4-byte chunks)
 .LwordCopy_loop:
 	lw	t3,0(a1)	# the first t3 may be equal t0 ... optimize?
 	addiu	a1,a1,4
 	addiu	a0,a0,4
 	bne	a0,a3,.LwordCopy_loop
 	 sw	t3,-4(a0)	# delay slot: a0 was already advanced, hence -4

 # For the last (<8) bytes
 .Llast8:
 	blez	a2,.Lleave	# nothing left to copy (a2 <= 0, signed test)
 	 addu	a3,a0,a2	# a3 is one past the last dst byte
 .Llast8loop:
 	lb	v1,0(a1)
 	addiu	a1,a1,1
 	addiu	a0,a0,1
 	bne	a0,a3,.Llast8loop
 	 sb	v1,-1(a0)	# delay slot: a0 was already advanced, hence -1

 .Lleave:
 	j	ra
 	 nop

 #
 # UNALIGNED case
 #

 .Lunaligned:
 	# got here with a3="negu a0"
 	andi	a3,a3,0x3	# test if the a0 is word aligned
 	beqz	a3,.Lua_chk16w
 	 subu	a2,a2,a3	# bytes left after initial a3 bytes

 	LWHI	v1,0(a1)	# LWHI+LWLO together read the 4 bytes at a1..a1+3
 	LWLO	v1,3(a1)
 	addu	a1,a1,a3	# a3 may be here 1, 2 or 3
 	SWHI	v1,0(a0)	# store only the a3 bytes up to the dst word boundary
 	addu	a0,a0,a3	# below the dst will be word aligned (NOTE1)

 .Lua_chk16w:
 	andi	t8,a2,0x3f	# any whole 64-byte chunks?
 				# t8 is the byte count after 64-byte chunks
 	beq	a2,t8,.Lua_chk8w # if a2==t8, no 64-byte chunks
 				# There will be at most 1 32-byte chunk after it
 	 subu	a3,a2,t8	# subtract from a2 the remainder
                                 # Here a3 counts bytes in 16w chunks
 	addu	a3,a0,a3	# Now a3 is the final dst after 64-byte chunks

 	addu	t0,a0,a2	# t0 is the "past the end" address

 	subu	t9,t0,160	# t9 is the "last safe pref 30,128(a0)" address

 	pref    0,0(a1)		# bring the first line of src, addr 0
 	pref    0,32(a1)	# bring the second line of src, addr 32
 	pref    0,64(a1)	# bring the third line of src, addr 64
 	pref	30,32(a0)	# safe, as we have at least 64 bytes ahead
 # In case the a0 > t9 don't use "pref 30" at all
 	sgtu	v1,a0,t9	# v1 = 1 when dst is already past the safe limit
 	bgtz	v1,.Lua_loop16w	# skip "pref 30,64(a0)" for too short arrays
 	 nop
 # otherwise, start with using pref30
 	pref	30,64(a0)
 /*
  * Main unaligned loop: same shape as .Lloop16w, but every source word is
  * assembled from an LWHI/LWLO pair; the stores are plain sw because dst
  * is word-aligned here (NOTE1).
  */
 .Lua_loop16w:
 	pref	0,96(a1)
 	LWHI	t0,0(a1)
 	LWLO	t0,3(a1)
 	LWHI	t1,4(a1)
 	bgtz	v1,.Lua_skip_pref30_96
 	 LWLO	t1,7(a1)
 	pref    30,96(a0)   # continue setting up the dest, addr 96
 .Lua_skip_pref30_96:
 	LWHI	t2,8(a1)
 	LWLO	t2,11(a1)
 	LWHI	t3,12(a1)
 	LWLO	t3,15(a1)
 	LWHI	t4,16(a1)
 	LWLO	t4,19(a1)
 	LWHI	t5,20(a1)
 	LWLO	t5,23(a1)
 	LWHI	t6,24(a1)
 	LWLO	t6,27(a1)
 	LWHI	t7,28(a1)
 	LWLO	t7,31(a1)
         pref    0,128(a1)    # bring the next lines of src, addr 128

 	sw	t0,0(a0)
 	sw	t1,4(a0)
 	sw	t2,8(a0)
 	sw	t3,12(a0)
 	sw	t4,16(a0)
 	sw	t5,20(a0)
 	sw	t6,24(a0)
 	sw	t7,28(a0)

 	LWHI	t0,32(a1)
 	LWLO	t0,35(a1)
 	LWHI	t1,36(a1)
 	bgtz	v1,.Lua_skip_pref30_128
 	LWLO	t1,39(a1)	# delay slot (not marked by the usual extra space)
 	pref    30,128(a0)   # continue setting up the dest, addr 128
 .Lua_skip_pref30_128:
 	LWHI	t2,40(a1)
 	LWLO	t2,43(a1)
 	LWHI	t3,44(a1)
 	LWLO	t3,47(a1)
 	LWHI	t4,48(a1)
 	LWLO	t4,51(a1)
 	LWHI	t5,52(a1)
 	LWLO	t5,55(a1)
 	LWHI	t6,56(a1)
 	LWLO	t6,59(a1)
 	LWHI	t7,60(a1)
 	LWLO	t7,63(a1)
         pref    0, 160(a1)    # bring the next lines of src, addr 160

 	sw	t0,32(a0)
 	sw	t1,36(a0)
 	sw	t2,40(a0)
 	sw	t3,44(a0)
 	sw	t4,48(a0)
 	sw	t5,52(a0)
 	sw	t6,56(a0)
 	sw	t7,60(a0)

 	addiu	a0,a0,64	# adding 64 to dest
 	sgtu	v1,a0,t9	# refresh the "past the safe pref 30 limit" flag
 	bne	a0,a3,.Lua_loop16w
 	 addiu	a1,a1,64	# adding 64 to src
 	move	a2,t8		# a2 = bytes left after the 64-byte chunks

 # Here dst is word-aligned (src is not) and less than 64 bytes are left to go

 .Lua_chk8w:
 	pref 0, 0x0(a1)
 	andi	t8,a2,0x1f	# is there a 32-byte chunk?
 				# the t8 is the remainder count
 	beq	a2,t8,.Lua_chk1w # when a2=t8, no 32-byte chunk
 	 nop

 	LWHI	t0,0(a1)
 	LWLO	t0,3(a1)
 	LWHI	t1,4(a1)
 	LWLO	t1,7(a1)
 	LWHI	t2,8(a1)
 	LWLO	t2,11(a1)
 	LWHI	t3,12(a1)
 	LWLO	t3,15(a1)
 	LWHI	t4,16(a1)
 	LWLO	t4,19(a1)
 	LWHI	t5,20(a1)
 	LWLO	t5,23(a1)
 	LWHI	t6,24(a1)
 	LWLO	t6,27(a1)
 	LWHI	t7,28(a1)
 	LWLO	t7,31(a1)
 	addiu	a1,a1,32

 	sw	t0,0(a0)
 	sw	t1,4(a0)
 	sw	t2,8(a0)
 	sw	t3,12(a0)
 	sw	t4,16(a0)
 	sw	t5,20(a0)
 	sw	t6,24(a0)
 	sw	t7,28(a0)
 	addiu	a0,a0,32

 .Lua_chk1w:
 	andi	a2,t8,0x3	# now a2 is the remainder past 1w chunks
 	beq	a2,t8,.Lua_smallCopy
 	 subu	a3,t8,a2	# a3 is count of bytes in 1w chunks
 	addu	a3,a0,a3	# now a3 is the dst address past the 1w chunks

 # copying in words (4-byte chunks)
 .Lua_wordCopy_loop:
 	LWHI	v1,0(a1)
 	LWLO	v1,3(a1)
 	addiu	a1,a1,4
 	addiu	a0,a0,4		# note: dst=a0 is word aligned here, see NOTE1
 	bne	a0,a3,.Lua_wordCopy_loop
 	 sw	v1,-4(a0)	# delay slot: a0 was already advanced, hence -4

 # Now less than 4 bytes (value in a2) left to copy
 .Lua_smallCopy:
 	beqz	a2,.Lleave
 	addu	a3,a0,a2	# delay slot (not marked by the usual extra space):
 				# a3 is one past the last dst byte
 .Lua_smallCopy_loop:
 	lb	v1,0(a1)
 	addiu	a1,a1,1
 	addiu	a0,a0,1
 	bne	a0,a3,.Lua_smallCopy_loop
 	 sb	v1,-1(a0)	# delay slot: a0 was already advanced, hence -1

 	j	ra		# same return sequence as .Lleave
 	 nop

 	.set	at
 	.set	reorder

 END(memcpy)


 /************************************************************************
  *  Implementation : Static functions
  ************************************************************************/
	/*
	* Copyright (c) 2009
	* MIPS Technologies, Inc., California.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
	* contributors may be used to endorse or promote products derived from
	* this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	* ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	* SUCH DAMAGE.
	*/

	/************************************************************************
	*
	* memcpy.S
	* Version: "043009"
	*
	************************************************************************/


	/************************************************************************
	* Include files
	************************************************************************/

	#include "machine/asm.h"


	/*
	* This routine could be optimized for MIPS64. The current code only
	* uses MIPS32 instructions.
	*/
	/*
	* LWHI/SWHI and LWLO/SWLO name the lwl/lwr and swl/swr partial-word
	* instructions by role, so the copy code below is written once and the
	* left/right variant is selected by the target endianness.
	*/
	#if defined(__MIPSEB__)
	# define LWHI lwl /* high part is left in big-endian */
	# define SWHI swl /* high part is left in big-endian */
	# define LWLO lwr /* low part is right in big-endian */
	# define SWLO swr /* low part is right in big-endian */
	#endif

	#if defined(__MIPSEL__)
	# define LWHI lwr /* high part is right in little-endian */
	# define SWHI swr /* high part is right in little-endian */
	# define LWLO lwl /* low part is left in little-endian */
	# define SWLO swl /* low part is left in little-endian */
	#endif

	/*
	* NOTE(review): this LEAF(memcpy,0) ... END(memcpy) body repeats, instruction
	* for instruction, the memcpy definition that appears earlier in this file;
	* only the whitespace differs. Assembling both copies together would define
	* memcpy and every .L label twice - confirm which copy is meant to stay.
	*
	* memcpy - copy a2 bytes from a1 (src) to a0 (dst).
	*
	* In:   a0 = dst, a1 = src, a2 = byte count
	* Out:  v0 = original dst (set at .Lmemcpy). When the two regions overlap,
	*       control is handed to memmove instead and v0 is whatever memmove
	*       leaves there.
	* Uses: AT, v1, a3 and t0-t9 as scratch; a0, a1 and a2 are advanced or
	*       consumed while copying.
	*
	* Layout:
	*   - overlap check      -> jump to memmove
	*   - fewer than 8 bytes -> byte loop at .Llast8
	*   - src and dst have the same low two address bits
	*                        -> aligned path (.Lchk16w, .Lchk8w, .Lchk1w), lw/sw
	*   - otherwise          -> .Lunaligned path: dst is word-aligned first,
	*                           src is read with LWHI/LWLO pairs
	*
	* The body is assembled under ".set noreorder", so the instruction that
	* follows each branch or jump is its delay slot and is executed whether or
	* not the branch is taken. Delay slots are tagged "(delay slot)" below.
	*/
	LEAF(memcpy,0)

	.set noreorder
	.set noat
	/*
	* Below we handle the case where memcpy is called with overlapping src and dst.
	* Although memcpy is not required to handle this case, some parts of Android like Skia
	* rely on such usage. We call memmove to handle such cases.
	*/
	subu t0,a0,a1 # t0 = dst - src
	sra AT,t0,31 # AT = 0 when t0 >= 0, -1 when t0 < 0 (sign mask)
	xor t1,t0,AT
	subu t0,t1,AT # t0 = |dst - src|
	sltu AT,t0,a2 # AT = 1 when |dst - src| < byte count (overlap)
	beq AT,zero,.Lmemcpy
	la t9,memmove # (delay slot) NOTE(review): la is an assembler macro; confirm it expands to a single instruction here
	jr t9 # overlap: jump to memmove, a0-a2 are still untouched
	nop # (delay slot)
	.Lmemcpy:
	slti AT,a2,8 # signed compare: AT = 1 when fewer than 8 bytes
	bne AT,zero,.Llast8
	move v0,a0 # (delay slot) memcpy returns the dst pointer

	# Test if the src and dst are word-aligned, or can be made word-aligned
	xor t8,a1,a0
	andi t8,t8,0x3 # t8 is a0/a1 word-displacement

	bne t8,zero,.Lunaligned
	negu a3,a0 # (delay slot) a3 = -dst, consumed on both paths

	andi a3,a3,0x3 # we need to copy a3 bytes to make a0/a1 aligned
	beq a3,zero,.Lchk16w # when a3=0 then the dst (a0) is word-aligned
	subu a2,a2,a3 # (delay slot) now a2 is the remaining bytes count

	LWHI t8,0(a1) # fetch the src bytes up to the next word boundary
	addu a1,a1,a3
	SWHI t8,0(a0) # store those a3 bytes to dst
	addu a0,a0,a3

	# Now the dst/src are mutually word-aligned with word-aligned addresses
	.Lchk16w:
	andi t8,a2,0x3f # any whole 64-byte chunks?
	# t8 is the byte count after 64-byte chunks

	beq a2,t8,.Lchk8w # if a2==t8, no 64-byte chunks
	# There will be at most 1 32-byte chunk after it
	subu a3,a2,t8 # (delay slot) subtract from a2 the remainder
	# Here a3 counts bytes in 16w chunks
	addu a3,a0,a3 # Now a3 is the final dst after 64-byte chunks

	addu t0,a0,a2 # t0 is the "past the end" address

	# When in the loop we exercise "pref 30,x(a0)", the a0+x should not be past
	# the "t0-32" address
	# This means: for x=128 the last "safe" a0 address is "t0-160"
	# Alternatively, for x=64 the last "safe" a0 address is "t0-96"
	# In the current version we will use "pref 30,128(a0)", so "t0-160" is the limit
	subu t9,t0,160 # t9 is the "last safe pref 30,128(a0)" address

	pref 0,0(a1) # bring the first line of src, addr 0
	pref 0,32(a1) # bring the second line of src, addr 32
	pref 0,64(a1) # bring the third line of src, addr 64
	pref 30,32(a0) # safe, as we have at least 64 bytes ahead
	# In case the a0 > t9 don't use "pref 30" at all
	sgtu v1,a0,t9 # v1 = 1 when dst is already past the safe limit
	bgtz v1,.Lloop16w # skip "pref 30,64(a0)" for too short arrays
	nop # (delay slot)
	# otherwise, start with using pref30
	pref 30,64(a0)
	/* Main aligned loop: one pass moves 64 bytes as two 8-word halves. */
	.Lloop16w:
	pref 0,96(a1)
	lw t0,0(a1)
	bgtz v1,.Lskip_pref30_96 # skip "pref 30,96(a0)"
	lw t1,4(a1) # (delay slot)
	pref 30,96(a0) # continue setting up the dest, addr 96
	.Lskip_pref30_96:
	lw t2,8(a1)
	lw t3,12(a1)
	lw t4,16(a1)
	lw t5,20(a1)
	lw t6,24(a1)
	lw t7,28(a1)
	pref 0,128(a1) # bring the next lines of src, addr 128

	sw t0,0(a0)
	sw t1,4(a0)
	sw t2,8(a0)
	sw t3,12(a0)
	sw t4,16(a0)
	sw t5,20(a0)
	sw t6,24(a0)
	sw t7,28(a0)

	lw t0,32(a1)
	bgtz v1,.Lskip_pref30_128 # skip "pref 30,128(a0)"
	lw t1,36(a1) # (delay slot)
	pref 30,128(a0) # continue setting up the dest, addr 128
	.Lskip_pref30_128:
	lw t2,40(a1)
	lw t3,44(a1)
	lw t4,48(a1)
	lw t5,52(a1)
	lw t6,56(a1)
	lw t7,60(a1)
	pref 0, 160(a1) # bring the next lines of src, addr 160

	sw t0,32(a0)
	sw t1,36(a0)
	sw t2,40(a0)
	sw t3,44(a0)
	sw t4,48(a0)
	sw t5,52(a0)
	sw t6,56(a0)
	sw t7,60(a0)

	addiu a0,a0,64 # adding 64 to dest
	sgtu v1,a0,t9 # refresh the "past the safe pref 30 limit" flag
	bne a0,a3,.Lloop16w
	addiu a1,a1,64 # (delay slot) adding 64 to src
	move a2,t8 # a2 = bytes left after the 64-byte chunks

	# Here we have src and dest word-aligned but less than 64-bytes to go

	.Lchk8w:
	pref 0, 0x0(a1)
	andi t8,a2,0x1f # is there a 32-byte chunk?
	# the t8 is the remainder count past 32-bytes
	beq a2,t8,.Lchk1w # when a2=t8, no 32-byte chunk
	nop # (delay slot)

	lw t0,0(a1)
	lw t1,4(a1)
	lw t2,8(a1)
	lw t3,12(a1)
	lw t4,16(a1)
	lw t5,20(a1)
	lw t6,24(a1)
	lw t7,28(a1)
	addiu a1,a1,32

	sw t0,0(a0)
	sw t1,4(a0)
	sw t2,8(a0)
	sw t3,12(a0)
	sw t4,16(a0)
	sw t5,20(a0)
	sw t6,24(a0)
	sw t7,28(a0)
	addiu a0,a0,32

	.Lchk1w:
	andi a2,t8,0x3 # now a2 is the remainder past 1w chunks
	beq a2,t8,.Llast8 # no whole word left: finish byte by byte
	subu a3,t8,a2 # (delay slot) a3 is count of bytes in 1w chunks
	addu a3,a0,a3 # now a3 is the dst address past the 1w chunks

	# copying in words (4-byte chunks)
	.LwordCopy_loop:
	lw t3,0(a1) # the first t3 may be equal t0 ... optimize?
	addiu a1,a1,4
	addiu a0,a0,4
	bne a0,a3,.LwordCopy_loop
	sw t3,-4(a0) # (delay slot) a0 was already advanced, hence -4

	# For the last (<8) bytes
	.Llast8:
	blez a2,.Lleave # nothing left to copy (a2 <= 0, signed test)
	addu a3,a0,a2 # (delay slot) a3 is one past the last dst byte
	.Llast8loop:
	lb v1,0(a1)
	addiu a1,a1,1
	addiu a0,a0,1
	bne a0,a3,.Llast8loop
	sb v1,-1(a0) # (delay slot) a0 was already advanced, hence -1

	.Lleave:
	j ra
	nop # (delay slot)

	#
	# UNALIGNED case
	#

	.Lunaligned:
	# got here with a3="negu a0"
	andi a3,a3,0x3 # test if the a0 is word aligned
	beqz a3,.Lua_chk16w
	subu a2,a2,a3 # (delay slot) bytes left after initial a3 bytes

	LWHI v1,0(a1) # LWHI+LWLO together read the 4 bytes at a1..a1+3
	LWLO v1,3(a1)
	addu a1,a1,a3 # a3 may be here 1, 2 or 3
	SWHI v1,0(a0) # store only the a3 bytes up to the dst word boundary
	addu a0,a0,a3 # below the dst will be word aligned (NOTE1)

	.Lua_chk16w:
	andi t8,a2,0x3f # any whole 64-byte chunks?
	# t8 is the byte count after 64-byte chunks
	beq a2,t8,.Lua_chk8w # if a2==t8, no 64-byte chunks
	# There will be at most 1 32-byte chunk after it
	subu a3,a2,t8 # (delay slot) subtract from a2 the remainder
	# Here a3 counts bytes in 16w chunks
	addu a3,a0,a3 # Now a3 is the final dst after 64-byte chunks

	addu t0,a0,a2 # t0 is the "past the end" address

	subu t9,t0,160 # t9 is the "last safe pref 30,128(a0)" address

	pref 0,0(a1) # bring the first line of src, addr 0
	pref 0,32(a1) # bring the second line of src, addr 32
	pref 0,64(a1) # bring the third line of src, addr 64
	pref 30,32(a0) # safe, as we have at least 64 bytes ahead
	# In case the a0 > t9 don't use "pref 30" at all
	sgtu v1,a0,t9 # v1 = 1 when dst is already past the safe limit
	bgtz v1,.Lua_loop16w # skip "pref 30,64(a0)" for too short arrays
	nop # (delay slot)
	# otherwise, start with using pref30
	pref 30,64(a0)
	/*
	* Main unaligned loop: same shape as .Lloop16w, but every source word is
	* assembled from an LWHI/LWLO pair; the stores are plain sw because dst
	* is word-aligned here (NOTE1).
	*/
	.Lua_loop16w:
	pref 0,96(a1)
	LWHI t0,0(a1)
	LWLO t0,3(a1)
	LWHI t1,4(a1)
	bgtz v1,.Lua_skip_pref30_96
	LWLO t1,7(a1) # (delay slot)
	pref 30,96(a0) # continue setting up the dest, addr 96
	.Lua_skip_pref30_96:
	LWHI t2,8(a1)
	LWLO t2,11(a1)
	LWHI t3,12(a1)
	LWLO t3,15(a1)
	LWHI t4,16(a1)
	LWLO t4,19(a1)
	LWHI t5,20(a1)
	LWLO t5,23(a1)
	LWHI t6,24(a1)
	LWLO t6,27(a1)
	LWHI t7,28(a1)
	LWLO t7,31(a1)
	pref 0,128(a1) # bring the next lines of src, addr 128

	sw t0,0(a0)
	sw t1,4(a0)
	sw t2,8(a0)
	sw t3,12(a0)
	sw t4,16(a0)
	sw t5,20(a0)
	sw t6,24(a0)
	sw t7,28(a0)

	LWHI t0,32(a1)
	LWLO t0,35(a1)
	LWHI t1,36(a1)
	bgtz v1,.Lua_skip_pref30_128
	LWLO t1,39(a1) # (delay slot)
	pref 30,128(a0) # continue setting up the dest, addr 128
	.Lua_skip_pref30_128:
	LWHI t2,40(a1)
	LWLO t2,43(a1)
	LWHI t3,44(a1)
	LWLO t3,47(a1)
	LWHI t4,48(a1)
	LWLO t4,51(a1)
	LWHI t5,52(a1)
	LWLO t5,55(a1)
	LWHI t6,56(a1)
	LWLO t6,59(a1)
	LWHI t7,60(a1)
	LWLO t7,63(a1)
	pref 0, 160(a1) # bring the next lines of src, addr 160

	sw t0,32(a0)
	sw t1,36(a0)
	sw t2,40(a0)
	sw t3,44(a0)
	sw t4,48(a0)
	sw t5,52(a0)
	sw t6,56(a0)
	sw t7,60(a0)

	addiu a0,a0,64 # adding 64 to dest
	sgtu v1,a0,t9 # refresh the "past the safe pref 30 limit" flag
	bne a0,a3,.Lua_loop16w
	addiu a1,a1,64 # (delay slot) adding 64 to src
	move a2,t8 # a2 = bytes left after the 64-byte chunks

	# Here dst is word-aligned (src is not) and less than 64 bytes are left to go

	.Lua_chk8w:
	pref 0, 0x0(a1)
	andi t8,a2,0x1f # is there a 32-byte chunk?
	# the t8 is the remainder count
	beq a2,t8,.Lua_chk1w # when a2=t8, no 32-byte chunk
	nop # (delay slot)

	LWHI t0,0(a1)
	LWLO t0,3(a1)
	LWHI t1,4(a1)
	LWLO t1,7(a1)
	LWHI t2,8(a1)
	LWLO t2,11(a1)
	LWHI t3,12(a1)
	LWLO t3,15(a1)
	LWHI t4,16(a1)
	LWLO t4,19(a1)
	LWHI t5,20(a1)
	LWLO t5,23(a1)
	LWHI t6,24(a1)
	LWLO t6,27(a1)
	LWHI t7,28(a1)
	LWLO t7,31(a1)
	addiu a1,a1,32

	sw t0,0(a0)
	sw t1,4(a0)
	sw t2,8(a0)
	sw t3,12(a0)
	sw t4,16(a0)
	sw t5,20(a0)
	sw t6,24(a0)
	sw t7,28(a0)
	addiu a0,a0,32

	.Lua_chk1w:
	andi a2,t8,0x3 # now a2 is the remainder past 1w chunks
	beq a2,t8,.Lua_smallCopy
	subu a3,t8,a2 # (delay slot) a3 is count of bytes in 1w chunks
	addu a3,a0,a3 # now a3 is the dst address past the 1w chunks

	# copying in words (4-byte chunks)
	.Lua_wordCopy_loop:
	LWHI v1,0(a1)
	LWLO v1,3(a1)
	addiu a1,a1,4
	addiu a0,a0,4 # note: dst=a0 is word aligned here, see NOTE1
	bne a0,a3,.Lua_wordCopy_loop
	sw v1,-4(a0) # (delay slot) a0 was already advanced, hence -4

	# Now less than 4 bytes (value in a2) left to copy
	.Lua_smallCopy:
	beqz a2,.Lleave
	addu a3,a0,a2 # (delay slot) a3 is one past the last dst byte
	.Lua_smallCopy_loop:
	lb v1,0(a1)
	addiu a1,a1,1
	addiu a0,a0,1
	bne a0,a3,.Lua_smallCopy_loop
	sb v1,-1(a0) # (delay slot) a0 was already advanced, hence -1

	j ra # same return sequence as .Lleave
	nop # (delay slot)

	.set at
	.set reorder

	END(memcpy)


	/************************************************************************
	* Implementation : Static functions
	************************************************************************/