| /* |
| * Copyright (c) 2009 |
| * MIPS Technologies, Inc., California. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its |
| * contributors may be used to endorse or promote products derived from |
| * this software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND |
| * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE |
| * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
| * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
| * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| * SUCH DAMAGE. |
| */ |
| |
| /************************************************************************ |
| * |
| * memcpy.S |
| * Version: "043009" |
| * |
| ************************************************************************/ |
| |
| |
| /************************************************************************ |
| * Include files |
| ************************************************************************/ |
| |
| #include "machine/asm.h" |
| |
| |
| /* |
| * This routine could be optimized for MIPS64. The current code only |
| * uses MIPS32 instructions. |
| * |
| * LWHI/LWLO and SWHI/SWLO select lwl/lwr and swl/swr according to the |
| * target endianness, so the copy code can be written once. |
| * memcpy below pairs "LWHI rt,n(a1)" with "LWLO rt,n+3(a1)" to read one |
| * word from a source that may not be word-aligned, and uses a single |
| * SWHI to store the 1-3 bytes in front of a word boundary in dst. |
| * SWLO is defined for symmetry; the routine in this file does not use it. |
| * If neither __MIPSEB__ nor __MIPSEL__ is defined, none of the four |
| * macros is defined. |
| */ |
| #if defined(__MIPSEB__) |
| # define LWHI lwl /* high part is left in big-endian */ |
| # define SWHI swl /* high part is left in big-endian */ |
| # define LWLO lwr /* low part is right in big-endian */ |
| # define SWLO swr /* low part is right in big-endian */ |
| #endif |
| |
| #if defined(__MIPSEL__) |
| # define LWHI lwr /* high part is right in little-endian */ |
| # define SWHI swr /* high part is right in little-endian */ |
| # define LWLO lwl /* low part is left in little-endian */ |
| # define SWLO swl /* low part is left in little-endian */ |
| #endif |
| |
| LEAF(memcpy,0) |
| |
| .set noreorder |
| .set noat |
| /* |
| * memcpy: copy a2 bytes from a1 (src) to a0 (dst). |
| * |
| * In: a0 = dst, a1 = src, a2 = byte count |
| * Out: v0 = original dst; it is written in the delay slot of the size |
| * test at .Lmemcpy. On the memmove path this code does not set v0. |
| * Scratch: AT, v1, a3, t0-t9. a0, a1 and a2 are advanced/consumed while |
| * copying. |
| * |
| * The body is assembled under ".set noreorder": the instruction written |
| * directly after a branch or jump is its delay slot and executes whether |
| * or not the branch is taken. Several delay slots below do real work |
| * (setting v0, adjusting a2, computing loop end addresses, the final |
| * store of each copy loop), so instruction order must not be changed. |
| * ".set noat" lets the code use AT explicitly as a scratch register. |
| * |
| * Flow: |
| * - Overlap test (described below); overlapping calls go to memmove. |
| * - .Lmemcpy: counts below 8 go straight to the byte loop at .Llast8. |
| * - If (a0 ^ a1) & 3 == 0 the two pointers can be word-aligned together: |
| * copy the (-a0) & 3 leading bytes, then 64-byte chunks (.Lloop16w), |
| * at most one 32-byte chunk (.Lchk8w), single words |
| * (.LwordCopy_loop) and trailing bytes (.Llast8loop). |
| * - Otherwise (.Lunaligned) only dst gets word-aligned; the same chunk |
| * sizes are used, but every source word is read with an LWHI/LWLO |
| * pair and written with a plain sw. |
| * - In both 64-byte loops "pref 30" on dst is issued only while |
| * a0 <= t9 (t9 = end of dst - 160); v1 holds the result of that test. |
| * |
| * Below we handle the case where memcpy is called with overlapping src and dst. |
| * Although memcpy is not required to handle this case, some parts of Android like Skia |
| * rely on such usage. We call memmove to handle such cases. |
| * |
| * The test computes t0 = abs(a0 - a1): sra by 31 yields a mask that is |
| * all ones when the difference is negative and zero otherwise; xor with |
| * the mask followed by subtracting the mask negates a negative value. |
| * sltu then sets AT = (t0 < a2), i.e. the two ranges overlap. When AT is |
| * zero we branch to .Lmemcpy; otherwise we jump to memmove through t9 |
| * with a0/a1/a2 untouched. The "la t9,memmove" is written in the delay |
| * slot of the beq, so it also runs when the branch is taken. |
| * NOTE(review): "la" is an assembler macro; confirm its expansion is |
| * acceptable in a delay slot for every build configuration. |
| * NOTE(review): the size tests are signed (slti at .Lmemcpy, blez at |
| * .Llast8), so a count with bit 31 set that reaches .Lmemcpy copies |
| * nothing. |
| */ |
| subu t0,a0,a1 |
| sra AT,t0,31 |
| xor t1,t0,AT |
| subu t0,t1,AT |
| sltu AT,t0,a2 |
| beq AT,zero,.Lmemcpy |
| la t9,memmove |
| jr t9 |
| nop |
| .Lmemcpy: |
| slti AT,a2,8 |
| bne AT,zero,.Llast8 |
| move v0,a0 # delay slot (runs on both paths): memcpy returns the dst pointer |
| |
| # Test if the src and dst are word-aligned, or can be made word-aligned |
| xor t8,a1,a0 |
| andi t8,t8,0x3 # t8 is a0/a1 word-displacement (0 means they can be aligned together) |
| |
| bne t8,zero,.Lunaligned |
| negu a3,a0 |
| |
| andi a3,a3,0x3 # a3 = (-a0) & 3: we need to copy a3 bytes to make a0/a1 aligned |
| beq a3,zero,.Lchk16w # when a3=0 then the dst (a0) is word-aligned |
| subu a2,a2,a3 # delay slot: now a2 is the remaining bytes count |
| |
| LWHI t8,0(a1) |
| addu a1,a1,a3 |
| SWHI t8,0(a0) |
| addu a0,a0,a3 |
| |
| # Now the dst/src are mutually word-aligned with word-aligned addresses |
| .Lchk16w: |
| andi t8,a2,0x3f # any whole 64-byte chunks? |
| # t8 is the byte count after 64-byte chunks |
| |
| beq a2,t8,.Lchk8w # if a2==t8, no 64-byte chunks |
| # There will be at most 1 32-byte chunk after it |
| subu a3,a2,t8 # delay slot: subtract from a2 the remainder |
| # Here a3 counts bytes in 16w chunks |
| addu a3,a0,a3 # Now a3 is the final dst after 64-byte chunks |
| |
| addu t0,a0,a2 # t0 is the "past the end" address |
| |
| # When in the loop we exercise "pref 30,x(a0)", the a0+x should not be past |
| # the "t0-32" address |
| # This means: for x=128 the last "safe" a0 address is "t0-160" |
| # Alternatively, for x=64 the last "safe" a0 address is "t0-96" |
| # In the current version we will use "pref 30,128(a0)", so "t0-160" is the limit |
| subu t9,t0,160 # t9 is the "last safe pref 30,128(a0)" address |
| |
| pref 0,0(a1) # bring the first line of src, addr 0 |
| pref 0,32(a1) # bring the second line of src, addr 32 |
| pref 0,64(a1) # bring the third line of src, addr 64 |
| pref 30,32(a0) # safe, as we have at least 64 bytes ahead |
| # In case the a0 > t9 don't use "pref 30" at all |
| sgtu v1,a0,t9 |
| bgtz v1,.Lloop16w # skip "pref 30,64(a0)" for too short arrays |
| nop |
| # otherwise, start with using pref30 |
| pref 30,64(a0) |
| .Lloop16w: |
| pref 0,96(a1) |
| lw t0,0(a1) |
| bgtz v1,.Lskip_pref30_96 # skip "pref 30,96(a0)" |
| lw t1,4(a1) |
| pref 30,96(a0) # continue setting up the dest, addr 96 |
| .Lskip_pref30_96: |
| lw t2,8(a1) |
| lw t3,12(a1) |
| lw t4,16(a1) |
| lw t5,20(a1) |
| lw t6,24(a1) |
| lw t7,28(a1) |
| pref 0,128(a1) # bring the next lines of src, addr 128 |
| |
| sw t0,0(a0) |
| sw t1,4(a0) |
| sw t2,8(a0) |
| sw t3,12(a0) |
| sw t4,16(a0) |
| sw t5,20(a0) |
| sw t6,24(a0) |
| sw t7,28(a0) |
| |
| lw t0,32(a1) |
| bgtz v1,.Lskip_pref30_128 # skip "pref 30,128(a0)" |
| lw t1,36(a1) |
| pref 30,128(a0) # continue setting up the dest, addr 128 |
| .Lskip_pref30_128: |
| lw t2,40(a1) |
| lw t3,44(a1) |
| lw t4,48(a1) |
| lw t5,52(a1) |
| lw t6,56(a1) |
| lw t7,60(a1) |
| pref 0, 160(a1) # bring the next lines of src, addr 160 |
| |
| sw t0,32(a0) |
| sw t1,36(a0) |
| sw t2,40(a0) |
| sw t3,44(a0) |
| sw t4,48(a0) |
| sw t5,52(a0) |
| sw t6,56(a0) |
| sw t7,60(a0) |
| |
| addiu a0,a0,64 # adding 64 to dest |
| sgtu v1,a0,t9 |
| bne a0,a3,.Lloop16w |
| addiu a1,a1,64 # delay slot: adding 64 to src |
| move a2,t8 |
| |
| # Here we have src and dest word-aligned but less than 64-bytes to go |
| |
| .Lchk8w: |
| pref 0, 0x0(a1) |
| andi t8,a2,0x1f # is there a 32-byte chunk? |
| # the t8 is the remainder count past 32-bytes |
| beq a2,t8,.Lchk1w # when a2=t8, no 32-byte chunk |
| nop |
| |
| lw t0,0(a1) |
| lw t1,4(a1) |
| lw t2,8(a1) |
| lw t3,12(a1) |
| lw t4,16(a1) |
| lw t5,20(a1) |
| lw t6,24(a1) |
| lw t7,28(a1) |
| addiu a1,a1,32 |
| |
| sw t0,0(a0) |
| sw t1,4(a0) |
| sw t2,8(a0) |
| sw t3,12(a0) |
| sw t4,16(a0) |
| sw t5,20(a0) |
| sw t6,24(a0) |
| sw t7,28(a0) |
| addiu a0,a0,32 |
| |
| .Lchk1w: |
| andi a2,t8,0x3 # now a2 is the remainder past 1w chunks |
| beq a2,t8,.Llast8 |
| subu a3,t8,a2 # delay slot: a3 is count of bytes in 1w chunks |
| addu a3,a0,a3 # now a3 is the dst address past the 1w chunks |
| |
| # copying in words (4-byte chunks) |
| .LwordCopy_loop: |
| lw t3,0(a1) # the first t3 may be equal t0 ... optimize? |
| addiu a1,a1,4 |
| addiu a0,a0,4 |
| bne a0,a3,.LwordCopy_loop |
| sw t3,-4(a0) |
| |
| # For the last (<8) bytes; also the direct target for calls with a2 < 8 |
| .Llast8: |
| blez a2,.Lleave |
| addu a3,a0,a2 # delay slot: a3 is the dst address one past the last byte |
| .Llast8loop: |
| lb v1,0(a1) |
| addiu a1,a1,1 |
| addiu a0,a0,1 |
| bne a0,a3,.Llast8loop |
| sb v1,-1(a0) |
| |
| .Lleave: |
| j ra |
| nop |
| |
| # |
| # UNALIGNED case: (a0 ^ a1) & 3 != 0, so src and dst cannot both be word-aligned |
| # |
| |
| .Lunaligned: |
| # got here with a3="negu a0" |
| andi a3,a3,0x3 # test if the a0 is word aligned; a3 = bytes needed to align dst |
| beqz a3,.Lua_chk16w |
| subu a2,a2,a3 # delay slot: bytes left after initial a3 bytes |
| |
| LWHI v1,0(a1) |
| LWLO v1,3(a1) |
| addu a1,a1,a3 # a3 may be here 1, 2 or 3 |
| SWHI v1,0(a0) |
| addu a0,a0,a3 # below the dst will be word aligned (NOTE1) |
| |
| .Lua_chk16w: |
| andi t8,a2,0x3f # any whole 64-byte chunks? |
| # t8 is the byte count after 64-byte chunks |
| beq a2,t8,.Lua_chk8w # if a2==t8, no 64-byte chunks |
| # There will be at most 1 32-byte chunk after it |
| subu a3,a2,t8 # delay slot: subtract from a2 the remainder |
| # Here a3 counts bytes in 16w chunks |
| addu a3,a0,a3 # Now a3 is the final dst after 64-byte chunks |
| |
| addu t0,a0,a2 # t0 is the "past the end" address |
| |
| subu t9,t0,160 # t9 is the "last safe pref 30,128(a0)" address |
| |
| pref 0,0(a1) # bring the first line of src, addr 0 |
| pref 0,32(a1) # bring the second line of src, addr 32 |
| pref 0,64(a1) # bring the third line of src, addr 64 |
| pref 30,32(a0) # safe, as we have at least 64 bytes ahead |
| # In case the a0 > t9 don't use "pref 30" at all |
| sgtu v1,a0,t9 |
| bgtz v1,.Lua_loop16w # skip "pref 30,64(a0)" for too short arrays |
| nop |
| # otherwise, start with using pref30 |
| pref 30,64(a0) |
| .Lua_loop16w: |
| pref 0,96(a1) |
| LWHI t0,0(a1) |
| LWLO t0,3(a1) |
| LWHI t1,4(a1) |
| bgtz v1,.Lua_skip_pref30_96 |
| LWLO t1,7(a1) |
| pref 30,96(a0) # continue setting up the dest, addr 96 |
| .Lua_skip_pref30_96: |
| LWHI t2,8(a1) |
| LWLO t2,11(a1) |
| LWHI t3,12(a1) |
| LWLO t3,15(a1) |
| LWHI t4,16(a1) |
| LWLO t4,19(a1) |
| LWHI t5,20(a1) |
| LWLO t5,23(a1) |
| LWHI t6,24(a1) |
| LWLO t6,27(a1) |
| LWHI t7,28(a1) |
| LWLO t7,31(a1) |
| pref 0,128(a1) # bring the next lines of src, addr 128 |
| |
| sw t0,0(a0) |
| sw t1,4(a0) |
| sw t2,8(a0) |
| sw t3,12(a0) |
| sw t4,16(a0) |
| sw t5,20(a0) |
| sw t6,24(a0) |
| sw t7,28(a0) |
| |
| LWHI t0,32(a1) |
| LWLO t0,35(a1) |
| LWHI t1,36(a1) |
| bgtz v1,.Lua_skip_pref30_128 |
| LWLO t1,39(a1) |
| pref 30,128(a0) # continue setting up the dest, addr 128 |
| .Lua_skip_pref30_128: |
| LWHI t2,40(a1) |
| LWLO t2,43(a1) |
| LWHI t3,44(a1) |
| LWLO t3,47(a1) |
| LWHI t4,48(a1) |
| LWLO t4,51(a1) |
| LWHI t5,52(a1) |
| LWLO t5,55(a1) |
| LWHI t6,56(a1) |
| LWLO t6,59(a1) |
| LWHI t7,60(a1) |
| LWLO t7,63(a1) |
| pref 0, 160(a1) # bring the next lines of src, addr 160 |
| |
| sw t0,32(a0) |
| sw t1,36(a0) |
| sw t2,40(a0) |
| sw t3,44(a0) |
| sw t4,48(a0) |
| sw t5,52(a0) |
| sw t6,56(a0) |
| sw t7,60(a0) |
| |
| addiu a0,a0,64 # adding 64 to dest |
| sgtu v1,a0,t9 |
| bne a0,a3,.Lua_loop16w |
| addiu a1,a1,64 # delay slot: adding 64 to src |
| move a2,t8 |
| |
| # Here dst is word-aligned (src is not, see NOTE1) and less than 64 bytes remain |
| |
| .Lua_chk8w: |
| pref 0, 0x0(a1) |
| andi t8,a2,0x1f # is there a 32-byte chunk? |
| # the t8 is the remainder count |
| beq a2,t8,.Lua_chk1w # when a2=t8, no 32-byte chunk |
| nop |
| |
| LWHI t0,0(a1) |
| LWLO t0,3(a1) |
| LWHI t1,4(a1) |
| LWLO t1,7(a1) |
| LWHI t2,8(a1) |
| LWLO t2,11(a1) |
| LWHI t3,12(a1) |
| LWLO t3,15(a1) |
| LWHI t4,16(a1) |
| LWLO t4,19(a1) |
| LWHI t5,20(a1) |
| LWLO t5,23(a1) |
| LWHI t6,24(a1) |
| LWLO t6,27(a1) |
| LWHI t7,28(a1) |
| LWLO t7,31(a1) |
| addiu a1,a1,32 |
| |
| sw t0,0(a0) |
| sw t1,4(a0) |
| sw t2,8(a0) |
| sw t3,12(a0) |
| sw t4,16(a0) |
| sw t5,20(a0) |
| sw t6,24(a0) |
| sw t7,28(a0) |
| addiu a0,a0,32 |
| |
| .Lua_chk1w: |
| andi a2,t8,0x3 # now a2 is the remainder past 1w chunks |
| beq a2,t8,.Lua_smallCopy |
| subu a3,t8,a2 # delay slot: a3 is count of bytes in 1w chunks |
| addu a3,a0,a3 # now a3 is the dst address past the 1w chunks |
| |
| # copying in words (4-byte chunks) |
| .Lua_wordCopy_loop: |
| LWHI v1,0(a1) |
| LWLO v1,3(a1) |
| addiu a1,a1,4 |
| addiu a0,a0,4 # note: dst=a0 is word aligned here, see NOTE1 |
| bne a0,a3,.Lua_wordCopy_loop |
| sw v1,-4(a0) |
| |
| # Now less than 4 bytes (value in a2) left to copy |
| .Lua_smallCopy: |
| beqz a2,.Lleave |
| addu a3,a0,a2 # delay slot: a3 is the dst address one past the last byte |
| .Lua_smallCopy_loop: |
| lb v1,0(a1) |
| addiu a1,a1,1 |
| addiu a0,a0,1 |
| bne a0,a3,.Lua_smallCopy_loop |
| sb v1,-1(a0) |
| |
| j ra |
| nop |
| |
| .set at |
| .set reorder |
| |
| END(memcpy) |
| |
| |
| /************************************************************************ |
| * Implementation : Static functions |
| ************************************************************************/ |