/*  *********************************************************************
    *  Broadcom Common Firmware Environment (CFE)
    *
    *  Special, fast memcpy for SB-1 core          File: sb1250_memcpy.S
    *
    *  This module contains an optimized memcpy for the BCM1250.
    *
    *  Author:  Mark Vandevoorde
    *
    *  *********************************************************************
    *
    *  Copyright 2000,2001,2002,2003
    *  Broadcom Corporation. All rights reserved.
    *
    *  This software is furnished under license and may be used and
    *  copied only in accordance with the following terms and
    *  conditions.  Subject to these conditions, you may download,
    *  copy, install, use, modify and distribute modified or unmodified
    *  copies of this software in source and/or binary form.  No title
    *  or ownership is transferred hereby.
    *
    *  1) Any source code used, modified or distributed must reproduce
    *     and retain this copyright notice and list of conditions
    *     as they appear in the source file.
    *
    *  2) No right is granted to use any trade name, trademark, or
    *     logo of Broadcom Corporation.  The "Broadcom Corporation"
    *     name may not be used to endorse or promote products derived
    *     from this software without the prior written permission of
    *     Broadcom Corporation.
    *
    *  3) THIS SOFTWARE IS PROVIDED "AS-IS" AND ANY EXPRESS OR
    *     IMPLIED WARRANTIES, INCLUDING BUT NOT LIMITED TO, ANY IMPLIED
    *     WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
    *     PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED. IN NO EVENT
    *     SHALL BROADCOM BE LIABLE FOR ANY DAMAGES WHATSOEVER, AND IN
    *     PARTICULAR, BROADCOM SHALL NOT BE LIABLE FOR DIRECT, INDIRECT,
    *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
    *     (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
    *     GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
    *     BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
    *     OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
    *     TORT (INCLUDING NEGLIGENCE OR OTHERWISE), EVEN IF ADVISED OF
    *     THE POSSIBILITY OF SUCH DAMAGE.
    ********************************************************************* */

#include

#if defined(__long64)
#define LOAD    ld
#define LOADL   ldl
#define LOADR   ldr
#define STOREL  sdl
#define STORER  sdr
#define STORE   sd
#define ADD     daddu
#define SUB     dsubu
#define SRL     dsrl
#define SRA     dsra
#define SLL     dsll
#define SLLV    dsllv
#define SRLV    dsrlv
#define NBYTES  8
#define LOG_NBYTES 3
#else
#define LOAD    lw
#define LOADL   lwl
#define LOADR   lwr
#define STOREL  swl
#define STORER  swr
#define STORE   sw
#define ADD     addu
#define SUB     subu
#define SRL     srl
#define SLL     sll
#define SRA     sra
#define SLLV    sllv
#define SRLV    srlv
#define NBYTES  4
#define LOG_NBYTES 2
#endif /* __long64 */

#if defined(__MIPSEB)
#define LDFIRST   LOADL
#define LDREST    LOADR
#define STFIRST   STOREL
#define SHIFT_CPY SRLV
#endif

#if defined(__MIPSEL)
#define LDFIRST   LOADR
#define LDREST    LOADL
#define STFIRST   STORER
#define SHIFT_CPY SLLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define DEST(unit)  FIRST(unit)
#define ADDRMASK    (NBYTES-1)

#ifdef _M1WORKAROUND_
#define USE(x) lbu AT,x; xor zero, AT, zero
#else
#define USE(x)
#endif

        .text
        .set noreorder
        .set noat
        .set mips64

        .globl memcpy_sb1
        .ent memcpy_sb1
        .align 4
memcpy_sb1:
        ## Arguments.  Note: dst & src may be unaligned, len may be 0.
#define dst a0
#define src a1
#define len a2

        ## Temps
#define count a3
#define match t8
#define dst2  match

        ## The "issue break"s below are very approximate.
        ## Issue delays for dcache fills will perturb the schedule, as will
        ## load queue full replay traps, etc.
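        ## Overall shape of the routine, as a hedged C sketch (the pseudo-
        ## function names below are descriptive only, not symbols in this
        ## file; the labels in parentheses are the real targets):
        ##
        ##   if (len < NBYTES)          copy byte by byte      (copy_bytes_checklen)
        ##   else if (dst & ADDRMASK)   align dst, re-dispatch (dst_unaligned)
        ##   else if (src & ADDRMASK)   lwl/lwr word loop      (src_unaligned_dst_aligned)
        ##   else                       unrolled word loop     (both_aligned)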
        ## If len < NBYTES use byte operations
        pref    0, 0(src)
        pref    1, 0(dst)
        sltu    t2, len, NBYTES         # long pipe
        and     t1, dst, ADDRMASK
        ## issue break
        pref    0, 1*32(src)
        pref    1, 1*32(dst)
        bnez    t2, copy_bytes_checklen
        and     t0, src, ADDRMASK
        ## issue break
        pref    0, 2*32(src)
        bnez    t1, dst_unaligned
        pref    1, 2*32(dst)
        ## issue break
        bnez    t0, src_unaligned_dst_aligned
        ## use delay slot for fall-through
        ## src and dst are aligned; need to compute count
both_aligned:
        SRL     count, len, LOG_NBYTES+3        # +3 for 8 units/iter
        ADD     dst2, dst, 8*NBYTES
        ## issue break
        beqz    count, cleanup_both_aligned
        sltu    t7, len, 4*NBYTES       # useful only when branch taken
        pref    0, 3*32(src)
        pref    1, 3*32(dst)
        and     len, len, NBYTES*8-1    # len = what's left to copy at loop exit
        .align 4
#ifdef PASS1
1:
        ## issue break
        LOAD    t0, FIRST(0)(src)
        LOAD    t1, FIRST(1)(src)
        ## issue break
        ssnop   # To avoid dcfifo full, we schedule 2 cycles of empty LS pipes
        ssnop   # during which the DCFIFO can complete 4 8-byte stores.  Pass3
        ssnop   # should not need these extra cycles.  The other 32
                # bytes are completed during the 4 cycles where STORE is issued
                # to LS1.
                # Note: on Pass1, we need N+1 ssnops to pause N cycles.
        ## issue break
        LOAD    t2, FIRST(2)(src)
        LOAD    t3, FIRST(3)(src)
        SUB     count, count, 1
        ## issue break
        LOAD    t4, FIRST(4)(src)
        USE( FIRST(0-8)(dst2) )
        LOAD    t5, FIRST(5)(src)
        ## issue break
        STORE   t0, FIRST(0-8)(dst2)    # PASS1: >= 3 cycles after load
        ## issue break
#ifdef __long64
        USE( FIRST(4-8)(dst2) )
#else
        USE( FIRST(7-8)(dst2) )
#endif
        STORE   t1, FIRST(1-8)(dst2)
        ## issue break
        LOAD    t6, FIRST(6)(src)
        LOAD    t7, FIRST(7)(src)
        ADD     src, src, 8*NBYTES      # long pipe OK
        ADD     dst2, dst2, 8*NBYTES    # long pipe OK
#ifdef __long64
        USE( FIRST(7)(dst) )
#endif
        ## issue break
        STORE   t2, FIRST(2)(dst)
        STORE   t3, FIRST(3)(dst)
        ## issue break
        STORE   t4, FIRST(4)(dst)
        STORE   t5, FIRST(5)(dst)
        ## issue break
        STORE   t6, FIRST(6)(dst)
        STORE   t7, FIRST(7)(dst)
#ifdef __long64
        ## issue break
        pref    1, 7*32(dst)
        pref    1, 8*32(dst)
        ADD     dst, dst, 8*NBYTES      # long pipe OK
        ## issue break
        pref    0, 6*32(src)
        bnez    count, 1b
        pref    0, 7*32(src)
#else
        ## issue break
        pref    1, 8*32(dst)
        pref    0, 7*32(src)
        ## issue break
        bnez    count, 1b
        ADD     dst, dst, 8*NBYTES      # long pipe OK
#endif
#else   // not PASS1

/*
        USE_DOUBLE      _M1WORKAROUND_  DIST    cycles/cacheline
        yes             no              L1      5
        yes             yes             L1      6
        no              no              L1      9
        no              yes             L1      10
*/

#define src2 t7
1:
#ifdef _M1WORKAROUND_
        lbu     AT, FIRST(0-8)(dst2)
        lbu     t0, FIRST(7-8)(dst2)
        xor     zero, AT, zero
        xor     zero, t0, zero
# ifdef __long64
        lbu     AT, FIRST(4-8)(dst2)
        ADD     zero, 1                 # L1 NOP
        xor     zero, AT, zero
# endif
#endif
        ## issue break
        LOAD    t0, FIRST(0)(src)
        STORE   t0, FIRST(0-8)(dst2)
        ADD     src2, src, 0            # long pipe OK
        ## issue break
        LOAD    t0, FIRST(1)(src)
        STORE   t0, FIRST(1-8)(dst2)
        ## issue break
        LOAD    t0, FIRST(2)(src)
        STORE   t0, FIRST(2-8)(dst2)
        ADD     dst2, dst2, 8*NBYTES    # long pipe OK
        ## issue break
        LOAD    t0, FIRST(3)(src)
        STORE   t0, FIRST(3)(dst)
        ## issue break
        LOAD    t0, FIRST(4)(src)
        STORE   t0, FIRST(4)(dst)
        ADD     src, src, 8*NBYTES      # long pipe OK
        ## issue break
        LOAD    t0, FIRST(5)(src2)
        STORE   t0, FIRST(5)(dst)
        SUB     count, count, 1
        ## issue break
        LOAD    t0, FIRST(6)(src2)
        STORE   t0, FIRST(6)(dst)
        ## issue break
        LOAD    t0, FIRST(7)(src2)
        STORE   t0, FIRST(7)(dst)
        ADD     dst, dst, 8*NBYTES      # long pipe OK
#ifdef __long64
        ## issue break
        pref    1, 6*32(dst2)
        pref    1, 7*32(dst2)
        ## issue break
        pref    0, 7*32(src2)
        bnez    count, 1b
        pref    0, 8*32(src2)
#else
        ## issue break
        pref    1, 7*32(dst2)
        bnez    count, 1b
        pref    0, 8*32(src2)
#endif
#endif  // ifdef-else PASS1

#define rem t6
        sltu    t7, len, 4*NBYTES       # inst duplicated in delay slot of branch
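        ## In outline, each iteration of the unrolled loop above is the
        ## following hedged C sketch (s, d, and w0..w7 are illustrative
        ## names only, not registers or symbols in this file):
        ##
        ##   while (count--) {
        ##       w0 = s[0]; w1 = s[1]; ... w7 = s[7];  /* NBYTES-wide loads */
        ##       d[0] = w0; d[1] = w1; ... d[7] = w7;  /* 8*NBYTES stored   */
        ##       s += 8; d += 8;
        ##   }
        ##
        ## The real schedule staggers stores behind loads (the ">= 3 cycles
        ## after load" constraint noted above) and keeps dst2 = dst + 8*NBYTES
        ## so that one pointer can be advanced early in the iteration while
        ## stores are still issuing relative to the other.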
cleanup_both_aligned:
        beqz    len, done
        and     rem, len, ADDRMASK      # rem = len % NBYTES
        ## len > 0; must be safe to read NBYTES
        bnez    t7, less_than_4units
        SRL     count, len, LOG_NBYTES  # count = number of units left
        LOAD    t0, FIRST(0)(src)
        LOAD    t1, FIRST(1)(src)
        SUB     len, len, 4*NBYTES
        LOAD    t2, FIRST(2)(src)
        SUB     count, count, 4
        ADD     dst, dst, 4*NBYTES
        LOAD    t3, FIRST(3)(src)
        ADD     src, src, 4*NBYTES
        USE( FIRST(-5)(dst2) )
        STORE   t0, FIRST(-8)(dst2)     # PASS1: >= 3 cycles after load
        STORE   t1, FIRST(-7)(dst2)
        STORE   t2, FIRST(-6)(dst2)
        beqz    len, done
        STORE   t3, FIRST(-5)(dst2)
less_than_4units:
        beqz    count, copy_bytes_aligned       # rem > 0
        nop
        ## count is the number of full units left
        ## rem is len & ADDRMASK
copy_units_aligned:
        LOAD    t0, FIRST(0)(src)
        ADD     src, src, NBYTES
        SUB     count, count, 1
        USE( FIRST(0)(dst) )
        STORE   t0, FIRST(0)(dst)
        bnez    count, copy_units_aligned
        ADD     dst, dst, NBYTES

        ## src and dst are aligned; need to copy rem bytes.
        ## Do an explicit read/mask/or/write (eliminates branches).
#define mask  count
#define mask2 len
        beqz    rem, done
copy_bytes_aligned:
        SUB     mask, zero, 1
        SLL     rem, rem, 3             # rem = number of bits to copy
        LOAD    t0, FIRST(0)(src)
        LOAD    t1, FIRST(0)(dst)
        ## clear bits in mask where data should be copied
        SHIFT_CPY mask, mask, rem
        nor     mask2, mask, zero
        and     t1, mask, t1            # t1 = dst bytes to keep
        and     t0, mask2, t0           # t0 = src bytes to copy
        or      t0, t0, t1
        USE( FIRST(0)(dst) )
        jr      ra
        STORE   t0, FIRST(0)(dst)

dst_unaligned:
        ## dst is unaligned
        ## t0 = src & ADDRMASK
        ## t1 = dst & ADDRMASK; t1 > 0
        ## len >= NBYTES
        ##
        ## Copy enough bytes to align dst.
        ## Set match = (src and dst have the same alignment).
        LDFIRST t3, FIRST(0)(src)
        ADD     t2, zero, NBYTES
        LDREST  t3, REST(0)(src)
        SUB     t2, t2, t1              # t2 = number of bytes copied
        xor     match, t0, t1
        SUB     len, len, t2
        USE( FIRST(0)(dst) )
        beqz    len, done
        STFIRST t3, FIRST(0)(dst)
        ADD     dst, dst, t2
        beqz    match, both_aligned
        ADD     src, src, t2

src_unaligned_dst_aligned:
        ## Set count = number of iters for main loop.
        ## Update len, src, and dst for number of bytes copied.
        SRL     count, len, LOG_NBYTES+2        # +2 for 4 units/iter
        pref    0, 3*32(src)
        beqz    count, cleanup_src_unaligned
        and     rem, len, ADDRMASK      # rem = len % NBYTES
        pref    1, 3*32(dst)
        USE( DEST(0)(dst) )
        .align 4
1:
        ## issue break
        LDFIRST t0, FIRST(0)(src)
        LDFIRST t1, FIRST(1)(src)
        SUB     len, len, 4*NBYTES      # hoist out of loop?
        ## issue break
        LDREST  t0, REST(0)(src)
        LDREST  t1, REST(1)(src)
        SUB     count, count, 1
        ## issue break
        LDFIRST t2, FIRST(2)(src)
        LDFIRST t3, FIRST(3)(src)
        ## issue break
        LDREST  t2, REST(2)(src)
        LDREST  t3, REST(3)(src)
        ## issue break
        pref    0, 7*32(src)            # 0 is PREF_LOAD (not streamed)
        ADD     src, src, 4*NBYTES      # long pipe
        pref    1, 7*32(dst)            # 1 is PREF_STORE (not streamed)
        ## issue break
        USE( DEST(3)(dst) )
        STORE   t0, DEST(0)(dst)        # must be 3 cycles after LDREST
        STORE   t1, DEST(1)(dst)
        ## issue break
        STORE   t2, DEST(2)(dst)        # must be 3 cycles after LDREST
        STORE   t3, DEST(3)(dst)
        ## issue break
        bnez    count, 1b
        ADD     dst, dst, 4*NBYTES      # long pipe?
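        ## Two techniques used above, as hedged C sketches (variable names
        ## are illustrative only):
        ##
        ## 1) An LDFIRST/LDREST pair (lwl/lwr, or ldl/ldr under __long64)
        ##    assembles one NBYTES-wide value from an unaligned src with no
        ##    address-error trap; one pair behaves roughly like:
        ##
        ##        unsigned long w;
        ##        memcpy(&w, src, NBYTES);   /* then STORE w to aligned dst */
        ##
        ## 2) The branch-free tail in copy_bytes_aligned merges rem source
        ##    bytes into one aligned word (little-endian shown; big-endian
        ##    uses SRLV and so clears from the other end):
        ##
        ##        mask = ~0UL << (rem * 8);          /* low rem bytes clear */
        ##        *d = (*d & mask) | (*s & ~mask);   /* keep high, copy low */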
cleanup_src_unaligned:
        beqz    len, done
        SRL     count, len, LOG_NBYTES  # count = number of units left
        beqz    count, copy_bytes       # taken only if len < NBYTES
copy_units_src_unaligned:
        LDFIRST t0, FIRST(0)(src)
        LDREST  t0, REST(0)(src)
        ADD     src, src, NBYTES
        SUB     count, count, 1
        USE( FIRST(0)(dst) )
        STORE   t0, FIRST(0)(dst)
        ADD     dst, dst, NBYTES
        bnez    count, copy_units_src_unaligned
        SUB     len, len, NBYTES

copy_bytes_checklen:
        beqz    len, done
copy_bytes:
        SUB     len, len, 1
        USE( 0(dst) )
        USE( NBYTES-2(dst) )
        lb      t0, 0(src)
        beqz    len, done
        sb      t0, 0(dst)

#define COPY_BYTE(N)            \
        lb      t0, N(src);     \
        SUB     len, len, 1;    \
        beqz    len, done;      \
        sb      t0, N(dst);

        COPY_BYTE(1)
        COPY_BYTE(2)
#ifdef __long64
        COPY_BYTE(3)
        COPY_BYTE(4)
        COPY_BYTE(5)
        COPY_BYTE(6)
#endif
        lb      t0, NBYTES-1(src)
        jr      ra
        sb      t0, NBYTES-1(dst)

done:
        jr      ra
        nop
        .end memcpy_sb1

### The danger of using streaming prefetches is that one stream
### may evict the other before the CPU consumes it.
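### A C-side declaration for callers might look like the following
### (illustrative; the routine never writes v0, so unlike the standard
### memcpy it is declared as returning nothing):
###
###   extern void memcpy_sb1(void *dst, const void *src, size_t len);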