/*
   Copyright 2003 Richard Curnow, SuperH (UK) Ltd.

   This file is subject to the terms and conditions of the GNU General Public
   License.  See the file "COPYING" in the main directory of this archive
   for more details.

   Tight version of memcpy for the case of just copying a page.
   Prefetch strategy empirically optimised against RTL simulations
   of SH5-101 cut2 eval chip with Cayman board DDR memory.

   Parameters:
   r2 : source effective address (start of page)
   r3 : destination effective address (start of page)

   Always copies 4096 bytes.

   Register usage:
   r2       : source base, not modified
   r3       : destination cursor, advanced by 32 per iteration; equals
              destination + 4096 on return
   r7       : destination + 4032, limit for the look-ahead alloco
   r8       : destination + 4096, end of the destination page
   r60      : source - destination, so that r3 + r60 addresses the source
   r61, r62, r23 : r60 + 8, r60 + 16, r60 + 24
   r36-r39  : the four quadwords of the line being copied
   tr0      : return target, loaded from r18
   tr1, tr3 : loop top and copy body

   Clobbers r3, r7, r8, r23, r36-r39, r60-r62, tr0, tr1 and tr3.

   Points to review.
   * These tuning notes describe a prefetch 4 lines ahead combined with an
     alloco 2 lines ahead.  It seems like the prefetch needs to be at least
     4 lines ahead to get the data into the cache in time, and the allocos
     contend with outstanding prefetches for the same cache set, so it's
     better to have the numbers different.
   * The code below issues no source prefetch; only the alloco 2 lines ahead
     is present.  Every alloco is followed by a synco tagged TAKum03020
     (presumably a workaround identifier -- confirm before changing the
     alloco/synco pairing).
   */

	.section .text..SHmedia32,"ax"
	.little

	.balign 8
	.global sh64_page_copy
sh64_page_copy:

	/* Copy 4096 bytes worth of data from r2 to r3.
	   Do alloco 2 lines ahead.
	   No source prefetch is issued. */

	pta	1f, tr1			! tr1 = top of the copy loop
	pta	3f, tr3			! tr3 = copy body, used to skip the alloco
	ptabs	r18, tr0		! tr0 = return target held in r18

	alloco	r3, 0x00		! allocate the first two destination lines
	synco				! TAKum03020
	alloco	r3, 0x20
	synco				! TAKum03020

	movi	4032, r7
	add	r3, r7, r7		! r7 = dst + 4032: no line 2 ahead from here on
	addi	r7, 64, r8		! r8 = dst + 4096: end of the destination page
	sub	r2, r3, r60		! r60 = src - dst
	addi	r60, 8, r61		! r61 = src - dst + 8
	addi	r61, 8, r62		! r62 = src - dst + 16
	addi	r62, 8, r23		! r23 = src - dst + 24

/* Minimal code size.  The original tuning note says the extra branches
   inside the loop don't cost much because they overlap with the time spent
   waiting for prefetches to complete; that has not been re-checked for this
   prefetch-free version. */
1:
	bge/u	r3, r7, tr3		! skip alloco for last 2 lines
	alloco	r3, 0x40		! alloc destination line 2 lines ahead
	synco				! TAKum03020
3:
	ldx.q	r3, r60, r36		! load one 32-byte line from the source ...
	ldx.q	r3, r61, r37
	ldx.q	r3, r62, r38
	ldx.q	r3, r23, r39
	st.q	r3, 0, r36		! ... and store it to the destination
	st.q	r3, 8, r37
	st.q	r3, 16, r38
	st.q	r3, 24, r39
	addi	r3, 32, r3		! advance to the next destination line
	bgt/l	r8, r3, tr1		! loop while r3 is below the end of the page

	blink	tr0, r63		! return