1/*
2   Copyright 2003 Richard Curnow, SuperH (UK) Ltd.
3
4   This file is subject to the terms and conditions of the GNU General Public
5   License.  See the file "COPYING" in the main directory of this archive
6   for more details.
7
   Tight version of memcpy for the case of just copying a page.
9   Prefetch strategy empirically optimised against RTL simulations
10   of SH5-101 cut2 eval chip with Cayman board DDR memory.
11
12   Parameters:
13   r2 : source effective address (start of page)
14   r3 : destination effective address (start of page)
15
16   Always copies 4096 bytes.
17
18   Points to review.
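   * Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead.
     It seems like the prefetch needs to be at least 4 lines ahead to get
     the data into the cache in time, and the allocos contend with outstanding
     prefetches for the same cache set, so it's better to have the numbers
     different.
     NOTE(review): the routine in this file contains no prefetch instruction;
     only the alloco (2 lines ahead) is actually issued.  Confirm whether the
     prefetch was removed deliberately before relying on this paragraph.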
24   */
25
26	.section .text..SHmedia32,"ax"
27	.little
28
29	.balign 8
30	.global sh64_page_copy
sh64_page_copy:

	/* Copy 4096 bytes from the page at r2 to the page at r3, 32 bytes
	   (four quadwords, i.e. one destination line) per loop iteration.

	   In:       r2  = source address (start of page)
	             r3  = destination address (start of page)
	             r18 = return address
	   Out:      r2 unchanged; r3 = destination + 4096
	   Clobbers: r7, r8, r23, r36-r39, r60-r62, tr0, tr1, tr3

	   Destination lines are allocated with alloco 2 lines (64 bytes)
	   ahead of the stores.  No source prefetch is issued.

	   Every alloco is followed by a synco tagged TAKum03020.
	   NOTE(review): presumably a hardware-workaround identifier; keep
	   each alloco/synco pair together until that is confirmed.  */

	pta 1f, tr1		! tr1 = top of the copy loop
	pta 3f, tr3		! tr3 = loop body entry that skips the alloco
	ptabs r18, tr0		! tr0 = return address

	/* Allocate the first two destination lines up front; inside the
	   loop the alloco always runs 2 lines ahead of the stores.  */
	alloco r3, 0x00
	synco		! TAKum03020
	alloco r3, 0x20
	synco		! TAKum03020

	movi 4032, r7
	add  r3, r7, r7		! r7 = dst + 4096 - 64: first of the last 2 lines
	addi r7, 64, r8		! r8 = dst + 4096: end of the destination page

	/* Address the source relative to the destination pointer so that
	   only r3 has to be advanced in the loop: r3 + r60 = current source.
	   The fourth offset lives in r23 because r63 cannot hold a value
	   (it always reads as zero).  */
	sub  r2, r3, r60	! r60 = src - dst
	addi r60, 8, r61	! r61 = src - dst + 8
	addi r61, 8, r62	! r62 = src - dst + 16
	addi r62, 8, r23	! r23 = src - dst + 24

/* Minimal code size: a single loop handles every line, and the alloco is
   simply branched around for the last 2 lines of the page.  */
1:
	bge/u r3, r7, tr3  ! skip alloco for last 2 lines
	alloco r3, 0x40    ! alloc destination line 2 lines ahead
	synco		! TAKum03020
3:
	ldx.q r3, r60, r36 ! load 32 source bytes into r36-r39 ...
	ldx.q r3, r61, r37
	ldx.q r3, r62, r38
	ldx.q r3, r23, r39
	st.q  r3,   0, r36 ! ... and store them to the destination line
	st.q  r3,   8, r37
	st.q  r3,  16, r38
	st.q  r3,  24, r39
	addi r3, 32, r3    ! advance to the next 32-byte line
	bgt/l r8, r3, tr1  ! loop while r3 < dst + 4096

	blink tr0, r63	   ! return (link value discarded into r63)
78