/* NGmemcpy.S: Niagara optimized memcpy.
 *
 * Copyright (C) 2006 David S. Miller (davem@davemloft.net)
 */

#ifdef __KERNEL__
#include <asm/asi.h>
#include <asm/thread_info.h>
#define GLOBAL_SPARE	%g7
#define RESTORE_ASI(TMP)	\
	ldub	[%g6 + TI_CURRENT_DS], TMP;  \
	wr	TMP, 0x0, %asi;
#else
#define GLOBAL_SPARE	%g5
#define RESTORE_ASI(TMP)	\
	wr	%g0, ASI_PNF, %asi
#endif

#ifndef STORE_ASI
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#endif

#ifndef EX_LD
#define EX_LD(x)	x
#endif

#ifndef EX_ST
#define EX_ST(x)	x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif

#ifndef LOAD
#ifndef MEMCPY_DEBUG
#define LOAD(type,addr,dest)	type [addr], dest
#else
#define LOAD(type,addr,dest)	type##a [addr] 0x80, dest
#endif
#endif

#ifndef LOAD_TWIN
#define LOAD_TWIN(addr_reg,dest0,dest1)	\
	ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

#ifndef STORE_INIT
#define STORE_INIT(src,addr)	stxa src, [addr] %asi
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	NGmemcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

	.register	%g2,#scratch
	.register	%g3,#scratch

	.text
	.align		64

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
	srlx		%o2, 31, %g2
	cmp		%g2, 0
	tne		%xcc, 5
	PREAMBLE
	mov		%o0, GLOBAL_SPARE
	cmp		%o2, 0
	be,pn		%XCC, 85f
	 or		%o0, %o1, %o3
	cmp		%o2, 16
	blu,a,pn	%XCC, 80f
	 or		%o3, %o2, %o3

	/* 2 blocks (128 bytes) is the minimum we can do the block
	 * copy with.  We need to ensure that we'll iterate at least
	 * once in the block copy loop.  At worst we'll need to align
	 * the destination to a 64-byte boundary which can chew up
	 * to (64 - 1) bytes from the length before we perform the
	 * block copy loop.
	 */
	cmp		%o2, (2 * 64)
	blu,pt		%XCC, 70f
	 andcc		%o3, 0x7, %g0
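	/* Note: %o3 is (dst | src) here; the condition codes set by the
	 * andcc above are what the "bne,pn %XCC, 75f" at label 70 tests
	 * when len < 128.
	 */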

	/* %o0:	dst
	 * %o1:	src
	 * %o2:	len  (known to be >= 128)
	 *
	 * The block copy loops will use %o4/%o5,%g2/%g3 as
	 * temporaries while copying the data.
	 */

	LOAD(prefetch, %o1, #one_read)
	wr		%g0, STORE_ASI, %asi
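	/* From here on, the STORE_INIT block-init stores go through %asi,
	 * which now holds STORE_ASI.
	 */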

	/* Align destination on 64-byte boundary.  */
	andcc		%o0, (64 - 1), %o4
	be,pt		%XCC, 2f
	 sub		%o4, 64, %o4
	sub		%g0, %o4, %o4	! bytes to align dst
	sub		%o2, %o4, %o2
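	/* At this point %o4 = 64 - (dst & 63), the number of bytes the
	 * byte loop below must copy before dst reaches a 64-byte boundary.
	 */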
1:	subcc		%o4, 1, %o4
	EX_LD(LOAD(ldub, %o1, %g1))
	EX_ST(STORE(stb, %g1, %o0))
	add		%o1, 1, %o1
	bne,pt		%XCC, 1b
	add		%o0, 1, %o0

	/* If the source is on a 16-byte boundary we can do
	 * the direct block copy loop.  If it is 8-byte aligned
	 * we can do the 16-byte loads offset by -8 bytes and the
	 * init stores offset by one register.
	 *
	 * If the source is not even 8-byte aligned, we need to do
	 * shifting and masking (basically integer faligndata).
	 *
	 * The tricky bit with init stores is that if we store to any
	 * part of a cache line we must store the whole cache line,
	 * otherwise we can end up with corrupt L2 cache line contents.
	 * Since the loop works on 64 bytes of 64-byte aligned store
	 * data at a time, this is easy to ensure.
	 */
2:
	andcc		%o1, (16 - 1), %o4
	andn		%o2, (64 - 1), %g1	! block copy loop iterator
	sub		%o2, %g1, %o2		! final sub-block copy bytes
	be,pt		%XCC, 50f
	 cmp		%o4, 8
	be,a,pt		%XCC, 10f
	 sub		%o1, 0x8, %o1

	/* Neither 8-byte nor 16-byte aligned, shift and mask.  */
	mov		%g1, %o4
	and		%o1, 0x7, %g1
	sll		%g1, 3, %g1
	mov		64, %o3
	andn		%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1, %g2))
	sub		%o3, %g1, %o3
	sllx		%g2, %g1, %g2

#define SWIVEL_ONE_DWORD(SRC, TMP1, TMP2, PRE_VAL, PRE_SHIFT, POST_SHIFT, DST)\
	EX_LD(LOAD(ldx, SRC, TMP1)); \
	srlx		TMP1, PRE_SHIFT, TMP2; \
	or		TMP2, PRE_VAL, TMP2; \
	EX_ST(STORE_INIT(TMP2, DST)); \
	sllx		TMP1, POST_SHIFT, PRE_VAL;
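	/* Each SWIVEL_ONE_DWORD loads the next aligned dword, merges its
	 * high bytes with the carried low bytes of the previous dword,
	 * init-stores the merged value, and carries its own low bytes
	 * forward.  Roughly, in C terms (illustrative sketch only, with
	 * p = src rounded down to an 8-byte boundary):
	 *
	 *	lshift = (src & 7) * 8;  rshift = 64 - lshift;
	 *	prev = *p++ << lshift;
	 *	loop:	cur = *p++;
	 *		*dst++ = prev | (cur >> rshift);
	 *		prev = cur << lshift;
	 */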

1:	add		%o1, 0x8, %o1
	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x00)
	add		%o1, 0x8, %o1
	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x08)
	add		%o1, 0x8, %o1
	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x10)
	add		%o1, 0x8, %o1
	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x18)
	add		%o1, 32, %o1
	LOAD(prefetch, %o1, #one_read)
	sub		%o1, 32 - 8, %o1
	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x20)
	add		%o1, 8, %o1
	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x28)
	add		%o1, 8, %o1
	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x30)
	add		%o1, 8, %o1
	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x38)
	subcc		%o4, 64, %o4
	bne,pt		%XCC, 1b
	 add		%o0, 64, %o0

#undef SWIVEL_ONE_DWORD

	srl		%g1, 3, %g1
	ba,pt		%XCC, 60f
	 add		%o1, %g1, %o1

10:	/* Destination is 64-byte aligned, source was only 8-byte
	 * aligned, so 8 has been subtracted from the source pointer
	 * and we perform one twin load ahead, then add the 8 back
	 * into the source when we finish the loop.
	 */
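	/* The twin load below therefore fetches the dword just before the
	 * original source along with the first real dword, which is why the
	 * init stores in the loop run one register behind the loads.
	 */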
	EX_LD(LOAD_TWIN(%o1, %o4, %o5))
1:	add		%o1, 16, %o1
	EX_LD(LOAD_TWIN(%o1, %g2, %g3))
	add		%o1, 16 + 32, %o1
	LOAD(prefetch, %o1, #one_read)
	sub		%o1, 32, %o1
	EX_ST(STORE_INIT(%o5, %o0 + 0x00))	! initializes cache line
	EX_ST(STORE_INIT(%g2, %o0 + 0x08))
	EX_LD(LOAD_TWIN(%o1, %o4, %o5))
	add		%o1, 16, %o1
	EX_ST(STORE_INIT(%g3, %o0 + 0x10))
	EX_ST(STORE_INIT(%o4, %o0 + 0x18))
	EX_LD(LOAD_TWIN(%o1, %g2, %g3))
	add		%o1, 16, %o1
	EX_ST(STORE_INIT(%o5, %o0 + 0x20))
	EX_ST(STORE_INIT(%g2, %o0 + 0x28))
	EX_LD(LOAD_TWIN(%o1, %o4, %o5))
	EX_ST(STORE_INIT(%g3, %o0 + 0x30))
	EX_ST(STORE_INIT(%o4, %o0 + 0x38))
	subcc		%g1, 64, %g1
	bne,pt		%XCC, 1b
	 add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	 add		%o1, 0x8, %o1

50:	/* Destination is 64-byte aligned, and source is 16-byte
	 * aligned.
	 */
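	/* Each pass of this loop does four twin loads and eight init
	 * stores, i.e. exactly one 64-byte cache line per iteration.
	 */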
1:	EX_LD(LOAD_TWIN(%o1, %o4, %o5))
	add	%o1, 16, %o1
	EX_LD(LOAD_TWIN(%o1, %g2, %g3))
	add	%o1, 16 + 32, %o1
	LOAD(prefetch, %o1, #one_read)
	sub	%o1, 32, %o1
	EX_ST(STORE_INIT(%o4, %o0 + 0x00))	! initializes cache line
	EX_ST(STORE_INIT(%o5, %o0 + 0x08))
	EX_LD(LOAD_TWIN(%o1, %o4, %o5))
	add	%o1, 16, %o1
	EX_ST(STORE_INIT(%g2, %o0 + 0x10))
	EX_ST(STORE_INIT(%g3, %o0 + 0x18))
	EX_LD(LOAD_TWIN(%o1, %g2, %g3))
	add	%o1, 16, %o1
	EX_ST(STORE_INIT(%o4, %o0 + 0x20))
	EX_ST(STORE_INIT(%o5, %o0 + 0x28))
	EX_ST(STORE_INIT(%g2, %o0 + 0x30))
	EX_ST(STORE_INIT(%g3, %o0 + 0x38))
	subcc	%g1, 64, %g1
	bne,pt	%XCC, 1b
	 add	%o0, 64, %o0
	/* fall through */

60:
	membar		#Sync
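	/* Make sure the block-init stores above have completed before we
	 * switch back to the normal ASI and do ordinary stores.
	 */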

	/* %o2 contains any final bytes still needed to be copied
	 * over. If anything is left, we copy it one byte at a time.
	 */
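	/* %o3 is set to (dst - src) below so the tail loops can address the
	 * destination as %o1 + %o3 while advancing only %o1.
	 */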
	RESTORE_ASI(%o3)
	brz,pt		%o2, 85f
	 sub		%o0, %o1, %o3
	ba,a,pt		%XCC, 90f

	.align		64
70: /* 16 <= len < 128 */
	bne,pn		%XCC, 75f
	 sub		%o0, %o1, %o3

72:
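	/* Both dst and src are 8-byte aligned here: copy 16 bytes per
	 * iteration with paired ldx/stx, then mop up an optional 8-byte
	 * and 4-byte tail before falling back to single bytes.
	 */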
	andn		%o2, 0xf, %o4
	and		%o2, 0xf, %o2
1:	subcc		%o4, 0x10, %o4
	EX_LD(LOAD(ldx, %o1, %o5))
	add		%o1, 0x08, %o1
	EX_LD(LOAD(ldx, %o1, %g1))
	sub		%o1, 0x08, %o1
	EX_ST(STORE(stx, %o5, %o1 + %o3))
	add		%o1, 0x8, %o1
	EX_ST(STORE(stx, %g1, %o1 + %o3))
	bgu,pt		%XCC, 1b
	 add		%o1, 0x8, %o1
73:	andcc		%o2, 0x8, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x8, %o2
	EX_LD(LOAD(ldx, %o1, %o5))
	EX_ST(STORE(stx, %o5, %o1 + %o3))
	add		%o1, 0x8, %o1
1:	andcc		%o2, 0x4, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x4, %o2
	EX_LD(LOAD(lduw, %o1, %o5))
	EX_ST(STORE(stw, %o5, %o1 + %o3))
	add		%o1, 0x4, %o1
1:	cmp		%o2, 0
	be,pt		%XCC, 85f
	 nop
	ba,pt		%xcc, 90f
	 nop

75:
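	/* dst and/or src is not 8-byte aligned.  Align dst to 8 bytes with
	 * a byte loop; if src then also ends up 8-byte aligned we can reuse
	 * the aligned loops above, otherwise fall into the shift-and-mask
	 * loop at 8f.
	 */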
	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f
	 sub		%g0, %g1, %g1
	sub		%o2, %g1, %o2

1:	subcc		%g1, 1, %g1
	EX_LD(LOAD(ldub, %o1, %o5))
	EX_ST(STORE(stb, %o5, %o1 + %o3))
	bgu,pt		%icc, 1b
	 add		%o1, 1, %o1

2:	add		%o1, %o3, %o0
	andcc		%o1, 0x7, %g1
	bne,pt		%icc, 8f
	 sll		%g1, 3, %g1

	cmp		%o2, 16
	bgeu,pt		%icc, 72b
	 nop
	ba,a,pt		%xcc, 73b

8:	mov		64, %o3
	andn		%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1, %g2))
	sub		%o3, %g1, %o3
	andn		%o2, 0x7, %o4
	sllx		%g2, %g1, %g2
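	/* Same shift-and-mask merge as the block copy case above, but with
	 * plain stx stores, 8 bytes per iteration.
	 */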
1:	add		%o1, 0x8, %o1
	EX_LD(LOAD(ldx, %o1, %g3))
	subcc		%o4, 0x8, %o4
	srlx		%g3, %o3, %o5
	or		%o5, %g2, %o5
	EX_ST(STORE(stx, %o5, %o0))
	add		%o0, 0x8, %o0
	bgu,pt		%icc, 1b
	 sllx		%g3, %g1, %g2

	srl		%g1, 3, %g1
	andcc		%o2, 0x7, %o2
	be,pn		%icc, 85f
	 add		%o1, %g1, %o1
	ba,pt		%xcc, 90f
	 sub		%o0, %o1, %o3

	.align		64
80: /* 0 < len < 16 */
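	/* %o3 here is (dst | src | len); only if all three are multiples
	 * of 4 can we use the word loop below, otherwise fall back to
	 * copying single bytes.
	 */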
	andcc		%o3, 0x3, %g0
	bne,pn		%XCC, 90f
	 sub		%o0, %o1, %o3

1:
	subcc		%o2, 4, %o2
	EX_LD(LOAD(lduw, %o1, %g1))
	EX_ST(STORE(stw, %g1, %o1 + %o3))
	bgu,pt		%XCC, 1b
	 add		%o1, 4, %o1

85:	retl
	 mov		EX_RETVAL(GLOBAL_SPARE), %o0

	.align		32
90:
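	/* Generic byte-at-a-time tail/fallback copy; %o3 = dst - src. */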
	subcc		%o2, 1, %o2
	EX_LD(LOAD(ldub, %o1, %g1))
	EX_ST(STORE(stb, %g1, %o1 + %o3))
	bgu,pt		%XCC, 90b
	 add		%o1, 1, %o1
	retl
	 mov		EX_RETVAL(GLOBAL_SPARE), %o0

	.size		FUNC_NAME, .-FUNC_NAME