/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)
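
/*
 * COPY_16_BYTES copies one 16-byte chunk per expansion.  r4 and r6 are
 * kept pointing 4 bytes below the next source and destination addresses;
 * the final lwzu/stwu advance both pointers by 16 so that consecutive
 * expansions walk through memory.  A rough C sketch of one expansion
 * (illustrative only, not part of the original code):
 *
 *	u32 *s = src_minus_4, *d = dst_minus_4;
 *	d[1] = s[1]; d[2] = s[2]; d[3] = s[3]; d[4] = s[4];
 *	s += 4; d += 4;
 */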

#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)

#define COPY_16_BYTES_EXCODE(n)			\
9 ## n ## 0:					\
	addi	r5,r5,-(16 * n);		\
	b	104f;				\
9 ## n ## 1:					\
	addi	r5,r5,-(16 * n);		\
	b	105f;				\
.section __ex_table,"a";			\
	.align	2;				\
	.long	8 ## n ## 0b,9 ## n ## 0b;	\
	.long	8 ## n ## 1b,9 ## n ## 0b;	\
	.long	8 ## n ## 2b,9 ## n ## 0b;	\
	.long	8 ## n ## 3b,9 ## n ## 0b;	\
	.long	8 ## n ## 4b,9 ## n ## 1b;	\
	.long	8 ## n ## 5b,9 ## n ## 1b;	\
	.long	8 ## n ## 6b,9 ## n ## 1b;	\
	.long	8 ## n ## 7b,9 ## n ## 1b;	\
	.text
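
/*
 * COPY_16_BYTES_WITHEX(n) is COPY_16_BYTES with local labels 8n0-8n7 on
 * each load and store so that faults can be fixed up.  The matching
 * COPY_16_BYTES_EXCODE(n) emits the fixup code and the __ex_table
 * entries: a fault on one of the loads (8n0-8n3) is redirected to 9n0,
 * which branches to the read-fault handler at 104f, and a fault on one
 * of the stores (8n4-8n7) is redirected to 9n1, which branches to the
 * write-fault handler at 105f.  Both fixups first subtract from r5 the
 * 16*n bytes that earlier expansions of the same cache line have already
 * copied, since CTR still counts the whole current line as uncopied.
 */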

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"copy32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

/*
 * Use dcbz on the complete cache lines in the destination
 * to set them to zero.  This requires that the destination
 * area is cacheable.  -- paulus
 */
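/*
 * cacheable_memzero(p, n) behaves roughly like memset(p, 0, n), but clears
 * complete cache lines with dcbz (except on 8xx), so it must only be used
 * on cacheable memory.  A hedged C-level sketch of the idea, where dcbz()
 * stands in for the dcbz instruction (names illustrative only):
 *
 *	char *q = p;
 *	while (n && ((unsigned long)q & (L1_CACHE_BYTES - 1))) {
 *		*q++ = 0; n--;
 *	}
 *	while (n >= L1_CACHE_BYTES) {
 *		dcbz(q); q += L1_CACHE_BYTES; n -= L1_CACHE_BYTES;
 *	}
 *	while (n--)
 *		*q++ = 0;
 */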
_GLOBAL(cacheable_memzero)
	mr	r5,r4
	li	r4,0
	addi	r6,r3,-4
	cmplwi	0,r5,4
	blt	7f
	stwu	r4,4(r6)
	beqlr
	andi.	r0,r6,3
	add	r5,r0,r5
	subf	r6,r0,r6
	clrlwi	r7,r6,32-LG_CACHELINE_BYTES
	add	r8,r7,r5
	srwi	r9,r8,LG_CACHELINE_BYTES
	addic.	r9,r9,-1	/* total number of complete cachelines */
	ble	2f
	xori	r0,r7,CACHELINE_MASK & ~3
	srwi.	r0,r0,2
	beq	3f
	mtctr	r0
4:	stwu	r4,4(r6)
	bdnz	4b
3:	mtctr	r9
	li	r7,4
#if !defined(CONFIG_8xx)
10:	dcbz	r7,r6
#else
10:	stw	r4, 4(r6)
	stw	r4, 8(r6)
	stw	r4, 12(r6)
	stw	r4, 16(r6)
#if CACHE_LINE_SIZE >= 32
	stw	r4, 20(r6)
	stw	r4, 24(r6)
	stw	r4, 28(r6)
	stw	r4, 32(r6)
#endif /* CACHE_LINE_SIZE */
#endif
	addi	r6,r6,CACHELINE_BYTES
	bdnz	10b
	clrlwi	r5,r8,32-LG_CACHELINE_BYTES
	addi	r5,r5,4
2:	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr

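/*
 * Standard memset(p, c, n).  The two rlwimi instructions replicate the
 * low byte of c into all four bytes of r4 so the main loop can fill with
 * word stores; the tail is finished with byte stores.  A hedged C sketch
 * of the replication step:
 *
 *	unsigned long v = c & 0xff;
 *	v |= v << 8;
 *	v |= v << 16;
 */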
_GLOBAL(memset)
	rlwimi	r4,r4,8,16,23
	rlwimi	r4,r4,16,0,15
	addi	r6,r3,-4
	cmplwi	0,r5,4
	blt	7f
	stwu	r4,4(r6)
	beqlr
	andi.	r0,r6,3
	add	r5,r0,r5
	subf	r6,r0,r6
	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr

/*
 * This version uses dcbz on the complete cache lines in the
 * destination area to reduce memory traffic.  This requires that
 * the destination area is cacheable.
 * We only use this version if the source and dest don't overlap.
 * -- paulus.
 */
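/*
 * The overlap test below falls back to plain memcpy whenever
 * [src, src + n) and [dst, dst + n) intersect.  A hedged C-level sketch
 * of that check (illustrative only):
 *
 *	if (src < dst + n && dst < src + n)
 *		return memcpy(dst, src, n);
 */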
_GLOBAL(cacheable_memcpy)
	add	r7,r3,r5		/* test if the src & dst overlap */
	add	r8,r4,r5
	cmplw	0,r4,r7
	cmplw	1,r3,r8
	crand	0,0,4			/* cr0.lt &= cr1.lt */
	blt	memcpy			/* if regions overlap */

	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	subf	r5,r0,r5
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	mtctr	r0
	beq	63f
53:
#if !defined(CONFIG_8xx)
	dcbz	r11,r6
#endif
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
#endif
#endif
#endif
	bdnz	53b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	blr

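/*
 * memmove() picks the copy direction so that overlapping regions are
 * handled correctly: if the destination starts above the source it
 * copies backwards, otherwise it falls through into memcpy and copies
 * forwards.  A hedged C-level sketch of the dispatch (illustrative only):
 *
 *	void *memmove(void *dst, const void *src, size_t n)
 *	{
 *		if (dst > src)
 *			return backwards_memcpy(dst, src, n);
 *		return memcpy(dst, src, n);
 *	}
 */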
_GLOBAL(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy
	/* fall through */

_GLOBAL(memcpy)
	srwi.	r7,r5,3
	addi	r6,r3,-4
	addi	r4,r4,-4
	beq	2f			/* if less than 8 bytes to do */
	andi.	r0,r6,3			/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3
	addi	r6,r6,3
4:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	4b
	blr
5:	subfic	r0,r0,4
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

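/*
 * backwards_memcpy copies from the end of both regions downwards, which
 * is what memmove needs when the destination overlaps the source at a
 * higher address.  The structure mirrors memcpy: a few single bytes until
 * the destination end is word aligned, then 8 bytes per iteration, then
 * one remaining word and the trailing bytes.
 */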
_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	add	r6,r3,r5
	add	r4,r4,r5
	beq	2f
	andi.	r0,r6,3
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)
	stbu	r0,-1(r6)
	bdnz	4b
	blr
5:	mtctr	r0
6:	lbzu	r7,-1(r4)
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

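/*
 * __copy_tofrom_user(to, from, n) copies n bytes to or from user space
 * and returns the number of bytes that could NOT be copied (0 on complete
 * success).  Every access that may fault is paired, through __ex_table,
 * with fixup code further down which computes the residual count; on a
 * read fault the copy is retried byte by byte and any remainder of the
 * destination is cleared.  A hedged sketch of the usual caller pattern
 * (illustrative only):
 *
 *	if (__copy_tofrom_user(dst, src, len) != 0)
 *		return -EFAULT;
 */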
_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	.section __ex_table,"a"
	.align	2
	.long	70b,100f
	.long	71b,101f
	.long	72b,102f
	.long	73b,103f
	.text

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

#ifdef CONFIG_8xx
	/* Don't use prefetch on 8xx */
	mtctr	r0
	li	r0,0
53:	COPY_16_BYTES_WITHEX(0)
	bdnz	53b

#else /* not CONFIG_8xx */
	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */
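
	/* At this point r7 is the prefetch distance in cache lines and r3
	   the byte offset from r4 of the line to touch next; the copy loop
	   below prefetches the source line r7 lines ahead of the one it is
	   copying. */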

114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
	.section __ex_table,"a"
	.align	2
	.long	54b,105f
	.text
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b
#endif /* CONFIG_8xx */

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0
	blr

/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * 104f (if in read part) or 105f (if in write part), after updating r5
 */
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
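/*
 * For example, after a fault in the final word loop r3 = 2, so each
 * remaining CTR iteration accounts for 4 (1 << 2) uncopied bytes, plus
 * the 0-3 trailing bytes left in r5; after a fault in the cacheline loop
 * r3 = LG_CACHELINE_BYTES, so the shifted count is in whole cache lines.
 */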
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5
	beq	120f			/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/* then clear out the destination: r3 bytes starting at 4(r6) */
132:	mfctr	r3
	srwi.	r0,r3,2
	li	r9,0
	mtctr	r0
	beq	113f
112:	stwu	r9,4(r6)
	bdnz	112b
113:	andi.	r0,r3,3
	mtctr	r0
	beq	120f
114:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	114b
120:	blr

	.section __ex_table,"a"
	.align	2
	.long	30b,108b
	.long	31b,109b
	.long	40b,110b
	.long	41b,111b
	.long	130b,132b
	.long	131b,120b
	.long	112b,120b
	.long	114b,120b
	.text