/*
 *    Optimized memory copy routines.
 *
 *    Copyright (C) 2004 Randolph Chung <tausq@debian.org>
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2, or (at your option)
 *    any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *    Portions derived from the GNU C Library
 *    Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc.
 *
 * Several strategies are tried to get the best performance for various
 * conditions. In the optimal case, we copy 64 bytes in an unrolled loop
 * using fp regs. This is followed by loops that copy 32 or 16 bytes at a
 * time using general registers.  Unaligned copies are handled either by
 * aligning the destination and then using a shift-and-write method, or in
 * a few cases by falling back to a byte-at-a-time copy.
 *
 * I chose to implement this in C because it is easier to maintain and debug,
 * and in my experiments it appears that the C code generated by gcc (3.3/3.4
 * at the time of writing) is fairly optimal. Unfortunately some of the
 * semantics of the copy routine (exception handling) are difficult to express
 * in C, so we have to play some tricks to get it to work.
 *
 * All the loads and stores are done via explicit asm() code in order to use
 * the right space registers.
 *
 * Testing with various alignments and buffer sizes shows that this code is
 * often >10x faster than a simple byte-at-a-time copy, even for strangely
 * aligned operands. It is interesting to note that the glibc version of
 * memcpy (written in C) is already quite fast. This routine is able to beat
 * it by 30-40% for aligned copies because of the loop unrolling, but in some
 * cases the glibc version is still slightly faster. This lends more
 * credibility to the claim that gcc can generate very good code as long as
 * we are careful.
 *
 * TODO:
 * - cache prefetching needs more experimentation to get optimal settings
 * - try not to use the post-increment address modifiers; they create
 *   additional interlocks
 * - replace byte-copy loops with stbys sequences
 */

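/*
 * A worked example of the shift-and-write method (a sketch with made-up
 * addresses, not code from this file): suppose dst is word-aligned but
 * src == 0x1001, one byte past a word boundary.  We round src down to
 * 0x1000 and read only aligned words w0, w1, w2, ... from there.  Since
 * PA-RISC is big-endian, each destination word is the tail of one input
 * word merged with the head of the next, here with sh_1 = 8, sh_2 = 24:
 *
 *	((unsigned int *)dst)[0] = (w0 << 8) | (w1 >> 24);
 *	((unsigned int *)dst)[1] = (w1 << 8) | (w2 >> 24);
 *
 * Every byte is thus loaded and stored exactly once, using only aligned
 * word-sized memory operations.  copy_dstaligned() below implements
 * exactly this, with the shifting done by the MERGE() macro.
 */
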
#ifdef __KERNEL__
#include <linux/module.h>
#include <linux/compiler.h>
#include <asm/uaccess.h>
#define s_space "%%sr1"
#define d_space "%%sr2"
#else
#include "memcpy.h"
#define s_space "%%sr0"
#define d_space "%%sr0"
#define pa_memcpy new2_copy
#endif

DECLARE_PER_CPU(struct exception_data, exception_data);

#define preserve_branch(label)	do {					\
	volatile int dummy;						\
	/* The following branch is never taken, it's just here to  */	\
	/* prevent gcc from optimizing away our exception code. */	\
	if (unlikely(dummy != dummy))					\
		goto label;						\
} while (0)

#define get_user_space() (segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3))
#define get_kernel_space() (0)

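/*
 * pa_memcpy() issues all loads through s_space (%sr1 in the kernel) and
 * all stores through d_space (%sr2).  The wrappers at the bottom of this
 * file load those space registers before calling it, e.g. for a
 * user-to-kernel copy:
 *
 *	mtsp(get_user_space(), 1);	(source space: user)
 *	mtsp(get_kernel_space(), 2);	(destination space: kernel)
 *
 * which is how one routine serves kernel-to-kernel, user-to-kernel and
 * kernel-to-user copies.  get_user_space() reads the current user space
 * id from %sr3, except when the fs segment is KERNEL_DS, in which case
 * space 0 (the kernel) is used instead.
 */
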
#define MERGE(w0, sh_1, w1, sh_2)  ({					\
	unsigned int _r;						\
	asm volatile (							\
	"mtsar %3\n"							\
	"shrpw %1, %2, %%sar, %0\n"					\
	: "=r"(_r)							\
	: "r"(w0), "r"(w1), "r"(sh_2)					\
	);								\
	_r;								\
})
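
/*
 * MERGE() uses shrpw (shift right pair word): with sh_2 in %sar it
 * funnel-shifts the 64-bit concatenation w0:w1 right by sh_2 bits and
 * keeps the low word.  For the only way it is invoked here, with
 * 0 < sh_1 < 32 and sh_1 == 32 - sh_2, this equals the plain C
 * expression
 *
 *	(w0 << sh_1) | (w1 >> sh_2)
 *
 * The sh_1 argument is kept only for symmetry with the glibc code this
 * derives from; the asm itself consumes just w0, w1 and sh_2.
 */
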
#define THRESHOLD	16

#ifdef DEBUG_MEMCPY
#define DPRINTF(fmt, args...) do { printk(KERN_DEBUG "%s:%d:%s ", __FILE__, __LINE__, __FUNCTION__ ); printk(KERN_DEBUG fmt, ##args ); } while (0)
#else
#define DPRINTF(fmt, args...)
#endif

#define def_load_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn ",ma " #_sz "(" _s ",%1), %0\n\t"	\
	ASM_EXCEPTIONTABLE_ENTRY(1b,_e)			\
	: _tt(_t), "+r"(_a)				\
	:						\
	: "r8")

#define def_store_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn ",ma %1, " #_sz "(" _s ",%0)\n\t"	\
	ASM_EXCEPTIONTABLE_ENTRY(1b,_e)			\
	: "+r"(_a)					\
	: _tt(_t)					\
	: "r8")

#define ldbma(_s, _a, _t, _e) def_load_ai_insn(ldbs,1,"=r",_s,_a,_t,_e)
#define stbma(_s, _t, _a, _e) def_store_ai_insn(stbs,1,"r",_s,_a,_t,_e)
#define ldwma(_s, _a, _t, _e) def_load_ai_insn(ldw,4,"=r",_s,_a,_t,_e)
#define stwma(_s, _t, _a, _e) def_store_ai_insn(stw,4,"r",_s,_a,_t,_e)
#define flddma(_s, _a, _t, _e) def_load_ai_insn(fldd,8,"=f",_s,_a,_t,_e)
#define fstdma(_s, _t, _a, _e) def_store_ai_insn(fstd,8,"f",_s,_a,_t,_e)
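
/*
 * As a concrete example (a sketch; the real constraint strings are in
 * the macros above), ldwma(s_space, pws, r1, pmc_load_exc) emits
 * roughly
 *
 *	1: ldw,ma 4(%sr1,pws), r1
 *	ASM_EXCEPTIONTABLE_ENTRY(1b, pmc_load_exc)
 *
 * i.e. a word load through %sr1 that post-increments pws by 4, plus an
 * __ex_table entry so that a fault at label 1 resumes at the local
 * pmc_load_exc fixup code in pa_memcpy() instead of oopsing.  The "r8"
 * clobber keeps gcc from carrying a live value in %r8 across the
 * potentially-faulting instruction, since the fixup path may scratch it.
 */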

#define def_load_insn(_insn,_tt,_s,_o,_a,_t,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn " " #_o "(" _s ",%1), %0\n\t"	\
	ASM_EXCEPTIONTABLE_ENTRY(1b,_e)			\
	: _tt(_t)					\
	: "r"(_a)					\
	: "r8")

#define def_store_insn(_insn,_tt,_s,_t,_o,_a,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn " %0, " #_o "(" _s ",%1)\n\t"	\
	ASM_EXCEPTIONTABLE_ENTRY(1b,_e)			\
	:						\
	: _tt(_t), "r"(_a)				\
	: "r8")

#define ldw(_s,_o,_a,_t,_e)	def_load_insn(ldw,"=r",_s,_o,_a,_t,_e)
#define stw(_s,_t,_o,_a,_e)	def_store_insn(stw,"r",_s,_t,_o,_a,_e)

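/* A load whose target is %r0 discards the fetched value but still pulls
 * the line into the cache, so each of these is a one-instruction,
 * non-faulting prefetch hint (PA2.0 defines loads to GR0 as prefetches).
 * As with the kernel's prefetch()/prefetchw(), ldw hints a read of the
 * source line and ldd hints a write to the destination line.
 */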
#ifdef CONFIG_PREFETCH
static inline void prefetch_src(const void *addr)
{
	__asm__("ldw 0(" s_space ",%0), %%r0" : : "r" (addr));
}

static inline void prefetch_dst(const void *addr)
{
	__asm__("ldd 0(" d_space ",%0), %%r0" : : "r" (addr));
}
#else
#define prefetch_src(addr) do { } while(0)
#define prefetch_dst(addr) do { } while(0)
#endif

/* Copy from an unaligned src to a word-aligned dst, using shifts to merge
 * the source words. Handles 4 words per loop iteration.  This code is
 * derived from glibc.
 */
static inline unsigned long copy_dstaligned(unsigned long dst,
					unsigned long src, unsigned long len,
					unsigned long o_dst, unsigned long o_src,
					unsigned long o_len)
{
	/* gcc complains that a2 and a3 may be uninitialized, but actually
	 * they cannot be.  Initialize a2/a3 to shut gcc up.
	 */
	register unsigned int a0, a1, a2 = 0, a3 = 0;
	int sh_1, sh_2;
	struct exception_data *d;

	/* prefetch_src((const void *)src); */

	/* Calculate how far to shift a word read at the word-aligned src
	   to make it aligned for the copy.  */
	sh_1 = 8 * (src % sizeof(unsigned int));
	sh_2 = 8 * sizeof(unsigned int) - sh_1;

	/* Make src aligned by rounding it down.  */
	src &= -sizeof(unsigned int);

	switch (len % 4)
	{
		case 2:
			/* a1 = ((unsigned int *) src)[0];
			   a2 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a1, cda_ldw_exc);
			ldw(s_space, 4, src, a2, cda_ldw_exc);
			src -= 1 * sizeof(unsigned int);
			dst -= 3 * sizeof(unsigned int);
			len += 2;
			goto do1;
		case 3:
			/* a0 = ((unsigned int *) src)[0];
			   a1 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a0, cda_ldw_exc);
			ldw(s_space, 4, src, a1, cda_ldw_exc);
			src -= 0 * sizeof(unsigned int);
			dst -= 2 * sizeof(unsigned int);
			len += 1;
			goto do2;
		case 0:
			if (len == 0)
				return 0;
			/* a3 = ((unsigned int *) src)[0];
			   a0 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a3, cda_ldw_exc);
			ldw(s_space, 4, src, a0, cda_ldw_exc);
			src -= -1 * sizeof(unsigned int);
			dst -= 1 * sizeof(unsigned int);
			len += 0;
			goto do3;
		case 1:
			/* a2 = ((unsigned int *) src)[0];
			   a3 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a2, cda_ldw_exc);
			ldw(s_space, 4, src, a3, cda_ldw_exc);
			src -= -2 * sizeof(unsigned int);
			dst -= 0 * sizeof(unsigned int);
			len -= 1;
			if (len == 0)
				goto do0;
			goto do4;			/* No-op.  */
	}
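
	/*
	 * Each case above preloads one or two source words into the
	 * rotating registers a0..a3 and biases src, dst and len so that
	 * the chosen loop entry point finds everything at the fixed
	 * offsets it expects.  For example, with len % 4 == 2 we enter at
	 * do1 holding a1/a2, with src backed up one word and dst backed
	 * up three: the 12(dst) store in do1 therefore writes the first
	 * destination word, and the loop then continues as if it had
	 * been running all along.
	 */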

	do
	{
		/* prefetch_src((const void *)(src + 4 * sizeof(unsigned int))); */
do4:
		/* a0 = ((unsigned int *) src)[0]; */
		ldw(s_space, 0, src, a0, cda_ldw_exc);
		/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
		stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);
do3:
		/* a1 = ((unsigned int *) src)[1]; */
		ldw(s_space, 4, src, a1, cda_ldw_exc);
		/* ((unsigned int *) dst)[1] = MERGE (a3, sh_1, a0, sh_2); */
		stw(d_space, MERGE (a3, sh_1, a0, sh_2), 4, dst, cda_stw_exc);
do2:
		/* a2 = ((unsigned int *) src)[2]; */
		ldw(s_space, 8, src, a2, cda_ldw_exc);
		/* ((unsigned int *) dst)[2] = MERGE (a0, sh_1, a1, sh_2); */
		stw(d_space, MERGE (a0, sh_1, a1, sh_2), 8, dst, cda_stw_exc);
do1:
		/* a3 = ((unsigned int *) src)[3]; */
		ldw(s_space, 12, src, a3, cda_ldw_exc);
		/* ((unsigned int *) dst)[3] = MERGE (a1, sh_1, a2, sh_2); */
		stw(d_space, MERGE (a1, sh_1, a2, sh_2), 12, dst, cda_stw_exc);

		src += 4 * sizeof(unsigned int);
		dst += 4 * sizeof(unsigned int);
		len -= 4;
	}
	while (len != 0);

do0:
	/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
	stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);

	preserve_branch(handle_load_error);
	preserve_branch(handle_store_error);

	return 0;

handle_load_error:
	__asm__ __volatile__ ("cda_ldw_exc:\n");
	d = &__get_cpu_var(exception_data);
	DPRINTF("cda_ldw_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu\n",
		o_len, d->fault_addr, o_src, o_len - d->fault_addr + o_src);
	return o_len - d->fault_addr + o_src;

handle_store_error:
	__asm__ __volatile__ ("cda_stw_exc:\n");
	d = &__get_cpu_var(exception_data);
	DPRINTF("cda_stw_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu\n",
		o_len, d->fault_addr, o_dst, o_len - d->fault_addr + o_dst);
	return o_len - d->fault_addr + o_dst;
}


/* Returns 0 for success; otherwise returns the number of bytes not
 * transferred. */
unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len)
{
	register unsigned long src, dst, t1, t2, t3;
	register unsigned char *pcs, *pcd;
	register unsigned int *pws, *pwd;
	register double *pds, *pdd;
	unsigned long ret = 0;
	unsigned long o_dst, o_src, o_len;
	struct exception_data *d;

	src = (unsigned long)srcp;
	dst = (unsigned long)dstp;
	pcs = (unsigned char *)srcp;
	pcd = (unsigned char *)dstp;

	o_dst = dst; o_src = src; o_len = len;

	/* prefetch_src((const void *)srcp); */

	if (len < THRESHOLD)
		goto byte_copy;

	/* Check alignment */
	t1 = (src ^ dst);
	if (unlikely(t1 & (sizeof(double)-1)))
		goto unaligned_copy;

	/* src and dst have same alignment. */

	/* Copy bytes till we are double-aligned. */
	t2 = src & (sizeof(double) - 1);
	if (unlikely(t2 != 0)) {
		t2 = sizeof(double) - t2;
		while (t2 && len) {
			/* *pcd++ = *pcs++; */
			ldbma(s_space, pcs, t3, pmc_load_exc);
			len--;
			stbma(d_space, t3, pcd, pmc_store_exc);
			t2--;
		}
	}

	pds = (double *)pcs;
	pdd = (double *)pcd;
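
	/* This is where the unrolled 64-byte fp-register loop mentioned in
	 * the header comment would consume pds/pdd via the flddma()/
	 * fstdma() macros above.  It is not enabled in this version, so we
	 * fall straight through to the word-at-a-time loops below. */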

	pws = (unsigned int *)pds;
	pwd = (unsigned int *)pdd;

word_copy:
	while (len >= 8*sizeof(unsigned int)) {
		register unsigned int r1, r2, r3, r4, r5, r6, r7, r8;
		/* prefetch_src((char *)pws + L1_CACHE_BYTES); */
		ldwma(s_space, pws, r1, pmc_load_exc);
		ldwma(s_space, pws, r2, pmc_load_exc);
		ldwma(s_space, pws, r3, pmc_load_exc);
		ldwma(s_space, pws, r4, pmc_load_exc);
		stwma(d_space, r1, pwd, pmc_store_exc);
		stwma(d_space, r2, pwd, pmc_store_exc);
		stwma(d_space, r3, pwd, pmc_store_exc);
		stwma(d_space, r4, pwd, pmc_store_exc);

		ldwma(s_space, pws, r5, pmc_load_exc);
		ldwma(s_space, pws, r6, pmc_load_exc);
		ldwma(s_space, pws, r7, pmc_load_exc);
		ldwma(s_space, pws, r8, pmc_load_exc);
		stwma(d_space, r5, pwd, pmc_store_exc);
		stwma(d_space, r6, pwd, pmc_store_exc);
		stwma(d_space, r7, pwd, pmc_store_exc);
		stwma(d_space, r8, pwd, pmc_store_exc);
		len -= 8*sizeof(unsigned int);
	}

	while (len >= 4*sizeof(unsigned int)) {
		register unsigned int r1, r2, r3, r4;
		ldwma(s_space, pws, r1, pmc_load_exc);
		ldwma(s_space, pws, r2, pmc_load_exc);
		ldwma(s_space, pws, r3, pmc_load_exc);
		ldwma(s_space, pws, r4, pmc_load_exc);
		stwma(d_space, r1, pwd, pmc_store_exc);
		stwma(d_space, r2, pwd, pmc_store_exc);
		stwma(d_space, r3, pwd, pmc_store_exc);
		stwma(d_space, r4, pwd, pmc_store_exc);
		len -= 4*sizeof(unsigned int);
	}

	pcs = (unsigned char *)pws;
	pcd = (unsigned char *)pwd;

byte_copy:
	while (len) {
		/* *pcd++ = *pcs++; */
		ldbma(s_space, pcs, t3, pmc_load_exc);
		stbma(d_space, t3, pcd, pmc_store_exc);
		len--;
	}

	return 0;

unaligned_copy:
	/* possibly we are aligned on a word, but not on a double... */
	if (likely((t1 & (sizeof(unsigned int)-1)) == 0)) {
		t2 = src & (sizeof(unsigned int) - 1);

		if (unlikely(t2 != 0)) {
			t2 = sizeof(unsigned int) - t2;
			while (t2) {
				/* *pcd++ = *pcs++; */
				ldbma(s_space, pcs, t3, pmc_load_exc);
				stbma(d_space, t3, pcd, pmc_store_exc);
				len--;
				t2--;
			}
		}

		pws = (unsigned int *)pcs;
		pwd = (unsigned int *)pcd;
		goto word_copy;
	}

	/* Align the destination.  */
	if (unlikely((dst & (sizeof(unsigned int) - 1)) != 0)) {
		t2 = sizeof(unsigned int) - (dst & (sizeof(unsigned int) - 1));
		while (t2) {
			/* *pcd++ = *pcs++; */
			ldbma(s_space, pcs, t3, pmc_load_exc);
			stbma(d_space, t3, pcd, pmc_store_exc);
			len--;
			t2--;
		}
		dst = (unsigned long)pcd;
		src = (unsigned long)pcs;
	}

	ret = copy_dstaligned(dst, src, len / sizeof(unsigned int),
		o_dst, o_src, o_len);
	if (ret)
		return ret;

	pcs += (len & -sizeof(unsigned int));
	pcd += (len & -sizeof(unsigned int));
	len %= sizeof(unsigned int);

	preserve_branch(handle_load_error);
	preserve_branch(handle_store_error);

	goto byte_copy;

handle_load_error:
	__asm__ __volatile__ ("pmc_load_exc:\n");
	d = &__get_cpu_var(exception_data);
	DPRINTF("pmc_load_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu\n",
		o_len, d->fault_addr, o_src, o_len - d->fault_addr + o_src);
	return o_len - d->fault_addr + o_src;

handle_store_error:
	__asm__ __volatile__ ("pmc_store_exc:\n");
	d = &__get_cpu_var(exception_data);
	DPRINTF("pmc_store_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu\n",
		o_len, d->fault_addr, o_dst, o_len - d->fault_addr + o_dst);
	return o_len - d->fault_addr + o_dst;
}

#ifdef __KERNEL__
unsigned long copy_to_user(void __user *dst, const void *src, unsigned long len)
{
	mtsp(get_kernel_space(), 1);
	mtsp(get_user_space(), 2);
	return pa_memcpy((void __force *)dst, src, len);
}

unsigned long copy_from_user(void *dst, const void __user *src, unsigned long len)
{
	mtsp(get_user_space(), 1);
	mtsp(get_kernel_space(), 2);
	return pa_memcpy(dst, (void __force *)src, len);
}

unsigned long copy_in_user(void __user *dst, const void __user *src, unsigned long len)
{
	mtsp(get_user_space(), 1);
	mtsp(get_user_space(), 2);
	return pa_memcpy((void __force *)dst, (void __force *)src, len);
}

void *memcpy(void *dst, const void *src, size_t count)
{
	mtsp(get_kernel_space(), 1);
	mtsp(get_kernel_space(), 2);
	pa_memcpy(dst, src, count);
	return dst;
}

EXPORT_SYMBOL(copy_to_user);
EXPORT_SYMBOL(copy_from_user);
EXPORT_SYMBOL(copy_in_user);
EXPORT_SYMBOL(memcpy);
#endif