/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */

/*
 * Hack to resolve longstanding prefetch issue
 *
 * Prefetching may be fatal on some systems if we're prefetching beyond the
 * end of memory.  It's also a seriously bad idea on non-dma-coherent
 * systems.
 */
#if !defined(CONFIG_DMA_COHERENT) || !defined(CONFIG_DMA_IP27)
#undef CONFIG_CPU_HAS_PREFETCH
#endif
#ifdef CONFIG_MIPS_MALTA
#undef CONFIG_CPU_HAS_PREFETCH
#endif

#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * include/asm-mips/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */
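
/*
 * As a rough, illustrative sketch (not taken from this file), the plain
 * memcpy half of the contract corresponds to the following C; the
 * __copy_user variants additionally return the number of uncopied bytes
 * when a load or store faults, which plain C cannot express:
 *
 *	#include <stddef.h>
 *
 *	void *memcpy(void *dst, const void *src, size_t len)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *
 *		while (len--)
 *			*d++ = *s++;
 *		return dst;		// v0 = dst, as stated above
 *	}
 */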

/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry).
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */
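
/*
 * Taken together, these invariants are what let the load fixup code below
 * (l_exc) bound the uncopied bytes with a single subtraction; roughly:
 *
 *	uncopied <= AT - first_faulting_src_address
 *
 * since AT is one byte past the end of the source, and everything below the
 * faulting address has already been (or is re-) copied by l_exc_copy.
 */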

#define EXC(inst_reg,addr,handler)		\
9:	inst_reg, addr;				\
	.section __ex_table,"a";		\
	PTR	9b, handler;			\
	.previous
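
/*
 * As an illustration, EXC(LOAD t0, UNIT(0)(src), l_exc) on a 64-bit kernel
 * expands to roughly:
 *
 *	9:	ld	t0, 0(src);
 *		.section __ex_table,"a";
 *		PTR	9b, l_exc;
 *		.previous
 *
 * i.e. the access itself plus an exception table entry telling the fault
 * handler to resume at l_exc if the access at label 9 takes an exception.
 */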

/*
 * Only on a 64-bit kernel can we make use of 64-bit registers.
 */
#ifdef CONFIG_64BIT
#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD   ld
#define LOADL  ldl
#define LOADR  ldr
#define STOREL sdl
#define STORER sdr
#define STORE  sd
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SRA    dsra
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

/*
 * As we share the code base with the mips32 tree (which uses the o32 ABI
 * register definitions), we need to redefine the registers from the n64
 * ABI naming to the o32 ABI naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

#else

#define LOAD   lw
#define LOADL  lwl
#define LOADR  lwr
#define STOREL swl
#define STORER swr
#define STORE  sw
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SRA    sra
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST  LOADL
#define STFIRST STORER
#define STREST  STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST LOADL
#define LDREST  LOADR
#define STFIRST STOREL
#define STREST  STORER
#define SHIFT_DISCARD SRLV
#endif
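
/*
 * The LDFIRST/LDREST (and STFIRST/STREST) pairs implement misaligned
 * accesses with the partial-word instructions.  As a sketch, loading one
 * misaligned 32-bit unit from src on a big-endian kernel becomes:
 *
 *	lwl	t0, 0(src)	# bytes from src up to the next word boundary
 *	lwr	t0, 3(src)	# the remaining low-order bytes
 *
 * On a little-endian kernel the roles of the "left" and "right" forms are
 * swapped, which is all this #ifdef block selects.
 */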

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)
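
/*
 * For example, with USE_DOUBLE (NBYTES == 8): FIRST(1) == 8, REST(1) == 15
 * and ADDRMASK == 7, so LDFIRST/LDREST of unit 1 touch offsets 8 and 15
 * from src, the first and last byte of that (possibly misaligned) unit.
 */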

	.text
	.set	noreorder
	.set	noat

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
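
/*
 * An informal sketch of the control flow below, in C-like pseudo code
 * (labels refer to the code that follows):
 *
 *	if (len < NBYTES)
 *		goto copy_bytes_checklen;	// byte-at-a-time tail copy
 *	if (dst & ADDRMASK)
 *		goto dst_unaligned;		// copy a partial unit to align dst
 *	if (src & ADDRMASK)
 *		goto src_unaligned_dst_aligned;	// LDFIRST/LDREST loads, aligned stores
 *	// both aligned: 8 units per iteration, then 4, then 1, then the
 *	// trailing partial unit via SHIFT_DISCARD + STREST
 */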
	.align	5
LEAF(__copy_user_inatomic)
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
#define rem t8

	/*
	 * The "issue break"s below are very approximate.
	 * Issue delays for dcache fills will perturb the schedule, as will
	 * load queue full replay traps, etc.
	 *
	 * If len < NBYTES use byte operations.
	 */
	PREF(	0, 0(src) )
	PREF(	1, 0(dst) )
	sltu	t2, len, NBYTES
	and	t1, dst, ADDRMASK
	PREF(	0, 1*32(src) )
	PREF(	1, 1*32(dst) )
	bnez	t2, copy_bytes_checklen
	 and	t0, src, ADDRMASK
	PREF(	0, 2*32(src) )
	PREF(	1, 2*32(dst) )
	bnez	t1, dst_unaligned
	 nop
	bnez	t0, src_unaligned_dst_aligned
	/*
	 * use delay slot for fall-through
	 * src and dst are aligned; need to compute rem
	 */
both_aligned:
	 SRL	t0, len, LOG_NBYTES+3    # +3 for 8 units/iter
	beqz	t0, cleanup_both_aligned # len < 8*NBYTES
	 and	rem, len, (8*NBYTES-1)	 # rem = len % (8*NBYTES)
	PREF(	0, 3*32(src) )
	PREF(	1, 3*32(dst) )
	.align	4
1:
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 8*NBYTES
EXC(	LOAD	t4, UNIT(4)(src),	l_exc_copy)
EXC(	LOAD	t7, UNIT(5)(src),	l_exc_copy)
	STORE	t0, UNIT(0)(dst)
	STORE	t1, UNIT(1)(dst)
EXC(	LOAD	t0, UNIT(6)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(7)(src),	l_exc_copy)
	ADD	src, src, 8*NBYTES
	ADD	dst, dst, 8*NBYTES
	STORE	t2, UNIT(-6)(dst)
	STORE	t3, UNIT(-5)(dst)
	STORE	t4, UNIT(-4)(dst)
	STORE	t7, UNIT(-3)(dst)
	STORE	t0, UNIT(-2)(dst)
	STORE	t1, UNIT(-1)(dst)
	PREF(	0, 8*32(src) )
	PREF(	1, 8*32(dst) )
	bne	len, rem, 1b
	 nop

	/*
	 * len == rem == the number of bytes left to copy < 8*NBYTES
	 */
cleanup_both_aligned:
	beqz	len, done
	 sltu	t0, len, 4*NBYTES
	bnez	t0, less_than_4units
	 and	rem, len, (NBYTES-1)	# rem = len % NBYTES
	/*
	 * len >= 4*NBYTES
	 */
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
	ADD	src, src, 4*NBYTES
	STORE	t0, UNIT(0)(dst)
	STORE	t1, UNIT(1)(dst)
	STORE	t2, UNIT(2)(dst)
	STORE	t3, UNIT(3)(dst)
	beqz	len, done
	 ADD	dst, dst, 4*NBYTES
less_than_4units:
	/*
	 * rem = len % NBYTES
	 */
	beq	rem, len, copy_bytes
	 nop
1:
EXC(	LOAD	t0, 0(src),		l_exc)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
	STORE	t0, 0(dst)
	bne	rem, len, 1b
	 ADD	dst, dst, NBYTES

	/*
	 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
	 * A loop would do only a byte at a time with possible branch
	 * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
	 * because can't assume read-access to dst.  Instead, use
	 * STREST dst, which doesn't require read access to dst.
	 *
	 * This code should perform better than a simple loop on modern,
	 * wide-issue mips processors because the code has fewer branches and
	 * more instruction-level parallelism.
	 */
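	/*
	 * A worked example of the trick above, assuming a little-endian
	 * 32-bit kernel (NBYTES == 4) and len == 3: rem == 24 (bits to keep)
	 * and bits == 8 (bits to discard).  SHIFT_DISCARD is sllv here, so
	 * the three wanted low-order source bytes end up at the top of t0,
	 * and the swl at dst + len - 1 then writes exactly dst[0..2] without
	 * ever reading dst.
	 */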
#define bits t2
	beqz	len, done
	 ADD	t1, dst, len	# t1 is just past last byte of dst
	li	bits, 8*NBYTES
	SLL	rem, len, 3	# rem = number of bits to keep
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	bits, bits, rem	# bits = number of bits to discard
	SHIFT_DISCARD t0, t0, bits
	STREST	t0, -1(t1)
	jr	ra
	 move	len, zero
dst_unaligned:
	/*
	 * dst is unaligned
	 * t0 = src & ADDRMASK
	 * t1 = dst & ADDRMASK; t1 > 0
	 * len >= NBYTES
	 *
	 * Copy enough bytes to align dst
	 * Set match = (src and dst have same alignment)
	 */
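	/*
	 * For example, with NBYTES == 4 and dst & ADDRMASK == 3: the partial
	 * unit below copies t2 = 4 - 3 = 1 byte, advancing dst to the next
	 * boundary, and match = t0 ^ t1 (the two misalignments) decides
	 * whether src is now aligned as well (branch back to both_aligned)
	 * or still misaligned (fall through to src_unaligned_dst_aligned).
	 */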
#define match rem
EXC(	LDFIRST	t3, FIRST(0)(src),	l_exc)
	ADD	t2, zero, NBYTES
EXC(	LDREST	t3, REST(0)(src),	l_exc_copy)
	SUB	t2, t2, t1	# t2 = number of bytes copied
	xor	match, t0, t1
	STFIRST t3, FIRST(0)(dst)
	beq	len, t2, done
	 SUB	len, len, t2
	ADD	dst, dst, t2
	beqz	match, both_aligned
	 ADD	src, src, t2

src_unaligned_dst_aligned:
	SRL	t0, len, LOG_NBYTES+2    # +2 for 4 units/iter
	PREF(	0, 3*32(src) )
	beqz	t0, cleanup_src_unaligned
	 and	rem, len, (4*NBYTES-1)   # rem = len % 4*NBYTES
	PREF(	1, 3*32(dst) )
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
EXC(	LDFIRST	t0, FIRST(0)(src),	l_exc)
EXC(	LDFIRST	t1, FIRST(1)(src),	l_exc_copy)
	SUB     len, len, 4*NBYTES
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
EXC(	LDREST	t1, REST(1)(src),	l_exc_copy)
EXC(	LDFIRST	t2, FIRST(2)(src),	l_exc_copy)
EXC(	LDFIRST	t3, FIRST(3)(src),	l_exc_copy)
EXC(	LDREST	t2, REST(2)(src),	l_exc_copy)
EXC(	LDREST	t3, REST(3)(src),	l_exc_copy)
	PREF(	0, 9*32(src) )		# 0 is PREF_LOAD  (not streamed)
	ADD	src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
	nop				# improves slotting
#endif
	STORE	t0, UNIT(0)(dst)
	STORE	t1, UNIT(1)(dst)
	STORE	t2, UNIT(2)(dst)
	STORE	t3, UNIT(3)(dst)
	PREF(	1, 9*32(dst) )		# 1 is PREF_STORE (not streamed)
	bne	len, rem, 1b
	 ADD	dst, dst, 4*NBYTES

cleanup_src_unaligned:
	beqz	len, done
	 and	rem, len, NBYTES-1  # rem = len % NBYTES
	beq	rem, len, copy_bytes
	 nop
1:
EXC(	LDFIRST t0, FIRST(0)(src),	l_exc)
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
	STORE	t0, 0(dst)
	bne	len, rem, 1b
	 ADD	dst, dst, NBYTES

copy_bytes_checklen:
	beqz	len, done
	 nop
copy_bytes:
	/* 0 < len < NBYTES  */
#define COPY_BYTE(N)			\
EXC(	lb	t0, N(src), l_exc);	\
	SUB	len, len, 1;		\
	beqz	len, done;		\
	 sb	t0, N(dst)

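	/*
	 * Each COPY_BYTE(N) copies one byte and returns through "done" as
	 * soon as len reaches zero.  Since 0 < len < NBYTES here, at most
	 * NBYTES-1 bytes (offsets 0 .. NBYTES-2) are ever needed, so the
	 * final byte below is copied without a further length check.
	 */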
	COPY_BYTE(0)
	COPY_BYTE(1)
#ifdef USE_DOUBLE
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
#endif
EXC(	lb	t0, NBYTES-2(src), l_exc)
	SUB	len, len, 1
	jr	ra
	 sb	t0, NBYTES-2(dst)
done:
	jr	ra
	 nop
	END(__copy_user_inatomic)

l_exc_copy:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 */
	LOAD	t0, TI_TASK($28)
	 nop
	LOAD	t0, THREAD_BUADDR(t0)
1:
EXC(	lb	t1, 0(src),	l_exc)
	ADD	src, src, 1
	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
	bne	src, t0, 1b
	 ADD	dst, dst, 1
l_exc:
	LOAD	t0, TI_TASK($28)
	 nop
	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
	 nop
	SUB	len, AT, t0		# len = number of uncopied bytes
	jr	ra
	 nop
