/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */

/*
 * Hack to resolve longstanding prefetch issue
 *
 * Prefetching may be fatal on some systems if we're prefetching beyond the
 * end of memory.  It's also a seriously bad idea on non dma-coherent systems.
 */
#if !defined(CONFIG_DMA_COHERENT) || !defined(CONFIG_DMA_IP27)
#undef CONFIG_CPU_HAS_PREFETCH
#endif
#ifdef CONFIG_MIPS_MALTA
#undef CONFIG_CPU_HAS_PREFETCH
#endif

#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * include/asm-mips/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */

/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contains the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry).
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */

#define EXC(inst_reg,addr,handler)		\
9:	inst_reg, addr;				\
	.section __ex_table,"a";		\
	PTR	9b, handler;			\
	.previous
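/*
 * Roughly, a use such as
 *
 *	EXC(	LOAD	t0, UNIT(0)(src), l_exc)
 *
 * expands to
 *
 *	9:	LOAD	t0, UNIT(0)(src)
 *		.section __ex_table,"a"
 *		PTR	9b, l_exc
 *		.previous
 *
 * i.e. the access itself plus an __ex_table entry that tells the kernel's
 * fault fixup code to resume at l_exc if that particular instruction takes
 * an exception.
 */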
/*
 * Only on the 64-bit kernel can we make use of 64-bit registers.
 */
#ifdef CONFIG_64BIT
#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD	ld
#define LOADL	ldl
#define LOADR	ldr
#define STOREL	sdl
#define STORER	sdr
#define STORE	sd
#define ADD	daddu
#define SUB	dsubu
#define SRL	dsrl
#define SRA	dsra
#define SLL	dsll
#define SLLV	dsllv
#define SRLV	dsrlv
#define NBYTES	8
#define LOG_NBYTES 3

/*
 * As we are sharing code base with the mips32 tree (which uses the o32 ABI
 * register definitions), we need to redefine the register definitions from
 * the n64 ABI register naming to the o32 ABI register naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

#else

#define LOAD	lw
#define LOADL	lwl
#define LOADR	lwr
#define STOREL	swl
#define STORER	swr
#define STORE	sw
#define ADD	addu
#define SUB	subu
#define SRL	srl
#define SLL	sll
#define SRA	sra
#define SLLV	sllv
#define SRLV	srlv
#define NBYTES	4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST	LOADR
#define LDREST	LOADL
#define STFIRST	STORER
#define STREST	STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST	LOADL
#define LDREST	LOADR
#define STFIRST	STOREL
#define STREST	STORER
#define SHIFT_DISCARD SRLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)
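/*
 * A sketch of how the LDFIRST/LDREST pair reads one unaligned unit, assuming
 * a 32-bit big-endian configuration (NBYTES=4, LDFIRST=lwl, LDREST=lwr) and
 * src = 0x...05:
 *
 *	LDFIRST	t0, FIRST(0)(src)	# lwl: bytes 0x05-0x07 -> high bytes of t0
 *	LDREST	t0, REST(0)(src)	# lwr at src+3: byte 0x08 -> low byte of t0
 *
 * Together the two partial loads assemble the unaligned word starting at src.
 * On little-endian CPUs lwl/lwr mirror their roles, which is why LDFIRST and
 * LDREST (and STFIRST/STREST) are swapped by the #ifdef above.
 */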
	.text
	.set	noreorder
	.set	noat

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
	.align	5
LEAF(__copy_user_inatomic)
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
#define rem t8

	/*
	 * The "issue break"s below are very approximate.
	 * Issue delays for dcache fills will perturb the schedule, as will
	 * load queue full replay traps, etc.
	 *
	 * If len < NBYTES use byte operations.
	 */
	PREF(	0, 0(src) )
	PREF(	1, 0(dst) )
	sltu	t2, len, NBYTES
	and	t1, dst, ADDRMASK
	PREF(	0, 1*32(src) )
	PREF(	1, 1*32(dst) )
	bnez	t2, copy_bytes_checklen
	 and	t0, src, ADDRMASK
	PREF(	0, 2*32(src) )
	PREF(	1, 2*32(dst) )
	bnez	t1, dst_unaligned
	 nop
	bnez	t0, src_unaligned_dst_aligned
	/*
	 * use delay slot for fall-through
	 * src and dst are aligned; need to compute rem
	 */
both_aligned:
	 SRL	t0, len, LOG_NBYTES+3		# +3 for 8 units/iter
	beqz	t0, cleanup_both_aligned	# len < 8*NBYTES
	 and	rem, len, (8*NBYTES-1)		# rem = len % (8*NBYTES)
	PREF(	0, 3*32(src) )
	PREF(	1, 3*32(dst) )
	.align	4
1:
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 8*NBYTES
EXC(	LOAD	t4, UNIT(4)(src),	l_exc_copy)
EXC(	LOAD	t7, UNIT(5)(src),	l_exc_copy)
	STORE	t0, UNIT(0)(dst)
	STORE	t1, UNIT(1)(dst)
EXC(	LOAD	t0, UNIT(6)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(7)(src),	l_exc_copy)
	ADD	src, src, 8*NBYTES
	ADD	dst, dst, 8*NBYTES
	STORE	t2, UNIT(-6)(dst)
	STORE	t3, UNIT(-5)(dst)
	STORE	t4, UNIT(-4)(dst)
	STORE	t7, UNIT(-3)(dst)
	STORE	t0, UNIT(-2)(dst)
	STORE	t1, UNIT(-1)(dst)
	PREF(	0, 8*32(src) )
	PREF(	1, 8*32(dst) )
	bne	len, rem, 1b
	 nop

	/*
	 * len == rem == the number of bytes left to copy < 8*NBYTES
	 */
cleanup_both_aligned:
	beqz	len, done
	 sltu	t0, len, 4*NBYTES
	bnez	t0, less_than_4units
	 and	rem, len, (NBYTES-1)	# rem = len % NBYTES
	/*
	 * len >= 4*NBYTES
	 */
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
	ADD	src, src, 4*NBYTES
	STORE	t0, UNIT(0)(dst)
	STORE	t1, UNIT(1)(dst)
	STORE	t2, UNIT(2)(dst)
	STORE	t3, UNIT(3)(dst)
	beqz	len, done
	 ADD	dst, dst, 4*NBYTES
less_than_4units:
	/*
	 * rem = len % NBYTES
	 */
	beq	rem, len, copy_bytes
	 nop
1:
EXC(	LOAD	t0, 0(src),		l_exc)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
	STORE	t0, 0(dst)
	bne	rem, len, 1b
	 ADD	dst, dst, NBYTES

	/*
	 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
	 * A loop would do only a byte at a time with possible branch
	 * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
	 * because can't assume read-access to dst.  Instead, use
	 * STREST dst, which doesn't require read access to dst.
	 *
	 * This code should perform better than a simple loop on modern,
	 * wide-issue mips processors because the code has fewer branches and
	 * more instruction-level parallelism.
	 */
#define bits t2
	beqz	len, done
	 ADD	t1, dst, len	# t1 is just past last byte of dst
	li	bits, 8*NBYTES
	SLL	rem, len, 3	# rem = number of bits to keep
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	bits, bits, rem	# bits = number of bits to discard
	SHIFT_DISCARD t0, t0, bits
	STREST	t0, -1(t1)
	jr	ra
	 move	len, zero
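/*
 * A worked example of the tail above, assuming a 32-bit big-endian
 * configuration (NBYTES=4, SHIFT_DISCARD=srlv, STREST=swr) and len=3:
 * rem = 3*8 = 24 bits to keep, bits = 32-24 = 8 bits to discard.  The
 * aligned LOAD reads a full word from src; srlv by 8 throws away the
 * low-order byte (the one byte we did not want), and swr at dst+len-1
 * stores only the remaining 3 bytes, so dst is never read and no byte
 * beyond dst+len-1 is written.
 */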
dst_unaligned:
	/*
	 * dst is unaligned
	 * t0 = src & ADDRMASK
	 * t1 = dst & ADDRMASK; t1 > 0
	 * len >= NBYTES
	 *
	 * Copy enough bytes to align dst
	 * Set match = (src and dst have same alignment)
	 */
#define match rem
EXC(	LDFIRST	t3, FIRST(0)(src),	l_exc)
	ADD	t2, zero, NBYTES
EXC(	LDREST	t3, REST(0)(src),	l_exc_copy)
	SUB	t2, t2, t1	# t2 = number of bytes copied
	xor	match, t0, t1
	STFIRST	t3, FIRST(0)(dst)
	beq	len, t2, done
	 SUB	len, len, t2
	ADD	dst, dst, t2
	beqz	match, both_aligned
	 ADD	src, src, t2

src_unaligned_dst_aligned:
	SRL	t0, len, LOG_NBYTES+2	# +2 for 4 units/iter
	PREF(	0, 3*32(src) )
	beqz	t0, cleanup_src_unaligned
	 and	rem, len, (4*NBYTES-1)	# rem = len % 4*NBYTES
	PREF(	1, 3*32(dst) )
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
EXC(	LDFIRST	t0, FIRST(0)(src),	l_exc)
EXC(	LDFIRST	t1, FIRST(1)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
EXC(	LDREST	t1, REST(1)(src),	l_exc_copy)
EXC(	LDFIRST	t2, FIRST(2)(src),	l_exc_copy)
EXC(	LDFIRST	t3, FIRST(3)(src),	l_exc_copy)
EXC(	LDREST	t2, REST(2)(src),	l_exc_copy)
EXC(	LDREST	t3, REST(3)(src),	l_exc_copy)
	PREF(	0, 9*32(src) )		# 0 is PREF_LOAD  (not streamed)
	ADD	src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
	nop				# improves slotting
#endif
	STORE	t0, UNIT(0)(dst)
	STORE	t1, UNIT(1)(dst)
	STORE	t2, UNIT(2)(dst)
	STORE	t3, UNIT(3)(dst)
	PREF(	1, 9*32(dst) )		# 1 is PREF_STORE (not streamed)
	bne	len, rem, 1b
	 ADD	dst, dst, 4*NBYTES

cleanup_src_unaligned:
	beqz	len, done
	 and	rem, len, NBYTES-1	# rem = len % NBYTES
	beq	rem, len, copy_bytes
	 nop
1:
EXC(	LDFIRST	t0, FIRST(0)(src),	l_exc)
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
	STORE	t0, 0(dst)
	bne	len, rem, 1b
	 ADD	dst, dst, NBYTES

copy_bytes_checklen:
	beqz	len, done
	 nop
copy_bytes:
	/* 0 < len < NBYTES */
#define COPY_BYTE(N)			\
EXC(	lb	t0, N(src), l_exc);	\
	SUB	len, len, 1;		\
	beqz	len, done;		\
	 sb	t0, N(dst)

	COPY_BYTE(0)
	COPY_BYTE(1)
#ifdef USE_DOUBLE
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
#endif
EXC(	lb	t0, NBYTES-2(src), l_exc)
	SUB	len, len, 1
	jr	ra
	 sb	t0, NBYTES-2(dst)
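/*
 * Since len < NBYTES on entry to copy_bytes and each COPY_BYTE branches to
 * done as soon as len hits zero, at most one byte (at offset NBYTES-2) can
 * still be left after the unrolled COPY_BYTEs above; it is handled inline
 * so the final sb can sit in the jr delay slot.
 */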
done:
	jr	ra
	 nop
	END(__copy_user_inatomic)

l_exc_copy:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 */
	LOAD	t0, TI_TASK($28)
	nop
	LOAD	t0, THREAD_BUADDR(t0)
1:
EXC(	lb	t1, 0(src),	l_exc)
	ADD	src, src, 1
	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
	bne	src, t0, 1b
	 ADD	dst, dst, 1
l_exc:
	LOAD	t0, TI_TASK($28)
	nop
	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
	nop
	SUB	len, AT, t0		# len = number of uncopied bytes
	jr	ra
	 nop
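/*
 * Putting the pieces together: if one of the EXC-wrapped loads in the main
 * loops faults, the __ex_table fixup lands in l_exc_copy with
 * THREAD_BUADDR($28) holding the bad address; the byte loop above then
 * copies whatever source bytes before that address are still readable.
 * l_exc finally sets len = AT - BUADDR, i.e. (one past the end of the
 * source, as set up by uaccess.h) minus (one past the last good address),
 * which is the uncopied-byte count the caller expects.
 */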