/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 * Copyright (C) 2007  Maciej W. Rozycki
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */

/*
 * Hack to resolve longstanding prefetch issue
 *
 * Prefetching may be fatal on some systems if we're prefetching beyond the
 * end of memory.  It's also a seriously bad idea on non dma-coherent
 * systems.
 */
#ifdef CONFIG_DMA_NONCOHERENT
#undef CONFIG_CPU_HAS_PREFETCH
#endif
#ifdef CONFIG_MIPS_MALTA
#undef CONFIG_CPU_HAS_PREFETCH
#endif

#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * include/asm-mips/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */

/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */

#define EXC(inst_reg,addr,handler)		\
9:	inst_reg, addr;				\
	.section __ex_table,"a";		\
	PTR	9b, handler;			\
	.previous
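
/*
 * Illustrative expansion (not emitted literally): an invocation such as
 *
 *	EXC(	LOAD	t0, UNIT(0)(src),	.Ll_exc)
 *
 * becomes roughly
 *
 *	9:	LOAD	t0, UNIT(0)(src)
 *		.section __ex_table,"a"
 *		PTR	9b, .Ll_exc
 *		.previous
 *
 * i.e. the load carries a local label, and an exception-table entry
 * maps that address to the .Ll_exc fixup handler.
 */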

/*
 * Only on the 64-bit kernel can we make use of 64-bit registers.
 */
#ifdef CONFIG_64BIT
#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD	ld
#define LOADL	ldl
#define LOADR	ldr
#define STOREL	sdl
#define STORER	sdr
#define STORE	sd
#define ADD	daddu
#define SUB	dsubu
#define SRL	dsrl
#define SRA	dsra
#define SLL	dsll
#define SLLV	dsllv
#define SRLV	dsrlv
#define NBYTES	8
#define LOG_NBYTES 3

/*
 * As we are sharing code with the mips32 tree (which uses the o32 ABI
 * register definitions), we need to redefine the register definitions
 * from the n64 ABI register naming to the o32 ABI register naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

#else

#define LOAD	lw
#define LOADL	lwl
#define LOADR	lwr
#define STOREL	swl
#define STORER	swr
#define STORE	sw
#define ADD	addu
#define SUB	subu
#define SRL	srl
#define SLL	sll
#define SRA	sra
#define SLLV	sllv
#define SRLV	srlv
#define NBYTES	4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST	LOADR
#define LDREST	LOADL
#define STFIRST	STORER
#define STREST	STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST	LOADL
#define LDREST	LOADR
#define STFIRST	STOREL
#define STREST	STORER
#define SHIFT_DISCARD SRLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)
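
/*
 * Worked example of the FIRST/REST unaligned-load pattern used below
 * (illustrative, assuming a 32-bit big-endian kernel, so NBYTES == 4,
 * LDFIRST == lwl and LDREST == lwr): with src == 0x1001,
 *
 *	LDFIRST	t0, FIRST(0)(src)	# lwl at 0x1001: bytes
 *					# 0x1001-0x1003 fill bits 31..8
 *	LDREST	t0, REST(0)(src)	# lwr at 0x1004: byte 0x1004
 *					# fills bits 7..0
 *
 * reassembles the unaligned word in t0.  On little-endian kernels the
 * selection above swaps the lwl/lwr roles so the same FIRST/REST
 * offsets still apply.
 */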

	.text
	.set	noreorder
#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
	.set	noat
#else
	.set	at=v1
#endif

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
	.align	5
LEAF(__copy_user_inatomic)
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
#define rem t8

	/*
	 * The "issue break"s below are very approximate.
	 * Issue delays for dcache fills will perturb the schedule, as will
	 * load queue full replay traps, etc.
	 *
	 * If len < NBYTES use byte operations.
	 */
	PREF(	0, 0(src) )
	PREF(	1, 0(dst) )
	sltu	t2, len, NBYTES
	and	t1, dst, ADDRMASK
	PREF(	0, 1*32(src) )
	PREF(	1, 1*32(dst) )
	bnez	t2, .Lcopy_bytes_checklen
	 and	t0, src, ADDRMASK
	PREF(	0, 2*32(src) )
	PREF(	1, 2*32(dst) )
	bnez	t1, .Ldst_unaligned
	 nop
	bnez	t0, .Lsrc_unaligned_dst_aligned
	/*
	 * use delay slot for fall-through
	 * src and dst are aligned; need to compute rem
	 */
.Lboth_aligned:
	 SRL	t0, len, LOG_NBYTES+3	# +3 for 8 units/iter
	beqz	t0, .Lcleanup_both_aligned # len < 8*NBYTES
	 and	rem, len, (8*NBYTES-1)	# rem = len % (8*NBYTES)
	PREF(	0, 3*32(src) )
	PREF(	1, 3*32(dst) )
	.align	4
1:
EXC(	LOAD	t0, UNIT(0)(src),	.Ll_exc)
EXC(	LOAD	t1, UNIT(1)(src),	.Ll_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	.Ll_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	.Ll_exc_copy)
	SUB	len, len, 8*NBYTES
EXC(	LOAD	t4, UNIT(4)(src),	.Ll_exc_copy)
EXC(	LOAD	t7, UNIT(5)(src),	.Ll_exc_copy)
	STORE	t0, UNIT(0)(dst)
	STORE	t1, UNIT(1)(dst)
EXC(	LOAD	t0, UNIT(6)(src),	.Ll_exc_copy)
EXC(	LOAD	t1, UNIT(7)(src),	.Ll_exc_copy)
	ADD	src, src, 8*NBYTES
	ADD	dst, dst, 8*NBYTES
	STORE	t2, UNIT(-6)(dst)
	STORE	t3, UNIT(-5)(dst)
	STORE	t4, UNIT(-4)(dst)
	STORE	t7, UNIT(-3)(dst)
	STORE	t0, UNIT(-2)(dst)
	STORE	t1, UNIT(-1)(dst)
	PREF(	0, 8*32(src) )
	PREF(	1, 8*32(dst) )
	bne	len, rem, 1b
	 nop

	/*
	 * len == rem == the number of bytes left to copy < 8*NBYTES
	 */
.Lcleanup_both_aligned:
	beqz	len, .Ldone
	 sltu	t0, len, 4*NBYTES
	bnez	t0, .Lless_than_4units
	 and	rem, len, (NBYTES-1)	# rem = len % NBYTES
	/*
	 * len >= 4*NBYTES
	 */
EXC(	LOAD	t0, UNIT(0)(src),	.Ll_exc)
EXC(	LOAD	t1, UNIT(1)(src),	.Ll_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	.Ll_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	.Ll_exc_copy)
	SUB	len, len, 4*NBYTES
	ADD	src, src, 4*NBYTES
	STORE	t0, UNIT(0)(dst)
	STORE	t1, UNIT(1)(dst)
	STORE	t2, UNIT(2)(dst)
	STORE	t3, UNIT(3)(dst)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 4*NBYTES
	beqz	len, .Ldone
	.set	noreorder
.Lless_than_4units:
	/*
	 * rem = len % NBYTES
	 */
	beq	rem, len, .Lcopy_bytes
	 nop
1:
EXC(	LOAD	t0, 0(src),		.Ll_exc)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
	STORE	t0, 0(dst)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, NBYTES
	bne	rem, len, 1b
	.set	noreorder

	/*
	 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
	 * A loop would do only a byte at a time with possible branch
	 * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
	 * because can't assume read-access to dst.  Instead, use
	 * STREST dst, which doesn't require read access to dst.
	 *
	 * This code should perform better than a simple loop on modern,
	 * wide-issue mips processors because the code has fewer branches and
	 * more instruction-level parallelism.
	 */
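
	/*
	 * Worked example of the sequence below (illustrative, assuming a
	 * 32-bit kernel, so NBYTES == 4, with len == 3): rem = 3 << 3 = 24
	 * bits to keep, bits = 32 - 24 = 8 bits to discard.  SHIFT_DISCARD
	 * shifts the one unwanted byte out of t0, and STREST stores the
	 * surviving three bytes ending at dst + len - 1, all without ever
	 * reading dst.
	 */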

#define bits t2
	beqz	len, .Ldone
	 ADD	t1, dst, len	# t1 is just past last byte of dst
	li	bits, 8*NBYTES
	SLL	rem, len, 3	# rem = number of bits to keep
EXC(	LOAD	t0, 0(src),		.Ll_exc)
	SUB	bits, bits, rem	# bits = number of bits to discard
	SHIFT_DISCARD t0, t0, bits
	STREST	t0, -1(t1)
	jr	ra
	 move	len, zero
.Ldst_unaligned:
	/*
	 * dst is unaligned
	 * t0 = src & ADDRMASK
	 * t1 = dst & ADDRMASK; t1 > 0
	 * len >= NBYTES
	 *
	 * Copy enough bytes to align dst
	 * Set match = (src and dst have same alignment)
	 */
#define match rem
EXC(	LDFIRST	t3, FIRST(0)(src),	.Ll_exc)
	ADD	t2, zero, NBYTES
EXC(	LDREST	t3, REST(0)(src),	.Ll_exc_copy)
	SUB	t2, t2, t1	# t2 = number of bytes copied
	xor	match, t0, t1
	STFIRST	t3, FIRST(0)(dst)
	beq	len, t2, .Ldone
	 SUB	len, len, t2
	ADD	dst, dst, t2
	beqz	match, .Lboth_aligned
	 ADD	src, src, t2

.Lsrc_unaligned_dst_aligned:
	SRL	t0, len, LOG_NBYTES+2	# +2 for 4 units/iter
	PREF(	0, 3*32(src) )
	beqz	t0, .Lcleanup_src_unaligned
	 and	rem, len, (4*NBYTES-1)	# rem = len % (4*NBYTES)
	PREF(	1, 3*32(dst) )
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
EXC(	LDFIRST	t0, FIRST(0)(src),	.Ll_exc)
EXC(	LDFIRST	t1, FIRST(1)(src),	.Ll_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	LDREST	t0, REST(0)(src),	.Ll_exc_copy)
EXC(	LDREST	t1, REST(1)(src),	.Ll_exc_copy)
EXC(	LDFIRST	t2, FIRST(2)(src),	.Ll_exc_copy)
EXC(	LDFIRST	t3, FIRST(3)(src),	.Ll_exc_copy)
EXC(	LDREST	t2, REST(2)(src),	.Ll_exc_copy)
EXC(	LDREST	t3, REST(3)(src),	.Ll_exc_copy)
	PREF(	0, 9*32(src) )		# 0 is PREF_LOAD  (not streamed)
	ADD	src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
	nop				# improves slotting
#endif
	STORE	t0, UNIT(0)(dst)
	STORE	t1, UNIT(1)(dst)
	STORE	t2, UNIT(2)(dst)
	STORE	t3, UNIT(3)(dst)
	PREF(	1, 9*32(dst) )		# 1 is PREF_STORE (not streamed)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 4*NBYTES
	bne	len, rem, 1b
	.set	noreorder

.Lcleanup_src_unaligned:
	beqz	len, .Ldone
	 and	rem, len, NBYTES-1	# rem = len % NBYTES
	beq	rem, len, .Lcopy_bytes
	 nop
1:
EXC(	LDFIRST	t0, FIRST(0)(src),	.Ll_exc)
EXC(	LDREST	t0, REST(0)(src),	.Ll_exc_copy)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
	STORE	t0, 0(dst)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, NBYTES
	bne	len, rem, 1b
	.set	noreorder
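
/*
 * Illustrative walk through the byte tail below (assuming a 64-bit
 * kernel, so NBYTES == 8): .Lcopy_bytes is entered with 0 < len < 8.
 * Each COPY_BYTE() moves one byte and exits via .Ldone once len hits
 * zero, so COPY_BYTE(0)..COPY_BYTE(5) dispose of up to six bytes; if
 * a seventh byte remains (len == 7 on entry), it is copied by the
 * final lb/sb pair at offset NBYTES-2 == 6.
 */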

.Lcopy_bytes_checklen:
	beqz	len, .Ldone
	 nop
.Lcopy_bytes:
	/* 0 < len < NBYTES  */
#define COPY_BYTE(N)			\
EXC(	lb	t0, N(src), .Ll_exc);	\
	SUB	len, len, 1;		\
	beqz	len, .Ldone;		\
	 sb	t0, N(dst)

	COPY_BYTE(0)
	COPY_BYTE(1)
#ifdef USE_DOUBLE
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
#endif
EXC(	lb	t0, NBYTES-2(src), .Ll_exc)
	SUB	len, len, 1
	jr	ra
	 sb	t0, NBYTES-2(dst)
.Ldone:
	jr	ra
	 nop
	END(__copy_user_inatomic)

.Ll_exc_copy:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 */
	LOAD	t0, TI_TASK($28)
	 nop
	LOAD	t0, THREAD_BUADDR(t0)
1:
EXC(	lb	t1, 0(src),	.Ll_exc)
	ADD	src, src, 1
	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 1
	bne	src, t0, 1b
	.set	noreorder
.Ll_exc:
	LOAD	t0, TI_TASK($28)
	 nop
	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
	 nop
	SUB	len, AT, t0		# len = number of uncopied bytes
	jr	ra
	 nop
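
/*
 * Worked example of the fixup arithmetic above (illustrative numbers):
 * if __copy_user_inatomic was entered with src == 0x1000 and
 * len == 0x100, uaccess.h sets AT to 0x1100, one byte past the end of
 * the source.  Should a load later fault with THREAD_BUADDR == 0x1040,
 * SUB len, AT, t0 leaves len == 0xc0 -- an upper bound on the number
 * of bytes left uncopied.
 */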