1/* 2 * Optimized memory copy routines. 3 * 4 * Copyright (C) 2004 Randolph Chung <tausq@debian.org> 5 * 6 * This program is free software; you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License as published by 8 * the Free Software Foundation; either version 2, or (at your option) 9 * any later version. 10 * 11 * This program is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public License 17 * along with this program; if not, write to the Free Software 18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 19 * 20 * Portions derived from the GNU C Library 21 * Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc. 22 * 23 * Several strategies are tried to try to get the best performance for various 24 * conditions. In the optimal case, we copy 64-bytes in an unrolled loop using 25 * fp regs. This is followed by loops that copy 32- or 16-bytes at a time using 26 * general registers. Unaligned copies are handled either by aligning the 27 * destination and then using shift-and-write method, or in a few cases by 28 * falling back to a byte-at-a-time copy. 29 * 30 * I chose to implement this in C because it is easier to maintain and debug, 31 * and in my experiments it appears that the C code generated by gcc (3.3/3.4 32 * at the time of writing) is fairly optimal. Unfortunately some of the 33 * semantics of the copy routine (exception handling) is difficult to express 34 * in C, so we have to play some tricks to get it to work. 35 * 36 * All the loads and stores are done via explicit asm() code in order to use 37 * the right space registers. 38 * 39 * Testing with various alignments and buffer sizes shows that this code is 40 * often >10x faster than a simple byte-at-a-time copy, even for strangely 41 * aligned operands. It is interesting to note that the glibc version 42 * of memcpy (written in C) is actually quite fast already. This routine is 43 * able to beat it by 30-40% for aligned copies because of the loop unrolling, 44 * but in some cases the glibc version is still slightly faster. This lends 45 * more credibility that gcc can generate very good code as long as we are 46 * careful. 47 * 48 * TODO: 49 * - cache prefetching needs more experimentation to get optimal settings 50 * - try not to use the post-increment address modifiers; they create additional 51 * interlocks 52 * - replace byte-copy loops with stybs sequences 53 */ 54 55#ifdef __KERNEL__ 56#include <linux/module.h> 57#include <linux/compiler.h> 58#include <asm/uaccess.h> 59#define s_space "%%sr1" 60#define d_space "%%sr2" 61#else 62#include "memcpy.h" 63#define s_space "%%sr0" 64#define d_space "%%sr0" 65#define pa_memcpy new2_copy 66#endif 67 68DECLARE_PER_CPU(struct exception_data, exception_data); 69 70#define preserve_branch(label) do { \ 71 volatile int dummy; \ 72 /* The following branch is never taken, it's just here to */ \ 73 /* prevent gcc from optimizing away our exception code. */ \ 74 if (unlikely(dummy != dummy)) \ 75 goto label; \ 76} while (0) 77 78#define get_user_space() (segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3)) 79#define get_kernel_space() (0) 80 81#define MERGE(w0, sh_1, w1, sh_2) ({ \ 82 unsigned int _r; \ 83 asm volatile ( \ 84 "mtsar %3\n" \ 85 "shrpw %1, %2, %%sar, %0\n" \ 86 : "=r"(_r) \ 87 : "r"(w0), "r"(w1), "r"(sh_2) \ 88 ); \ 89 _r; \ 90}) 91#define THRESHOLD 16 92 93#ifdef DEBUG_MEMCPY 94#define DPRINTF(fmt, args...) do { printk(KERN_DEBUG "%s:%d:%s ", __FILE__, __LINE__, __func__ ); printk(KERN_DEBUG fmt, ##args ); } while (0) 95#else 96#define DPRINTF(fmt, args...) 97#endif 98 99#define def_load_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e) \ 100 __asm__ __volatile__ ( \ 101 "1:\t" #_insn ",ma " #_sz "(" _s ",%1), %0\n\t" \ 102 ASM_EXCEPTIONTABLE_ENTRY(1b,_e) \ 103 : _tt(_t), "+r"(_a) \ 104 : \ 105 : "r8") 106 107#define def_store_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e) \ 108 __asm__ __volatile__ ( \ 109 "1:\t" #_insn ",ma %1, " #_sz "(" _s ",%0)\n\t" \ 110 ASM_EXCEPTIONTABLE_ENTRY(1b,_e) \ 111 : "+r"(_a) \ 112 : _tt(_t) \ 113 : "r8") 114 115#define ldbma(_s, _a, _t, _e) def_load_ai_insn(ldbs,1,"=r",_s,_a,_t,_e) 116#define stbma(_s, _t, _a, _e) def_store_ai_insn(stbs,1,"r",_s,_a,_t,_e) 117#define ldwma(_s, _a, _t, _e) def_load_ai_insn(ldw,4,"=r",_s,_a,_t,_e) 118#define stwma(_s, _t, _a, _e) def_store_ai_insn(stw,4,"r",_s,_a,_t,_e) 119#define flddma(_s, _a, _t, _e) def_load_ai_insn(fldd,8,"=f",_s,_a,_t,_e) 120#define fstdma(_s, _t, _a, _e) def_store_ai_insn(fstd,8,"f",_s,_a,_t,_e) 121 122#define def_load_insn(_insn,_tt,_s,_o,_a,_t,_e) \ 123 __asm__ __volatile__ ( \ 124 "1:\t" #_insn " " #_o "(" _s ",%1), %0\n\t" \ 125 ASM_EXCEPTIONTABLE_ENTRY(1b,_e) \ 126 : _tt(_t) \ 127 : "r"(_a) \ 128 : "r8") 129 130#define def_store_insn(_insn,_tt,_s,_t,_o,_a,_e) \ 131 __asm__ __volatile__ ( \ 132 "1:\t" #_insn " %0, " #_o "(" _s ",%1)\n\t" \ 133 ASM_EXCEPTIONTABLE_ENTRY(1b,_e) \ 134 : \ 135 : _tt(_t), "r"(_a) \ 136 : "r8") 137 138#define ldw(_s,_o,_a,_t,_e) def_load_insn(ldw,"=r",_s,_o,_a,_t,_e) 139#define stw(_s,_t,_o,_a,_e) def_store_insn(stw,"r",_s,_t,_o,_a,_e) 140 141#ifdef CONFIG_PREFETCH 142static inline void prefetch_src(const void *addr) 143{ 144 __asm__("ldw 0(" s_space ",%0), %%r0" : : "r" (addr)); 145} 146 147static inline void prefetch_dst(const void *addr) 148{ 149 __asm__("ldd 0(" d_space ",%0), %%r0" : : "r" (addr)); 150} 151#else 152#define prefetch_src(addr) do { } while(0) 153#define prefetch_dst(addr) do { } while(0) 154#endif 155 156/* Copy from a not-aligned src to an aligned dst, using shifts. Handles 4 words 157 * per loop. This code is derived from glibc. 158 */ 159static inline unsigned long copy_dstaligned(unsigned long dst, unsigned long src, unsigned long len, unsigned long o_dst, unsigned long o_src, unsigned long o_len) 160{ 161 /* gcc complains that a2 and a3 may be uninitialized, but actually 162 * they cannot be. Initialize a2/a3 to shut gcc up. 163 */ 164 register unsigned int a0, a1, a2 = 0, a3 = 0; 165 int sh_1, sh_2; 166 struct exception_data *d; 167 168 /* prefetch_src((const void *)src); */ 169 170 /* Calculate how to shift a word read at the memory operation 171 aligned srcp to make it aligned for copy. */ 172 sh_1 = 8 * (src % sizeof(unsigned int)); 173 sh_2 = 8 * sizeof(unsigned int) - sh_1; 174 175 /* Make src aligned by rounding it down. */ 176 src &= -sizeof(unsigned int); 177 178 switch (len % 4) 179 { 180 case 2: 181 /* a1 = ((unsigned int *) src)[0]; 182 a2 = ((unsigned int *) src)[1]; */ 183 ldw(s_space, 0, src, a1, cda_ldw_exc); 184 ldw(s_space, 4, src, a2, cda_ldw_exc); 185 src -= 1 * sizeof(unsigned int); 186 dst -= 3 * sizeof(unsigned int); 187 len += 2; 188 goto do1; 189 case 3: 190 /* a0 = ((unsigned int *) src)[0]; 191 a1 = ((unsigned int *) src)[1]; */ 192 ldw(s_space, 0, src, a0, cda_ldw_exc); 193 ldw(s_space, 4, src, a1, cda_ldw_exc); 194 src -= 0 * sizeof(unsigned int); 195 dst -= 2 * sizeof(unsigned int); 196 len += 1; 197 goto do2; 198 case 0: 199 if (len == 0) 200 return 0; 201 /* a3 = ((unsigned int *) src)[0]; 202 a0 = ((unsigned int *) src)[1]; */ 203 ldw(s_space, 0, src, a3, cda_ldw_exc); 204 ldw(s_space, 4, src, a0, cda_ldw_exc); 205 src -=-1 * sizeof(unsigned int); 206 dst -= 1 * sizeof(unsigned int); 207 len += 0; 208 goto do3; 209 case 1: 210 /* a2 = ((unsigned int *) src)[0]; 211 a3 = ((unsigned int *) src)[1]; */ 212 ldw(s_space, 0, src, a2, cda_ldw_exc); 213 ldw(s_space, 4, src, a3, cda_ldw_exc); 214 src -=-2 * sizeof(unsigned int); 215 dst -= 0 * sizeof(unsigned int); 216 len -= 1; 217 if (len == 0) 218 goto do0; 219 goto do4; /* No-op. */ 220 } 221 222 do 223 { 224 /* prefetch_src((const void *)(src + 4 * sizeof(unsigned int))); */ 225do4: 226 /* a0 = ((unsigned int *) src)[0]; */ 227 ldw(s_space, 0, src, a0, cda_ldw_exc); 228 /* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */ 229 stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc); 230do3: 231 /* a1 = ((unsigned int *) src)[1]; */ 232 ldw(s_space, 4, src, a1, cda_ldw_exc); 233 /* ((unsigned int *) dst)[1] = MERGE (a3, sh_1, a0, sh_2); */ 234 stw(d_space, MERGE (a3, sh_1, a0, sh_2), 4, dst, cda_stw_exc); 235do2: 236 /* a2 = ((unsigned int *) src)[2]; */ 237 ldw(s_space, 8, src, a2, cda_ldw_exc); 238 /* ((unsigned int *) dst)[2] = MERGE (a0, sh_1, a1, sh_2); */ 239 stw(d_space, MERGE (a0, sh_1, a1, sh_2), 8, dst, cda_stw_exc); 240do1: 241 /* a3 = ((unsigned int *) src)[3]; */ 242 ldw(s_space, 12, src, a3, cda_ldw_exc); 243 /* ((unsigned int *) dst)[3] = MERGE (a1, sh_1, a2, sh_2); */ 244 stw(d_space, MERGE (a1, sh_1, a2, sh_2), 12, dst, cda_stw_exc); 245 246 src += 4 * sizeof(unsigned int); 247 dst += 4 * sizeof(unsigned int); 248 len -= 4; 249 } 250 while (len != 0); 251 252do0: 253 /* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */ 254 stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc); 255 256 preserve_branch(handle_load_error); 257 preserve_branch(handle_store_error); 258 259 return 0; 260 261handle_load_error: 262 __asm__ __volatile__ ("cda_ldw_exc:\n"); 263 d = &__get_cpu_var(exception_data); 264 DPRINTF("cda_ldw_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu\n", 265 o_len, d->fault_addr, o_src, o_len - d->fault_addr + o_src); 266 return o_len * 4 - d->fault_addr + o_src; 267 268handle_store_error: 269 __asm__ __volatile__ ("cda_stw_exc:\n"); 270 d = &__get_cpu_var(exception_data); 271 DPRINTF("cda_stw_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu\n", 272 o_len, d->fault_addr, o_dst, o_len - d->fault_addr + o_dst); 273 return o_len * 4 - d->fault_addr + o_dst; 274} 275 276 277/* Returns 0 for success, otherwise, returns number of bytes not transferred. */ 278static unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len) 279{ 280 register unsigned long src, dst, t1, t2, t3; 281 register unsigned char *pcs, *pcd; 282 register unsigned int *pws, *pwd; 283 register double *pds, *pdd; 284 unsigned long ret = 0; 285 unsigned long o_dst, o_src, o_len; 286 struct exception_data *d; 287 288 src = (unsigned long)srcp; 289 dst = (unsigned long)dstp; 290 pcs = (unsigned char *)srcp; 291 pcd = (unsigned char *)dstp; 292 293 o_dst = dst; o_src = src; o_len = len; 294 295 /* prefetch_src((const void *)srcp); */ 296 297 if (len < THRESHOLD) 298 goto byte_copy; 299 300 /* Check alignment */ 301 t1 = (src ^ dst); 302 if (unlikely(t1 & (sizeof(double)-1))) 303 goto unaligned_copy; 304 305 /* src and dst have same alignment. */ 306 307 /* Copy bytes till we are double-aligned. */ 308 t2 = src & (sizeof(double) - 1); 309 if (unlikely(t2 != 0)) { 310 t2 = sizeof(double) - t2; 311 while (t2 && len) { 312 /* *pcd++ = *pcs++; */ 313 ldbma(s_space, pcs, t3, pmc_load_exc); 314 len--; 315 stbma(d_space, t3, pcd, pmc_store_exc); 316 t2--; 317 } 318 } 319 320 pds = (double *)pcs; 321 pdd = (double *)pcd; 322 323 324 pws = (unsigned int *)pds; 325 pwd = (unsigned int *)pdd; 326 327word_copy: 328 while (len >= 8*sizeof(unsigned int)) { 329 register unsigned int r1,r2,r3,r4,r5,r6,r7,r8; 330 /* prefetch_src((char *)pws + L1_CACHE_BYTES); */ 331 ldwma(s_space, pws, r1, pmc_load_exc); 332 ldwma(s_space, pws, r2, pmc_load_exc); 333 ldwma(s_space, pws, r3, pmc_load_exc); 334 ldwma(s_space, pws, r4, pmc_load_exc); 335 stwma(d_space, r1, pwd, pmc_store_exc); 336 stwma(d_space, r2, pwd, pmc_store_exc); 337 stwma(d_space, r3, pwd, pmc_store_exc); 338 stwma(d_space, r4, pwd, pmc_store_exc); 339 340 ldwma(s_space, pws, r5, pmc_load_exc); 341 ldwma(s_space, pws, r6, pmc_load_exc); 342 ldwma(s_space, pws, r7, pmc_load_exc); 343 ldwma(s_space, pws, r8, pmc_load_exc); 344 stwma(d_space, r5, pwd, pmc_store_exc); 345 stwma(d_space, r6, pwd, pmc_store_exc); 346 stwma(d_space, r7, pwd, pmc_store_exc); 347 stwma(d_space, r8, pwd, pmc_store_exc); 348 len -= 8*sizeof(unsigned int); 349 } 350 351 while (len >= 4*sizeof(unsigned int)) { 352 register unsigned int r1,r2,r3,r4; 353 ldwma(s_space, pws, r1, pmc_load_exc); 354 ldwma(s_space, pws, r2, pmc_load_exc); 355 ldwma(s_space, pws, r3, pmc_load_exc); 356 ldwma(s_space, pws, r4, pmc_load_exc); 357 stwma(d_space, r1, pwd, pmc_store_exc); 358 stwma(d_space, r2, pwd, pmc_store_exc); 359 stwma(d_space, r3, pwd, pmc_store_exc); 360 stwma(d_space, r4, pwd, pmc_store_exc); 361 len -= 4*sizeof(unsigned int); 362 } 363 364 pcs = (unsigned char *)pws; 365 pcd = (unsigned char *)pwd; 366 367byte_copy: 368 while (len) { 369 /* *pcd++ = *pcs++; */ 370 ldbma(s_space, pcs, t3, pmc_load_exc); 371 stbma(d_space, t3, pcd, pmc_store_exc); 372 len--; 373 } 374 375 return 0; 376 377unaligned_copy: 378 /* possibly we are aligned on a word, but not on a double... */ 379 if (likely((t1 & (sizeof(unsigned int)-1)) == 0)) { 380 t2 = src & (sizeof(unsigned int) - 1); 381 382 if (unlikely(t2 != 0)) { 383 t2 = sizeof(unsigned int) - t2; 384 while (t2) { 385 /* *pcd++ = *pcs++; */ 386 ldbma(s_space, pcs, t3, pmc_load_exc); 387 stbma(d_space, t3, pcd, pmc_store_exc); 388 len--; 389 t2--; 390 } 391 } 392 393 pws = (unsigned int *)pcs; 394 pwd = (unsigned int *)pcd; 395 goto word_copy; 396 } 397 398 /* Align the destination. */ 399 if (unlikely((dst & (sizeof(unsigned int) - 1)) != 0)) { 400 t2 = sizeof(unsigned int) - (dst & (sizeof(unsigned int) - 1)); 401 while (t2) { 402 /* *pcd++ = *pcs++; */ 403 ldbma(s_space, pcs, t3, pmc_load_exc); 404 stbma(d_space, t3, pcd, pmc_store_exc); 405 len--; 406 t2--; 407 } 408 dst = (unsigned long)pcd; 409 src = (unsigned long)pcs; 410 } 411 412 ret = copy_dstaligned(dst, src, len / sizeof(unsigned int), 413 o_dst, o_src, o_len); 414 if (ret) 415 return ret; 416 417 pcs += (len & -sizeof(unsigned int)); 418 pcd += (len & -sizeof(unsigned int)); 419 len %= sizeof(unsigned int); 420 421 preserve_branch(handle_load_error); 422 preserve_branch(handle_store_error); 423 424 goto byte_copy; 425 426handle_load_error: 427 __asm__ __volatile__ ("pmc_load_exc:\n"); 428 d = &__get_cpu_var(exception_data); 429 DPRINTF("pmc_load_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu\n", 430 o_len, d->fault_addr, o_src, o_len - d->fault_addr + o_src); 431 return o_len - d->fault_addr + o_src; 432 433handle_store_error: 434 __asm__ __volatile__ ("pmc_store_exc:\n"); 435 d = &__get_cpu_var(exception_data); 436 DPRINTF("pmc_store_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu\n", 437 o_len, d->fault_addr, o_dst, o_len - d->fault_addr + o_dst); 438 return o_len - d->fault_addr + o_dst; 439} 440 441#ifdef __KERNEL__ 442unsigned long copy_to_user(void __user *dst, const void *src, unsigned long len) 443{ 444 mtsp(get_kernel_space(), 1); 445 mtsp(get_user_space(), 2); 446 return pa_memcpy((void __force *)dst, src, len); 447} 448 449EXPORT_SYMBOL(__copy_from_user); 450unsigned long __copy_from_user(void *dst, const void __user *src, unsigned long len) 451{ 452 mtsp(get_user_space(), 1); 453 mtsp(get_kernel_space(), 2); 454 return pa_memcpy(dst, (void __force *)src, len); 455} 456 457unsigned long copy_in_user(void __user *dst, const void __user *src, unsigned long len) 458{ 459 mtsp(get_user_space(), 1); 460 mtsp(get_user_space(), 2); 461 return pa_memcpy((void __force *)dst, (void __force *)src, len); 462} 463 464 465void * memcpy(void * dst,const void *src, size_t count) 466{ 467 mtsp(get_kernel_space(), 1); 468 mtsp(get_kernel_space(), 2); 469 pa_memcpy(dst, src, count); 470 return dst; 471} 472 473EXPORT_SYMBOL(copy_to_user); 474EXPORT_SYMBOL(copy_from_user); 475EXPORT_SYMBOL(copy_in_user); 476EXPORT_SYMBOL(memcpy); 477#endif 478