/* NGmemcpy.S: Niagara optimized memcpy.
 *
 * Copyright (C) 2006 David S. Miller (davem@davemloft.net)
 */

#ifdef __KERNEL__
#include <asm/asi.h>
#include <asm/thread_info.h>
#define GLOBAL_SPARE	%g7
#define RESTORE_ASI(TMP)	\
	ldub	[%g6 + TI_CURRENT_DS], TMP;  \
	wr	TMP, 0x0, %asi;
#else
#define GLOBAL_SPARE	%g5
#define RESTORE_ASI(TMP)	\
	wr	%g0, ASI_PNF, %asi
#endif

#ifndef STORE_ASI
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#endif

#ifndef EX_LD
#define EX_LD(x)	x
#endif

#ifndef EX_ST
#define EX_ST(x)	x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif

#ifndef LOAD
#ifndef MEMCPY_DEBUG
#define LOAD(type,addr,dest)	type [addr], dest
#else
#define LOAD(type,addr,dest)	type##a [addr] 0x80, dest
#endif
#endif

#ifndef LOAD_TWIN
#define LOAD_TWIN(addr_reg,dest0,dest1)	\
	ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

#ifndef STORE_INIT
#define STORE_INIT(src,addr)	stxa src, [addr] %asi
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	NGmemcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

	.register	%g2,#scratch
	.register	%g3,#scratch

	.text
	.align		64

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
	srlx		%o2, 31, %g2
	cmp		%g2, 0
	tne		%xcc, 5
	PREAMBLE
	mov		%o0, GLOBAL_SPARE
	cmp		%o2, 0
	be,pn		%XCC, 85f
	 or		%o0, %o1, %o3
	cmp		%o2, 16
	blu,a,pn	%XCC, 80f
	 or		%o3, %o2, %o3

	/* 2 blocks (128 bytes) is the minimum we can do the block
	 * copy with.  We need to ensure that we'll iterate at least
	 * once in the block copy loop.  At worst we'll need to align
	 * the destination to a 64-byte boundary which can chew up
	 * to (64 - 1) bytes from the length before we perform the
	 * block copy loop.
	 */
	cmp		%o2, (2 * 64)
	blu,pt		%XCC, 70f
	 andcc		%o3, 0x7, %g0

	/* %o0:	dst
	 * %o1:	src
	 * %o2:	len  (known to be >= 128)
	 *
	 * The block copy loops will use %o4/%o5,%g2/%g3 as
	 * temporaries while copying the data.
	 */

	LOAD(prefetch, %o1, #one_read)
	wr		%g0, STORE_ASI, %asi

	/* Align destination on 64-byte boundary.  */
	andcc		%o0, (64 - 1), %o4
	be,pt		%XCC, 2f
	 sub		%o4, 64, %o4
	sub		%g0, %o4, %o4	! bytes to align dst
	sub		%o2, %o4, %o2
1:	subcc		%o4, 1, %o4
	EX_LD(LOAD(ldub, %o1, %g1))
	EX_ST(STORE(stb, %g1, %o0))
	add		%o1, 1, %o1
	bne,pt		%XCC, 1b
	 add		%o0, 1, %o0

	/* If the source is on a 16-byte boundary we can do
	 * the direct block copy loop.  If it is 8-byte aligned
	 * we can do the 16-byte loads offset by -8 bytes and the
	 * init stores offset by one register.
	 *
	 * If the source is not even 8-byte aligned, we need to do
	 * shifting and masking (basically integer faligndata).
	 *
	 * The careful bit with init stores is that if we store
	 * to any part of the cache line we have to store the whole
	 * cacheline else we can end up with corrupt L2 cache line
	 * contents.  Since the loop works on 64-bytes of 64-byte
	 * aligned store data at a time, this is easy to ensure.
	 */
2:
	andcc		%o1, (16 - 1), %o4
	andn		%o2, (64 - 1), %g1	! block copy loop iterator
	sub		%o2, %g1, %o2		! final sub-block copy bytes
	be,pt		%XCC, 50f
	 cmp		%o4, 8
	be,a,pt		%XCC, 10f
	 sub		%o1, 0x8, %o1

	/* Neither 8-byte nor 16-byte aligned, shift and mask.  */
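	/* Worked example of the shift setup below: %g1 becomes
	 * (src & 7) * 8 and %o3 becomes 64 - %g1.  With a source
	 * offset of 3 we get %g1 = 24 and %o3 = 40, so each
	 * destination dword is built as (cur << 24) | (next >> 40);
	 * the CPU is big-endian, so this splices the tail of one
	 * aligned source dword onto the head of the next.
	 */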
	mov		%g1, %o4
	and		%o1, 0x7, %g1
	sll		%g1, 3, %g1
	mov		64, %o3
	andn		%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1, %g2))
	sub		%o3, %g1, %o3
	sllx		%g2, %g1, %g2

#define SWIVEL_ONE_DWORD(SRC, TMP1, TMP2, PRE_VAL, PRE_SHIFT, POST_SHIFT, DST)\
	EX_LD(LOAD(ldx, SRC, TMP1)); \
	srlx		TMP1, PRE_SHIFT, TMP2; \
	or		TMP2, PRE_VAL, TMP2; \
	EX_ST(STORE_INIT(TMP2, DST)); \
	sllx		TMP1, POST_SHIFT, PRE_VAL;

1:	add		%o1, 0x8, %o1
	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x00)
	add		%o1, 0x8, %o1
	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x08)
	add		%o1, 0x8, %o1
	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x10)
	add		%o1, 0x8, %o1
	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x18)
	add		%o1, 32, %o1
	LOAD(prefetch, %o1, #one_read)
	sub		%o1, 32 - 8, %o1
	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x20)
	add		%o1, 8, %o1
	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x28)
	add		%o1, 8, %o1
	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x30)
	add		%o1, 8, %o1
	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x38)
	subcc		%o4, 64, %o4
	bne,pt		%XCC, 1b
	 add		%o0, 64, %o0

#undef SWIVEL_ONE_DWORD

	srl		%g1, 3, %g1
	ba,pt		%XCC, 60f
	 add		%o1, %g1, %o1

10:	/* Destination is 64-byte aligned and the source is only
	 * 8-byte aligned, so the source was biased down by 8 above
	 * and we run one twin load ahead; the 8 is added back to
	 * the source pointer when the loop finishes.
	 */
	EX_LD(LOAD_TWIN(%o1, %o4, %o5))
1:	add		%o1, 16, %o1
	EX_LD(LOAD_TWIN(%o1, %g2, %g3))
	add		%o1, 16 + 32, %o1
	LOAD(prefetch, %o1, #one_read)
	sub		%o1, 32, %o1
	EX_ST(STORE_INIT(%o5, %o0 + 0x00))	! initializes cache line
	EX_ST(STORE_INIT(%g2, %o0 + 0x08))
	EX_LD(LOAD_TWIN(%o1, %o4, %o5))
	add		%o1, 16, %o1
	EX_ST(STORE_INIT(%g3, %o0 + 0x10))
	EX_ST(STORE_INIT(%o4, %o0 + 0x18))
	EX_LD(LOAD_TWIN(%o1, %g2, %g3))
	add		%o1, 16, %o1
	EX_ST(STORE_INIT(%o5, %o0 + 0x20))
	EX_ST(STORE_INIT(%g2, %o0 + 0x28))
	EX_LD(LOAD_TWIN(%o1, %o4, %o5))
	EX_ST(STORE_INIT(%g3, %o0 + 0x30))
	EX_ST(STORE_INIT(%o4, %o0 + 0x38))
	subcc		%g1, 64, %g1
	bne,pt		%XCC, 1b
	 add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	 add		%o1, 0x8, %o1

50:	/* Destination is 64-byte aligned, and source is 16-byte
	 * aligned.
	 */
1:	EX_LD(LOAD_TWIN(%o1, %o4, %o5))
	add		%o1, 16, %o1
	EX_LD(LOAD_TWIN(%o1, %g2, %g3))
	add		%o1, 16 + 32, %o1
	LOAD(prefetch, %o1, #one_read)
	sub		%o1, 32, %o1
	EX_ST(STORE_INIT(%o4, %o0 + 0x00))	! initializes cache line
	EX_ST(STORE_INIT(%o5, %o0 + 0x08))
	EX_LD(LOAD_TWIN(%o1, %o4, %o5))
	add		%o1, 16, %o1
	EX_ST(STORE_INIT(%g2, %o0 + 0x10))
	EX_ST(STORE_INIT(%g3, %o0 + 0x18))
	EX_LD(LOAD_TWIN(%o1, %g2, %g3))
	add		%o1, 16, %o1
	EX_ST(STORE_INIT(%o4, %o0 + 0x20))
	EX_ST(STORE_INIT(%o5, %o0 + 0x28))
	EX_ST(STORE_INIT(%g2, %o0 + 0x30))
	EX_ST(STORE_INIT(%g3, %o0 + 0x38))
	subcc		%g1, 64, %g1
	bne,pt		%XCC, 1b
	 add		%o0, 64, %o0
	/* fall through */

60:
	membar		#Sync

	/* %o2 contains any final bytes still needed to be copied
	 * over.  If anything is left, we copy it one byte at a time.
	 */
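	/* The block copy loops above left %asi set to STORE_ASI for
	 * the init stores.  RESTORE_ASI undoes that: in the kernel
	 * build it reloads the current thread's TI_CURRENT_DS value,
	 * otherwise it simply rewrites ASI_PNF.
	 */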
254 */ 255 RESTORE_ASI(%o3) 256 brz,pt %o2, 85f 257 sub %o0, %o1, %o3 258 ba,a,pt %XCC, 90f 259 260 .align 64 26170: /* 16 < len <= 64 */ 262 bne,pn %XCC, 75f 263 sub %o0, %o1, %o3 264 26572: 266 andn %o2, 0xf, %o4 267 and %o2, 0xf, %o2 2681: subcc %o4, 0x10, %o4 269 EX_LD(LOAD(ldx, %o1, %o5)) 270 add %o1, 0x08, %o1 271 EX_LD(LOAD(ldx, %o1, %g1)) 272 sub %o1, 0x08, %o1 273 EX_ST(STORE(stx, %o5, %o1 + %o3)) 274 add %o1, 0x8, %o1 275 EX_ST(STORE(stx, %g1, %o1 + %o3)) 276 bgu,pt %XCC, 1b 277 add %o1, 0x8, %o1 27873: andcc %o2, 0x8, %g0 279 be,pt %XCC, 1f 280 nop 281 sub %o2, 0x8, %o2 282 EX_LD(LOAD(ldx, %o1, %o5)) 283 EX_ST(STORE(stx, %o5, %o1 + %o3)) 284 add %o1, 0x8, %o1 2851: andcc %o2, 0x4, %g0 286 be,pt %XCC, 1f 287 nop 288 sub %o2, 0x4, %o2 289 EX_LD(LOAD(lduw, %o1, %o5)) 290 EX_ST(STORE(stw, %o5, %o1 + %o3)) 291 add %o1, 0x4, %o1 2921: cmp %o2, 0 293 be,pt %XCC, 85f 294 nop 295 ba,pt %xcc, 90f 296 nop 297 29875: 299 andcc %o0, 0x7, %g1 300 sub %g1, 0x8, %g1 301 be,pn %icc, 2f 302 sub %g0, %g1, %g1 303 sub %o2, %g1, %o2 304 3051: subcc %g1, 1, %g1 306 EX_LD(LOAD(ldub, %o1, %o5)) 307 EX_ST(STORE(stb, %o5, %o1 + %o3)) 308 bgu,pt %icc, 1b 309 add %o1, 1, %o1 310 3112: add %o1, %o3, %o0 312 andcc %o1, 0x7, %g1 313 bne,pt %icc, 8f 314 sll %g1, 3, %g1 315 316 cmp %o2, 16 317 bgeu,pt %icc, 72b 318 nop 319 ba,a,pt %xcc, 73b 320 3218: mov 64, %o3 322 andn %o1, 0x7, %o1 323 EX_LD(LOAD(ldx, %o1, %g2)) 324 sub %o3, %g1, %o3 325 andn %o2, 0x7, %o4 326 sllx %g2, %g1, %g2 3271: add %o1, 0x8, %o1 328 EX_LD(LOAD(ldx, %o1, %g3)) 329 subcc %o4, 0x8, %o4 330 srlx %g3, %o3, %o5 331 or %o5, %g2, %o5 332 EX_ST(STORE(stx, %o5, %o0)) 333 add %o0, 0x8, %o0 334 bgu,pt %icc, 1b 335 sllx %g3, %g1, %g2 336 337 srl %g1, 3, %g1 338 andcc %o2, 0x7, %o2 339 be,pn %icc, 85f 340 add %o1, %g1, %o1 341 ba,pt %xcc, 90f 342 sub %o0, %o1, %o3 343 344 .align 64 34580: /* 0 < len <= 16 */ 346 andcc %o3, 0x3, %g0 347 bne,pn %XCC, 90f 348 sub %o0, %o1, %o3 349 3501: 351 subcc %o2, 4, %o2 352 EX_LD(LOAD(lduw, %o1, %g1)) 353 EX_ST(STORE(stw, %g1, %o1 + %o3)) 354 bgu,pt %XCC, 1b 355 add %o1, 4, %o1 356 35785: retl 358 mov EX_RETVAL(GLOBAL_SPARE), %o0 359 360 .align 32 36190: 362 subcc %o2, 1, %o2 363 EX_LD(LOAD(ldub, %o1, %g1)) 364 EX_ST(STORE(stb, %g1, %o1 + %o3)) 365 bgu,pt %XCC, 90b 366 add %o1, 1, %o1 367 retl 368 mov EX_RETVAL(GLOBAL_SPARE), %o0 369 370 .size FUNC_NAME, .-FUNC_NAME 371