/* $Id: VIScsumcopy.S,v 1.1.1.1 2008/10/15 03:26:19 james26_jang Exp $
 * VIScsumcopy.S: High bandwidth IP checksumming with simultaneous
 *            copying utilizing the UltraSparc Visual Instruction Set.
 *
 * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
 *
 * Based on older sparc32/sparc64 checksum.S, which is:
 *
 *      Copyright(C) 1995 Linus Torvalds
 *      Copyright(C) 1995 Miguel de Icaza
 *      Copyright(C) 1996,1997 David S. Miller
 *    derived from:
 *        Linux/Alpha checksum c-code
 *        Linux/ix86 inline checksum assembly
 *        RFC1071 Computing the Internet Checksum (esp. Jacobsons m68k code)
 *        David Mosberger-Tang for optimized reference c-code
 *        BSD4.4 portable checksum routine
 */

/* Offset from %sp of the 8-byte scratch slot used by END_THE_TRICK2 to move
 * a double FP register into an integer register (std followed by ldx).
 */
#ifdef __sparc_v9__
#define STACKOFF	0x7ff+128
#else
#define STACKOFF	64
#endif

/* ASI_BLK_XOR is the value XORed into %asi (via "wr") at each visNs entry.
 * The kernel build first ORs ASI_BLK_OR into the saved %asi, so its XOR value
 * is 0; the user-space build XORs ASI_P with ASI_BLK_P directly.
 */
#ifdef __KERNEL__
#include <asm/head.h>
#include <asm/asi.h>
#include <asm/page.h>
#include <asm/visasm.h>
#define ASI_BLK_XOR	0
#define ASI_BLK_XOR1	(ASI_BLK_P ^ (ASI_BLK_P >> 3) ^ ASI_P)
#define ASI_BLK_OR	(ASI_BLK_P & ~ASI_P)
#else
#define ASI_P		0x80
#define ASI_BLK_P	0xf0
#define FPRS_FEF	0x04
#define FRPS_FEF	FPRS_FEF	/* historical misspelling, kept as an alias */
#define FPRS_DU		0x02
#define FPRS_DL		0x01
#define ASI_BLK_XOR	(ASI_BLK_P ^ ASI_P)
#endif

/* Argument registers. */
#define src		o0
#define dst		o1
#define len		o2
#define sum		o3
/* Integer scratch registers that receive the fcmpgt32 result masks. */
#define x1		g1
#define x2		g2
#define x3		o4
#define x4		g4
#define x5		g5
#define x6		g7
#define x7		g3
#define x8		o5

/* Good night, SunSoft engineers. Sleep tight.
 * This has a couple of tricks in and those
 * tricks are UltraLinux trade secrets :))
 * Once AGAIN, the SunSoft engineers are caught
 * asleep at the keyboard :)).
 * The main loop does about 20 superscalar cycles
 * per 64bytes checksummed/copied.
 */

/* LDBLK(O0): load from [%src] through the current %asi into the register
 * group starting at %O0.  The main loops run with %asi switched by the
 * visNs prologue (see ASI_BLK_XOR above).
 */
#define LDBLK(O0)	\
	ldda	[%src] %asi, %O0	/* Load Group */

/* STBLK: store the group starting at %f48 to [%dst] using ASI_BLK_P. */
#define STBLK	\
	stda	%f48, [%dst] ASI_BLK_P	/* Store */

/* ST(fx,off): store one double register to [%dst + off]. */
#define ST(fx,off)	\
	std	%fx, [%dst + off]	/* Store */

#define SYNC	\
	membar	#Sync


/* DO_THE_TRICK: one 64-byte step of the checksum-and-copy loop.
 *
 *   f0..f14    eight double registers that are added into the running
 *              per-register sums and are the left operand of fcmpgt32
 *   F0..F14    the eight double registers holding the data being consumed;
 *              each becomes F + f (fpadd32) after being fed to faligndata
 *   A0..A14    destinations of faligndata (the realigned output data)
 *   B14        receives a copy of F14 (last source doubleword) for the next
 *              step's first faligndata
 *   LOAD       LDBLK(...) inside the loop, SYNC on the exit paths
 *   STORE1..8  optional store statements; exactly which slot carries STBLK
 *              depends on the source alignment variant (vis0..vis7)
 *   BRANCH...  the loop/exit branch (may contain commas, hence variadic);
 *              the final fcmpgt32 sits in its delay slot
 *   DUMMYn     always passed empty; they only make the call sites readable
 *
 * Each "inc %xN; srl %xN, 1, %xN; add %sum, %xN, %sum" triple turns the
 * fcmpgt32 result m left in %xN by the previous step into (m + 1) >> 1 and
 * adds it to %sum.  NOTE(review): for a 2-bit mask that is the number of set
 * bits, i.e. presumably the number of 32-bit lanes that wrapped - confirm
 * against the VIS manual.
 * %src and %dst advance by 64 and %len is decremented by 64 (subcc, so the
 * BRANCH can test the borrow).
 */
#define DO_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,F0,F2,F4,F6,F8,F10,F12,F14,DUMMY1,A0,A2,A4,A6,A8,A10,A12,A14,B14,DUMMY2,LOAD,STORE1,STORE2,STORE3,STORE4,STORE5,STORE6,STORE7,STORE8,DUMMY3,BRANCH...)	\
	LOAD				/* Load (Group) */;	\
	faligndata	%A14, %F0, %A14	/* FPA Group */;	\
	inc	%x5			/* IEU0 */;	\
	STORE1				/* Store (optional) */;	\
	faligndata	%F0, %F2, %A0	/* FPA Group */;	\
	srl	%x5, 1, %x5		/* IEU0 */;	\
	add	%sum, %x4, %sum		/* IEU1 */;	\
	fpadd32	%F0, %f0, %F0		/* FPA Group */;	\
	inc	%x6			/* IEU0 */;	\
	STORE2				/* Store (optional) */;	\
	faligndata	%F2, %F4, %A2	/* FPA Group */;	\
	srl	%x6, 1, %x6		/* IEU0 */;	\
	add	%sum, %x5, %sum		/* IEU1 */;	\
	fpadd32	%F2, %f2, %F2		/* FPA Group */;	\
	add	%src, 64, %src		/* IEU0 */;	\
	fcmpgt32	%f0, %F0, %x1	/* FPM */;	\
	add	%dst, 64, %dst		/* IEU1 Group */;	\
	inc	%x7			/* IEU0 */;	\
	STORE3				/* Store (optional) */;	\
	faligndata	%F4, %F6, %A4	/* FPA */;	\
	fpadd32	%F4, %f4, %F4		/* FPA Group */;	\
	add	%sum, %x6, %sum		/* IEU1 */;	\
	fcmpgt32	%f2, %F2, %x2	/* FPM */;	\
	srl	%x7, 1, %x7		/* IEU0 Group */;	\
	inc	%x8			/* IEU1 */;	\
	STORE4				/* Store (optional) */;	\
	faligndata	%F6, %F8, %A6	/* FPA */;	\
	fpadd32	%F6, %f6, %F6		/* FPA Group */;	\
	srl	%x8, 1, %x8		/* IEU0 */;	\
	fcmpgt32	%f4, %F4, %x3	/* FPM */;	\
	add	%sum, %x7, %sum		/* IEU0 Group */;	\
	inc	%x1			/* IEU1 */;	\
	STORE5				/* Store (optional) */;	\
	faligndata	%F8, %F10, %A8	/* FPA */;	\
	fpadd32	%F8, %f8, %F8		/* FPA Group */;	\
	srl	%x1, 1, %x1		/* IEU0 */;	\
	fcmpgt32	%f6, %F6, %x4	/* FPM */;	\
	add	%sum, %x8, %sum		/* IEU0 Group */;	\
	inc	%x2			/* IEU1 */;	\
	STORE6				/* Store (optional) */;	\
	faligndata	%F10, %F12, %A10	/* FPA */;	\
	fpadd32	%F10, %f10, %F10	/* FPA Group */;	\
	srl	%x2, 1, %x2		/* IEU0 */;	\
	fcmpgt32	%f8, %F8, %x5	/* FPM */;	\
	add	%sum, %x1, %sum		/* IEU0 Group */;	\
	inc	%x3			/* IEU1 */;	\
	STORE7				/* Store (optional) */;	\
	faligndata	%F12, %F14, %A12	/* FPA */;	\
	fpadd32	%F12, %f12, %F12	/* FPA Group */;	\
	srl	%x3, 1, %x3		/* IEU0 */;	\
	fcmpgt32	%f10, %F10, %x6	/* FPM */;	\
	add	%sum, %x2, %sum		/* IEU0 Group */;	\
	inc	%x4			/* IEU1 */;	\
	STORE8				/* Store (optional) */;	\
	fmovd	%F14, %B14		/* FPA */;	\
	fpadd32	%F14, %f14, %F14	/* FPA Group */;	\
	srl	%x4, 1, %x4		/* IEU0 */;	\
	fcmpgt32	%f12, %F12, %x7	/* FPM */;	\
	add	%sum, %x3, %sum		/* IEU0 Group */;	\
	subcc	%len, 64, %len		/* IEU1 */;	\
	BRANCH				/* CTI */;	\
	 fcmpgt32	%f14, %F14, %x8	/* FPM Group */;

/* END_THE_TRICK: first half of the reduction after the main loop.
 * Drains the fcmpgt32 masks still pending in %x1..%x8, pairwise adds the
 * eight running sums f0..f14 into S0..S3, then T0/T1, then U0, zeroes %fz,
 * copies FA to FB, and branches to "ett" (inc %x7 sits in the delay slot).
 * The masks it leaves in %x1..%x8 are consumed by END_THE_TRICK2.
 */
#define END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,S0,S1,S2,S3,T0,T1,U0,fz)	\
	inc	%x5			/* IEU0 Group */;	\
	fpadd32	%f2, %f0, %S0		/* FPA */;	\
	add	%sum, %x4, %sum		/* IEU1 */;	\
	srl	%x5, 1, %x5		/* IEU0 Group */;	\
	fpadd32	%f6, %f4, %S1		/* FPA */;	\
	inc	%x6			/* IEU1 */;	\
	fpadd32	%f10, %f8, %S2		/* FPA Group */;	\
	add	%sum, %x5, %sum		/* IEU0 */;	\
	fcmpgt32	%f0, %S0, %x1	/* FPM */;	\
	fpadd32	%f14, %f12, %S3		/* FPA Group */;	\
	srl	%x6, 1, %x6		/* IEU0 */;	\
	fcmpgt32	%f4, %S1, %x2	/* FPM */;	\
	add	%sum, %x6, %sum		/* IEU0 Group */;	\
	fzero	%fz			/* FPA */;	\
	fcmpgt32	%f8, %S2, %x3	/* FPM */;	\
	inc	%x7			/* IEU0 Group */;	\
	inc	%x8			/* IEU1 */;	\
	srl	%x7, 1, %x7		/* IEU0 Group */;	\
	inc	%x1			/* IEU1 */;	\
	fpadd32	%S0, %S1, %T0		/* FPA */;	\
	fpadd32	%S2, %S3, %T1		/* FPA Group */;	\
	add	%sum, %x7, %sum		/* IEU0 */;	\
	fcmpgt32	%f12, %S3, %x4	/* FPM */;	\
	srl	%x8, 1, %x8		/* IEU0 Group */;	\
	inc	%x2			/* IEU1 */;	\
	srl	%x1, 1, %x1		/* IEU0 Group */;	\
	add	%sum, %x8, %sum		/* IEU1 */;	\
	add	%sum, %x1, %sum		/* IEU0 Group */;	\
	fcmpgt32	%S0, %T0, %x5	/* FPM */;	\
	srl	%x2, 1, %x2		/* IEU0 Group */;	\
	fcmpgt32	%S2, %T1, %x6	/* FPM */;	\
	inc	%x3			/* IEU0 Group */;	\
	add	%sum, %x2, %sum		/* IEU1 */;	\
	srl	%x3, 1, %x3		/* IEU0 Group */;	\
	inc	%x4			/* IEU1 */;	\
	fpadd32	%T0, %T1, %U0		/* FPA Group */;	\
	add	%sum, %x3, %sum		/* IEU0 */;	\
	fcmpgt32	%fz, %f2, %x7	/* FPM */;	\
	srl	%x4, 1, %x4		/* IEU0 Group */;	\
	fcmpgt32	%fz, %f6, %x8	/* FPM */;	\
	inc	%x5			/* IEU0 Group */;	\
	add	%sum, %x4, %sum		/* IEU1 */;	\
	srl	%x5, 1, %x5		/* IEU0 Group */;	\
	fcmpgt32	%fz, %f10, %x1	/* FPM */;	\
	inc	%x6			/* IEU0 Group */;	\
	add	%sum, %x5, %sum		/* IEU1 */;	\
	fmovd	%FA, %FB		/* FPA Group */;	\
	fcmpgt32	%fz, %f14, %x2	/* FPM */;	\
	srl	%x6, 1, %x6		/* IEU0 Group */;	\
	ba,pt	%xcc, ett		/* CTI */;	\
	 inc	%x7			/* IEU1 */;

/* END_THE_TRICK1: END_THE_TRICK with the fixed temporaries
 * S0..S3 = f48..f54, T0/T1 = f56/f58, U0 = f60, fz = f62.
 */
#define END_THE_TRICK1(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB)	\
	END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,f48,f50,f52,f54,f56,f58,f60,f62)

/* END_THE_TRICK2: second half of the reduction.
 * Adds U0 and U1 into V0, bounces V0 through [%sp + STACKOFF] into %x8,
 * applies the remaining mask corrections to %sum (both adds and subtracts),
 * and finally adds %x8 into %sum with an end-around carry on %xcc.
 * NOTE(review): the add/sub pattern of the corrections is reproduced
 * verbatim; its derivation is not re-verified here.
 * Defines local label 33, so it can be expanded only where that does not
 * clash with a nearby "33:".
 */
#define END_THE_TRICK2(S0,S1,S2,S3,T0,T1,U0,U1,V0,fz)	\
	fpadd32	%U0, %U1, %V0		/* FPA Group */;	\
	srl	%x7, 1, %x7		/* IEU0 */;	\
	add	%sum, %x6, %sum		/* IEU1 */;	\
	std	%V0, [%sp + STACKOFF]	/* Store Group */;	\
	inc	%x8			/* IEU0 */;	\
	sub	%sum, %x7, %sum		/* IEU1 */;	\
	srl	%x8, 1, %x8		/* IEU0 Group */;	\
	fcmpgt32	%fz, %S1, %x3	/* FPM */;	\
	inc	%x1			/* IEU0 Group */;	\
	fcmpgt32	%fz, %S3, %x4	/* FPM */;	\
	srl	%x1, 1, %x1		/* IEU0 Group */;	\
	sub	%sum, %x8, %sum		/* IEU1 */;	\
	ldx	[%sp + STACKOFF], %x8	/* Load Group */;	\
	inc	%x2			/* IEU0 */;	\
	sub	%sum, %x1, %sum		/* IEU1 */;	\
	srl	%x2, 1, %x2		/* IEU0 Group */;	\
	fcmpgt32	%fz, %T1, %x5	/* FPM */;	\
	inc	%x3			/* IEU0 Group */;	\
	fcmpgt32	%T0, %U0, %x6	/* FPM */;	\
	srl	%x3, 1, %x3		/* IEU0 Group */;	\
	sub	%sum, %x2, %sum		/* IEU1 */;	\
	inc	%x4			/* IEU0 Group */;	\
	sub	%sum, %x3, %sum		/* IEU1 */;	\
	srl	%x4, 1, %x4		/* IEU0 Group */;	\
	fcmpgt32	%fz, %U1, %x7	/* FPM */;	\
	inc	%x5			/* IEU0 Group */;	\
	fcmpgt32	%U0, %V0, %x1	/* FPM */;	\
	srl	%x5, 1, %x5		/* IEU0 Group */;	\
	sub	%sum, %x4, %sum		/* IEU1 */;	\
	sub	%sum, %x5, %sum		/* IEU0 Group */;	\
	fcmpgt32	%fz, %V0, %x2	/* FPM */;	\
	inc	%x6			/* IEU0 Group */;	\
	inc	%x7			/* IEU1 */;	\
	srl	%x6, 1, %x6		/* IEU0 Group */;	\
	inc	%x1			/* IEU1 */;	\
	srl	%x7, 1, %x7		/* IEU0 Group */;	\
	add	%sum, %x6, %sum		/* IEU1 */;	\
	srl	%x1, 1, %x1		/* IEU0 Group */;	\
	sub	%sum, %x7, %sum		/* IEU1 */;	\
	inc	%x2			/* IEU0 Group */;	\
	add	%sum, %x1, %sum		/* IEU1 */;	\
	srl	%x2, 1, %x2		/* IEU0 Group */;	\
	sub	%sum, %x2, %sum		/* IEU0 Group */;	\
	addcc	%sum, %x8, %sum		/* IEU1 Group */;	\
	bcs,a,pn	%xcc, 33f	/* CTI */;	\
	 add	%sum, 1, %sum		/* IEU0 (Group) */;	\
33:					/* That's it */;

	.text
	.globl	csum_partial_copy_vis
	.align	32
/* csum_partial_copy_vis: copy %len bytes from [%src] to [%dst] while
 * accumulating an Internet-style ones-complement sum.
 *
 * In:   %o0 = src, %o1 = dst, %o2 = len, %o3 = sum (initial partial sum)
 *       %asi should be either ASI_P or ASI_AIUS for csum_partial_copy resp.
 *       csum_partial_copy_from_user; every source load goes through %asi.
 * Out:  %o0 = the sum folded to 32 bits (label 25 below).
 * Uses: %g1-%g5, %g7, %o4, %o5 as scratch (the x1..x8 aliases above), the
 *       double FP registers %f0..%f62, %gsr (written by alignaddr) and %asi
 *       (switched at visNs and switched back at "ett").  The kernel build
 *       reloads %g4 with %uhi(PAGE_OFFSET) << 32 before returning and
 *       brackets the FP section with VISEntry/VISExit from <asm/visasm.h>.
 *
 * This assumes that !((%src^%dst)&3) && !((%src|%dst)&1) && %len >= 256
 *
 * Layout: the eight prologues vis0s..vis7s are each ".align 2048" and are
 * reached by a computed jump to vis0s + ((%src & 0x38) << 8), so every
 * visN section must stay within 2048 bytes and the sections must remain
 * consecutive and in order.
 */
csum_partial_copy_vis:
	/* Bring %dst up to 8-byte alignment: optional halfword, then word. */
	andcc	%dst, 7, %g0		/* IEU1 Group */
	be,pt	%icc, 4f		/* CTI */
	 and	%dst, 0x38, %o4		/* IEU0 */
	mov	1, %g5			/* IEU0 Group */
	andcc	%dst, 2, %g0		/* IEU1 */
	be,pt	%icc, 1f		/* CTI */
	 and	%dst, 4, %g7		/* IEU0 Group */
	lduha	[%src] %asi, %g2	/* Load */
	sub	%len, 2, %len		/* IEU0 Group */
	add	%dst, 2, %dst		/* IEU1 */
	andcc	%dst, 4, %g7		/* IEU1 Group */
	sll	%g5, 16, %g5		/* IEU0 */
	sth	%g2, [%dst - 2]		/* Store Group */
	sll	%g2, 16, %g2		/* IEU0 */
	add	%src, 2, %src		/* IEU1 */
	addcc	%g2, %sum, %sum		/* IEU1 Group */
	bcs,a,pn	%icc, 1f	/* CTI */
	 add	%sum, %g5, %sum		/* IEU0 */
1:	lduwa	[%src] %asi, %g2	/* Load */
	brz,a,pn	%g7, 4f		/* CTI+IEU1 Group */
	 and	%dst, 0x38, %o4		/* IEU0 */
	add	%dst, 4, %dst		/* IEU0 Group */
	sub	%len, 4, %len		/* IEU1 */
	addcc	%g2, %sum, %sum		/* IEU1 Group */
	bcs,a,pn	%icc, 1f	/* CTI */
	 add	%sum, 1, %sum		/* IEU0 */
1:	and	%dst, 0x38, %o4		/* IEU0 Group */
	stw	%g2, [%dst - 4]		/* Store */
	add	%src, 4, %src		/* IEU1 */
4:
#ifdef __KERNEL__
	VISEntry
#endif
	/* Round %src down with alignaddr.  If that moved %src, the word now
	 * at [%src] lies before the real data, so subtract it from %sum (with
	 * borrow) up front.
	 */
	mov	%src, %g7		/* IEU1 Group */
	fzero	%f48			/* FPA */
	alignaddr	%src, %g0, %src	/* Single Group */
	subcc	%g7, %src, %g7		/* IEU1 Group */
	be,pt	%xcc, 1f		/* CTI */
	 mov	0x40, %g1		/* IEU0 */
	lduwa	[%src] %asi, %g2	/* Load Group */
	subcc	%sum, %g2, %sum		/* IEU1 Group+load stall*/
	bcs,a,pn	%icc, 1f	/* CTI */
	 sub	%sum, 1, %sum		/* IEU0 */
1:	srl	%sum, 0, %sum		/* IEU0 Group */
	clr	%g5			/* IEU1 */
	/* %o4 = %dst & 0x38.  If non-zero, copy 8, 16 and/or 32 bytes so that
	 * %dst reaches a 64-byte boundary; %g1 = 0x40 - %o4 selects the steps.
	 */
	brz,pn	%o4, 3f			/* CTI+IEU1 Group */
	 sub	%g1, %o4, %g1		/* IEU0 */
	ldda	[%src] %asi, %f0	/* Load */
	clr	%o4			/* IEU0 Group */
	andcc	%dst, 8, %g0		/* IEU1 */
	be,pn	%icc, 1f		/* CTI */
	 ldda	[%src + 8] %asi, %f2	/* Load Group */
	add	%src, 8, %src		/* IEU0 */
	sub	%len, 8, %len		/* IEU1 */
	fpadd32	%f0, %f48, %f50		/* FPA */
	addcc	%dst, 8, %dst		/* IEU1 Group */
	faligndata	%f0, %f2, %f16	/* FPA */
	fcmpgt32	%f48, %f50, %o4	/* FPM Group */
	fmovd	%f2, %f0		/* FPA Group */
	ldda	[%src + 8] %asi, %f2	/* Load */
	std	%f16, [%dst - 8]	/* Store */
	fmovd	%f50, %f48		/* FPA */
1:	andcc	%g1, 0x10, %g0		/* IEU1 Group */
	be,pn	%icc, 1f		/* CTI */
	 and	%g1, 0x20, %g1		/* IEU0 */
	fpadd32	%f0, %f48, %f50		/* FPA */
	ldda	[%src + 16] %asi, %f4	/* Load Group */
	add	%src, 16, %src		/* IEU0 */
	add	%dst, 16, %dst		/* IEU1 */
	faligndata	%f0, %f2, %f16	/* FPA */
	fcmpgt32	%f48, %f50, %g5	/* FPM Group */
	sub	%len, 16, %len		/* IEU0 */
	inc	%o4			/* IEU1 */
	std	%f16, [%dst - 16]	/* Store Group */
	fpadd32	%f2, %f50, %f48		/* FPA */
	srl	%o4, 1, %o5		/* IEU0 */
	faligndata	%f2, %f4, %f18	/* FPA Group */
	std	%f18, [%dst - 8]	/* Store */
	fcmpgt32	%f50, %f48, %o4	/* FPM Group */
	add	%o5, %sum, %sum		/* IEU0 */
	ldda	[%src + 8] %asi, %f2	/* Load */
	fmovd	%f4, %f0		/* FPA */
1:	brz,a,pn	%g1, 4f		/* CTI+IEU1 Group */
	 rd	%asi, %g2		/* LSU Group + 4 bubbles*/
	inc	%g5			/* IEU0 */
	fpadd32	%f0, %f48, %f50		/* FPA */
	ldda	[%src + 16] %asi, %f4	/* Load Group */
	srl	%g5, 1, %g5		/* IEU0 */
	add	%dst, 32, %dst		/* IEU1 */
	faligndata	%f0, %f2, %f16	/* FPA */
	fcmpgt32	%f48, %f50, %o5	/* FPM Group */
	inc	%o4			/* IEU0 */
	ldda	[%src + 24] %asi, %f6	/* Load */
	srl	%o4, 1, %o4		/* IEU0 Group */
	add	%g5, %sum, %sum		/* IEU1 */
	ldda	[%src + 32] %asi, %f8	/* Load */
	fpadd32	%f2, %f50, %f48		/* FPA */
	faligndata	%f2, %f4, %f18	/* FPA Group */
	sub	%len, 32, %len		/* IEU0 */
	std	%f16, [%dst - 32]	/* Store */
	fcmpgt32	%f50, %f48, %g3	/* FPM Group */
	inc	%o5			/* IEU0 */
	add	%o4, %sum, %sum		/* IEU1 */
	fpadd32	%f4, %f48, %f50		/* FPA */
	faligndata	%f4, %f6, %f20	/* FPA Group */
	srl	%o5, 1, %o5		/* IEU0 */
	fcmpgt32	%f48, %f50, %g5	/* FPM Group */
	add	%o5, %sum, %sum		/* IEU0 */
	std	%f18, [%dst - 24]	/* Store */
	fpadd32	%f6, %f50, %f48		/* FPA */
	inc	%g3			/* IEU0 Group */
	std	%f20, [%dst - 16]	/* Store */
	add	%src, 32, %src		/* IEU1 */
	faligndata	%f6, %f8, %f22	/* FPA */
	fcmpgt32	%f50, %f48, %o4	/* FPM Group */
	srl	%g3, 1, %g3		/* IEU0 */
	std	%f22, [%dst - 8]	/* Store */
	add	%g3, %sum, %sum		/* IEU0 Group */
3:	rd	%asi, %g2		/* LSU Group + 4 bubbles*/
	/* Dispatch on the doubleword offset of %src within its 64-byte block:
	 * target = vis0s + ((%src & 0x38) << 8), i.e. visNs for N = 0..7.
	 * %len is pre-biased by -0xc0 (192); the loop exit paths add it back.
	 * %f32 is zeroed in the jmpl delay slot.
	 */
#ifdef __KERNEL__
4:	sethi	%hi(vis0s), %g7		/* IEU0 Group */
	or	%g2, ASI_BLK_OR, %g2	/* IEU1 */
#else
4:	rd	%pc, %g7		/* LSU Group + 4 bubbles*/
#endif
	inc	%g5			/* IEU0 Group */
	and	%src, 0x38, %g3		/* IEU1 */
	membar	#StoreLoad		/* LSU Group */
	srl	%g5, 1, %g5		/* IEU0 */
	inc	%o4			/* IEU1 */
	sll	%g3, 8, %g3		/* IEU0 Group */
	sub	%len, 0xc0, %len	/* IEU1 */
	addcc	%g5, %sum, %sum		/* IEU1 Group */
	srl	%o4, 1, %o4		/* IEU0 */
	add	%g7, %g3, %g7		/* IEU0 Group */
	add	%o4, %sum, %sum		/* IEU1 */
#ifdef __KERNEL__
	jmpl	%g7 + %lo(vis0s), %g0	/* CTI+IEU1 Group */
#else
	jmpl	%g7 + (vis0s - 4b), %g0	/* CTI+IEU1 Group */
#endif
	 fzero	%f32			/* FPA */

	/* Variant 0: (%src & 0x38) == 0.  In every variant the running sum in
	 * %f48 is seeded into the first step and the three register groups
	 * f0-f14 / f16-f30 / f32-f46 rotate roles through the unrolled loop;
	 * visNe1/e2/e3 are the matching exit steps, which finish the stores
	 * and jump to e3/e1/e2.
	 */
	.align	2048
vis0s:	wr	%g2, ASI_BLK_XOR, %asi	/* LSU Group */
	add	%src, 128, %src		/* IEU0 Group */
	ldda	[%src-128] %asi, %f0	/* Load Group */
	ldda	[%src-64] %asi, %f16	/* Load Group */
	fmovd	%f48, %f62		/* FPA Group f0 available*/
	faligndata	%f0, %f2, %f48	/* FPA Group f2 available*/
	fcmpgt32	%f32, %f2, %x1	/* FPM Group f4 available*/
	fpadd32	%f0, %f62, %f0		/* FPA */
	fcmpgt32	%f32, %f4, %x2	/* FPM Group f6 available*/
	faligndata	%f2, %f4, %f50	/* FPA */
	fcmpgt32	%f62, %f0, %x3	/* FPM Group f8 available*/
	faligndata	%f4, %f6, %f52	/* FPA */
	fcmpgt32	%f32, %f6, %x4	/* FPM Group f10 available*/
	inc	%x1			/* IEU0 */
	faligndata	%f6, %f8, %f54	/* FPA */
	fcmpgt32	%f32, %f8, %x5	/* FPM Group f12 available*/
	srl	%x1, 1, %x1		/* IEU0 */
	inc	%x2			/* IEU1 */
	faligndata	%f8, %f10, %f56	/* FPA */
	fcmpgt32	%f32, %f10, %x6	/* FPM Group f14 available*/
	srl	%x2, 1, %x2		/* IEU0 */
	add	%sum, %x1, %sum		/* IEU1 */
	faligndata	%f10, %f12, %f58	/* FPA */
	fcmpgt32	%f32, %f12, %x7	/* FPM Group */
	inc	%x3			/* IEU0 */
	add	%sum, %x2, %sum		/* IEU1 */
	faligndata	%f12, %f14, %f60	/* FPA */
	fcmpgt32	%f32, %f14, %x8	/* FPM Group */
	srl	%x3, 1, %x3		/* IEU0 */
	inc	%x4			/* IEU1 */
	fmovd	%f14, %f62		/* FPA */
	srl	%x4, 1, %x4		/* IEU0 Group */
	add	%sum, %x3, %sum		/* IEU1 */
vis0:	DO_THE_TRICK(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
			,f48,f50,f52,f54,f56,f58,f60,f62,f62,
			,LDBLK(f32),	STBLK,,,,,,,,
			,bcs,pn %icc, vis0e1)
	DO_THE_TRICK(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
			,f48,f50,f52,f54,f56,f58,f60,f62,f62,
			,LDBLK(f0),	STBLK,,,,,,,,
			,bcs,pn %icc, vis0e2)
	DO_THE_TRICK(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
			,f48,f50,f52,f54,f56,f58,f60,f62,f62,
			,LDBLK(f16),	STBLK,,,,,,,,
			,bcc,pt %icc, vis0)
vis0e3: DO_THE_TRICK(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
			,f48,f50,f52,f54,f56,f58,f60,f62,f32,
			,SYNC,		STBLK,ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48),
			,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e2)
vis0e1: DO_THE_TRICK(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
			,f48,f50,f52,f54,f56,f58,f60,f62,f0,
			,SYNC,		STBLK,ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48),
			,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e3)
vis0e2: DO_THE_TRICK(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
			,f48,f50,f52,f54,f56,f58,f60,f62,f16,
			,SYNC,		STBLK,ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48),
			,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e1)

	/* Variant 1: (%src & 0x38) == 0x08. */
	.align	2048
vis1s:	wr	%g2, ASI_BLK_XOR, %asi	/* LSU Group */
	add	%src, 128 - 8, %src	/* IEU0 Group */
	ldda	[%src-128] %asi, %f0	/* Load Group */
	ldda	[%src-64] %asi, %f16	/* Load Group */
	fmovd	%f0, %f58		/* FPA Group */
	fmovd	%f48, %f0		/* FPA Group */
	fcmpgt32	%f32, %f2, %x2	/* FPM Group */
	faligndata	%f2, %f4, %f48	/* FPA */
	fcmpgt32	%f32, %f4, %x3	/* FPM Group */
	faligndata	%f4, %f6, %f50	/* FPA */
	fcmpgt32	%f32, %f6, %x4	/* FPM Group */
	faligndata	%f6, %f8, %f52	/* FPA */
	fcmpgt32	%f32, %f8, %x5	/* FPM Group */
	inc	%x2			/* IEU1 */
	faligndata	%f8, %f10, %f54	/* FPA */
	fcmpgt32	%f32, %f10, %x6	/* FPM Group */
	srl	%x2, 1, %x2		/* IEU0 */
	faligndata	%f10, %f12, %f56	/* FPA */
	fcmpgt32	%f32, %f12, %x7	/* FPM Group */
	inc	%x3			/* IEU0 */
	add	%sum, %x2, %sum		/* IEU1 */
	faligndata	%f12, %f14, %f58	/* FPA */
	fcmpgt32	%f32, %f14, %x8	/* FPM Group */
	srl	%x3, 1, %x3		/* IEU0 */
	inc	%x4			/* IEU1 */
	fmovd	%f14, %f60		/* FPA */
	srl	%x4, 1, %x4		/* IEU0 Group */
	add	%sum, %x3, %sum		/* IEU1 */
vis1:	DO_THE_TRICK(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
			,f62,f48,f50,f52,f54,f56,f58,f60,f60,
			,LDBLK(f32),	,STBLK,,,,,,,
			,bcs,pn %icc, vis1e1)
	DO_THE_TRICK(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
			,f62,f48,f50,f52,f54,f56,f58,f60,f60,
			,LDBLK(f0),	,STBLK,,,,,,,
			,bcs,pn %icc, vis1e2)
	DO_THE_TRICK(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
			,f62,f48,f50,f52,f54,f56,f58,f60,f60,
			,LDBLK(f16),	,STBLK,,,,,,,
			,bcc,pt %icc, vis1)
vis1e3: DO_THE_TRICK(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
			,f62,f48,f50,f52,f54,f56,f58,f60,f32,
			,SYNC,		,STBLK,ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),
			,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e2)
vis1e1: DO_THE_TRICK(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
			,f62,f48,f50,f52,f54,f56,f58,f60,f0,
			,SYNC,		,STBLK,ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),
			,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e3)
vis1e2: DO_THE_TRICK(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
			,f62,f48,f50,f52,f54,f56,f58,f60,f16,
			,SYNC,		,STBLK,ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),
			,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e1)

	/* Variant 2: (%src & 0x38) == 0x10.  From here on the prologue backs
	 * %dst up by 64 because STBLK sits after the macro's "add %dst, 64".
	 * fpsub32 r, r, r zeroes a register that must not contribute.
	 */
	.align	2048
vis2s:	wr	%g2, ASI_BLK_XOR, %asi	/* LSU Group */
	add	%src, 128 - 16, %src	/* IEU0 Group */
	ldda	[%src-128] %asi, %f0	/* Load Group */
	ldda	[%src-64] %asi, %f16	/* Load Group */
	fmovd	%f0, %f56		/* FPA Group */
	fmovd	%f48, %f0		/* FPA Group */
	sub	%dst, 64, %dst		/* IEU0 */
	fpsub32	%f2, %f2, %f2		/* FPA Group */
	fcmpgt32	%f32, %f4, %x3	/* FPM Group */
	faligndata	%f4, %f6, %f48	/* FPA */
	fcmpgt32	%f32, %f6, %x4	/* FPM Group */
	faligndata	%f6, %f8, %f50	/* FPA */
	fcmpgt32	%f32, %f8, %x5	/* FPM Group */
	faligndata	%f8, %f10, %f52	/* FPA */
	fcmpgt32	%f32, %f10, %x6	/* FPM Group */
	faligndata	%f10, %f12, %f54	/* FPA */
	fcmpgt32	%f32, %f12, %x7	/* FPM Group */
	inc	%x3			/* IEU0 */
	faligndata	%f12, %f14, %f56	/* FPA */
	fcmpgt32	%f32, %f14, %x8	/* FPM Group */
	srl	%x3, 1, %x3		/* IEU0 */
	inc	%x4			/* IEU1 */
	fmovd	%f14, %f58		/* FPA */
	srl	%x4, 1, %x4		/* IEU0 Group */
	add	%sum, %x3, %sum		/* IEU1 */
vis2:	DO_THE_TRICK(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
			,f60,f62,f48,f50,f52,f54,f56,f58,f58,
			,LDBLK(f32),	,,STBLK,,,,,,
			,bcs,pn %icc, vis2e1)
	DO_THE_TRICK(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
			,f60,f62,f48,f50,f52,f54,f56,f58,f58,
			,LDBLK(f0),	,,STBLK,,,,,,
			,bcs,pn %icc, vis2e2)
	DO_THE_TRICK(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
			,f60,f62,f48,f50,f52,f54,f56,f58,f58,
			,LDBLK(f16),	,,STBLK,,,,,,
			,bcc,pt %icc, vis2)
vis2e3: DO_THE_TRICK(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
			,f60,f62,f48,f50,f52,f54,f56,f58,f32,
			,SYNC,		,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96),
			,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e2)
vis2e1: DO_THE_TRICK(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
			,f60,f62,f48,f50,f52,f54,f56,f58,f0,
			,SYNC,		,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96),
			,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e3)
vis2e2: DO_THE_TRICK(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
			,f60,f62,f48,f50,f52,f54,f56,f58,f16,
			,SYNC,		,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96),
			,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e1)

	/* Variant 3: (%src & 0x38) == 0x18. */
	.align	2048
vis3s:	wr	%g2, ASI_BLK_XOR, %asi	/* LSU Group */
	add	%src, 128 - 24, %src	/* IEU0 Group */
	ldda	[%src-128] %asi, %f0	/* Load Group */
	ldda	[%src-64] %asi, %f16	/* Load Group */
	fmovd	%f0, %f54		/* FPA Group */
	fmovd	%f48, %f0		/* FPA Group */
	sub	%dst, 64, %dst		/* IEU0 */
	fpsub32	%f2, %f2, %f2		/* FPA Group */
	fpsub32	%f4, %f4, %f4		/* FPA Group */
	fcmpgt32	%f32, %f6, %x4	/* FPM Group */
	faligndata	%f6, %f8, %f48	/* FPA */
	fcmpgt32	%f32, %f8, %x5	/* FPM Group */
	faligndata	%f8, %f10, %f50	/* FPA */
	fcmpgt32	%f32, %f10, %x6	/* FPM Group */
	faligndata	%f10, %f12, %f52	/* FPA */
	fcmpgt32	%f32, %f12, %x7	/* FPM Group */
	faligndata	%f12, %f14, %f54	/* FPA */
	fcmpgt32	%f32, %f14, %x8	/* FPM Group */
	fmovd	%f14, %f56		/* FPA */
	inc	%x4			/* IEU0 */
	srl	%x4, 1, %x4		/* IEU0 Group */
vis3:	DO_THE_TRICK(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
			,f58,f60,f62,f48,f50,f52,f54,f56,f56,
			,LDBLK(f32),	,,,STBLK,,,,,
			,bcs,pn %icc, vis3e1)
	DO_THE_TRICK(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
			,f58,f60,f62,f48,f50,f52,f54,f56,f56,
			,LDBLK(f0),	,,,STBLK,,,,,
			,bcs,pn %icc, vis3e2)
	DO_THE_TRICK(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
			,f58,f60,f62,f48,f50,f52,f54,f56,f56,
			,LDBLK(f16),	,,,STBLK,,,,,
			,bcc,pt %icc, vis3)
vis3e3: DO_THE_TRICK(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
			,f58,f60,f62,f48,f50,f52,f54,f56,f32,
			,SYNC,		,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),
			,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e2)
vis3e1: DO_THE_TRICK(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
			,f58,f60,f62,f48,f50,f52,f54,f56,f0,
			,SYNC,		,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),
			,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e3)
vis3e2: DO_THE_TRICK(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
			,f58,f60,f62,f48,f50,f52,f54,f56,f16,
			,SYNC,		,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),
			,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e1)

	/* Variant 4: (%src & 0x38) == 0x20. */
	.align	2048
vis4s:	wr	%g2, ASI_BLK_XOR, %asi	/* LSU Group */
	add	%src, 128 - 32, %src	/* IEU0 Group */
	ldda	[%src-128] %asi, %f0	/* Load Group */
	ldda	[%src-64] %asi, %f16	/* Load Group */
	fmovd	%f0, %f52		/* FPA Group */
	fmovd	%f48, %f0		/* FPA Group */
	sub	%dst, 64, %dst		/* IEU0 */
	fpsub32	%f2, %f2, %f2		/* FPA Group */
	fpsub32	%f4, %f4, %f4		/* FPA Group */
	fpsub32	%f6, %f6, %f6		/* FPA Group */
	clr	%x4			/* IEU0 */
	fcmpgt32	%f32, %f8, %x5	/* FPM Group */
	faligndata	%f8, %f10, %f48	/* FPA */
	fcmpgt32	%f32, %f10, %x6	/* FPM Group */
	faligndata	%f10, %f12, %f50	/* FPA */
	fcmpgt32	%f32, %f12, %x7	/* FPM Group */
	faligndata	%f12, %f14, %f52	/* FPA */
	fcmpgt32	%f32, %f14, %x8	/* FPM Group */
	fmovd	%f14, %f54		/* FPA */
vis4:	DO_THE_TRICK(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
			,f56,f58,f60,f62,f48,f50,f52,f54,f54,
			,LDBLK(f32),	,,,,STBLK,,,,
			,bcs,pn %icc, vis4e1)
	DO_THE_TRICK(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
			,f56,f58,f60,f62,f48,f50,f52,f54,f54,
			,LDBLK(f0),	,,,,STBLK,,,,
			,bcs,pn %icc, vis4e2)
	DO_THE_TRICK(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
			,f56,f58,f60,f62,f48,f50,f52,f54,f54,
			,LDBLK(f16),	,,,,STBLK,,,,
			,bcc,pt %icc, vis4)
vis4e3: DO_THE_TRICK(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
			,f56,f58,f60,f62,f48,f50,f52,f54,f32,
			,SYNC,		,,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),
			,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e2)
vis4e1: DO_THE_TRICK(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
			,f56,f58,f60,f62,f48,f50,f52,f54,f0,
			,SYNC,		,,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),
			,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e3)
vis4e2: DO_THE_TRICK(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
			,f56,f58,f60,f62,f48,f50,f52,f54,f16,
			,SYNC,		,,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),
			,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e1)

	/* Variant 5: (%src & 0x38) == 0x28.  Variants 5-7 load the few leading
	 * doublewords individually before switching %asi, and zero the unused
	 * registers with fmuld/faddd on %f32 (which holds zero).
	 */
	.align	2048
vis5s:	add	%src, 128 - 40, %src	/* IEU0 Group */
	ldda	[%src-88] %asi, %f10	/* Load Group */
	ldda	[%src-80] %asi, %f12	/* Load Group */
	ldda	[%src-72] %asi, %f14	/* Load Group */
	wr	%g2, ASI_BLK_XOR, %asi	/* LSU Group */
	ldda	[%src-64] %asi, %f16	/* Load Group */
	fmovd	%f48, %f0		/* FPA Group */
	fmuld	%f32, %f32, %f2		/* FPM */
	clr	%x4			/* IEU0 */
	faddd	%f32, %f32, %f4		/* FPA Group */
	fmuld	%f32, %f32, %f6		/* FPM */
	clr	%x5			/* IEU0 */
	faddd	%f32, %f32, %f8		/* FPA Group */
	fcmpgt32	%f32, %f10, %x6	/* FPM Group */
	sub	%dst, 64, %dst		/* IEU0 */
	faligndata	%f10, %f12, %f48	/* FPA */
	fcmpgt32	%f32, %f12, %x7	/* FPM Group */
	faligndata	%f12, %f14, %f50	/* FPA */
	fcmpgt32	%f32, %f14, %x8	/* FPM Group */
	fmovd	%f14, %f52		/* FPA */
vis5:	DO_THE_TRICK(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
			,f54,f56,f58,f60,f62,f48,f50,f52,f52,
			,LDBLK(f32),	,,,,,STBLK,,,
			,bcs,pn %icc, vis5e1)
	DO_THE_TRICK(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
			,f54,f56,f58,f60,f62,f48,f50,f52,f52,
			,LDBLK(f0),	,,,,,STBLK,,,
			,bcs,pn %icc, vis5e2)
	DO_THE_TRICK(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
			,f54,f56,f58,f60,f62,f48,f50,f52,f52,
			,LDBLK(f16),	,,,,,STBLK,,,
			,bcc,pt %icc, vis5)
vis5e3: DO_THE_TRICK(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
			,f54,f56,f58,f60,f62,f48,f50,f52,f32,
			,SYNC,		,,,,,STBLK,ST(f48,64),ST(f50,72),
			,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e2)
vis5e1: DO_THE_TRICK(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
			,f54,f56,f58,f60,f62,f48,f50,f52,f0,
			,SYNC,		,,,,,STBLK,ST(f48,64),ST(f50,72),
			,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e3)
vis5e2: DO_THE_TRICK(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
			,f54,f56,f58,f60,f62,f48,f50,f52,f16,
			,SYNC,		,,,,,STBLK,ST(f48,64),ST(f50,72),
			,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e1)

	/* Variant 6: (%src & 0x38) == 0x30. */
	.align	2048
vis6s:	add	%src, 128 - 48, %src	/* IEU0 Group */
	ldda	[%src-80] %asi, %f12	/* Load Group */
	ldda	[%src-72] %asi, %f14	/* Load Group */
	wr	%g2, ASI_BLK_XOR, %asi	/* LSU Group */
	ldda	[%src-64] %asi, %f16	/* Load Group */
	fmovd	%f48, %f0		/* FPA Group */
	fmuld	%f32, %f32, %f2		/* FPM */
	clr	%x4			/* IEU0 */
	faddd	%f32, %f32, %f4		/* FPA Group */
	fmuld	%f32, %f32, %f6		/* FPM */
	clr	%x5			/* IEU0 */
	faddd	%f32, %f32, %f8		/* FPA Group */
	fmuld	%f32, %f32, %f10	/* FPM */
	clr	%x6			/* IEU0 */
	fcmpgt32	%f32, %f12, %x7	/* FPM Group */
	sub	%dst, 64, %dst		/* IEU0 */
	fcmpgt32	%f32, %f14, %x8	/* FPM Group */
	faligndata	%f12, %f14, %f48	/* FPA */
	fmovd	%f14, %f50		/* FPA Group */
vis6:	DO_THE_TRICK(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
			,f52,f54,f56,f58,f60,f62,f48,f50,f50,
			,LDBLK(f32),	,,,,,,STBLK,,
			,bcs,pn %icc, vis6e1)
	DO_THE_TRICK(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
			,f52,f54,f56,f58,f60,f62,f48,f50,f50,
			,LDBLK(f0),	,,,,,,STBLK,,
			,bcs,pn %icc, vis6e2)
	DO_THE_TRICK(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
			,f52,f54,f56,f58,f60,f62,f48,f50,f50,
			,LDBLK(f16),	,,,,,,STBLK,,
			,bcc,pt %icc, vis6)
vis6e3: DO_THE_TRICK(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
			,f52,f54,f56,f58,f60,f62,f48,f50,f32,
			,SYNC,		,,,,,,STBLK,ST(f48,64),
			,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e2)
vis6e1: DO_THE_TRICK(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
			,f52,f54,f56,f58,f60,f62,f48,f50,f0,
			,SYNC,		,,,,,,STBLK,ST(f48,64),
			,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e3)
vis6e2: DO_THE_TRICK(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
			,f52,f54,f56,f58,f60,f62,f48,f50,f16,
			,SYNC,		,,,,,,STBLK,ST(f48,64),
			,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e1)

	/* Variant 7: (%src & 0x38) == 0x38. */
	.align	2048
vis7s:	add	%src, 128 - 56, %src	/* IEU0 Group */
	ldda	[%src-72] %asi, %f14	/* Load Group */
	wr	%g2, ASI_BLK_XOR, %asi	/* LSU Group */
	ldda	[%src-64] %asi, %f16	/* Load Group */
	fmovd	%f48, %f0		/* FPA Group */
	fmuld	%f32, %f32, %f2		/* FPM */
	clr	%x4			/* IEU0 */
	faddd	%f32, %f32, %f4		/* FPA Group */
	fmuld	%f32, %f32, %f6		/* FPM */
	clr	%x5			/* IEU0 */
	faddd	%f32, %f32, %f8		/* FPA Group */
	fmuld	%f32, %f32, %f10	/* FPM */
	clr	%x6			/* IEU0 */
	faddd	%f32, %f32, %f12	/* FPA Group */
	clr	%x7			/* IEU0 */
	fcmpgt32	%f32, %f14, %x8	/* FPM Group */
	sub	%dst, 64, %dst		/* IEU0 */
	fmovd	%f14, %f48		/* FPA */
vis7:	DO_THE_TRICK(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
			,f50,f52,f54,f56,f58,f60,f62,f48,f48,
			,LDBLK(f32),	,,,,,,,STBLK,
			,bcs,pn %icc, vis7e1)
	DO_THE_TRICK(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
			,f50,f52,f54,f56,f58,f60,f62,f48,f48,
			,LDBLK(f0),	,,,,,,,STBLK,
			,bcs,pn %icc, vis7e2)
	DO_THE_TRICK(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
			,f50,f52,f54,f56,f58,f60,f62,f48,f48,
			,LDBLK(f16),	,,,,,,,STBLK,
			,bcc,pt %icc, vis7)
vis7e3: DO_THE_TRICK(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
			,f50,f52,f54,f56,f58,f60,f62,f48,f32,
			,SYNC,		,,,,,,,STBLK,
			,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e2)
vis7e1: DO_THE_TRICK(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
			,f50,f52,f54,f56,f58,f60,f62,f48,f0,
			,SYNC,		,,,,,,,STBLK,
			,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e3)
vis7e2: DO_THE_TRICK(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
			,f50,f52,f54,f56,f58,f60,f62,f48,f16,
			,SYNC,		,,,,,,,STBLK,
			,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e1)

	/* Loop exits: reduce whichever register group holds the running sums
	 * and copy the last loaded doubleword into %f6 for the tail below.
	 */
e1:	END_THE_TRICK1(	f0,f2,f4,f6,f8,f10,f12,f14,f16,f6)
e2:	END_THE_TRICK1(	f16,f18,f20,f22,f24,f26,f28,f30,f32,f6)
e3:	END_THE_TRICK1(	f32,f34,f36,f38,f40,f42,f44,f46,f0,f6)
	/* ett: switch %asi back (NOTE(review): presumably from the block ASI
	 * to the caller's original ASI - confirm) and read the alignment
	 * offset from the low 3 bits of %gsr into %x3.
	 */
ett:	rd	%asi, %x4		/* LSU Group+4bubbles */
	rd	%gsr, %x3		/* LSU Group+4bubbles */
#ifdef __KERNEL__
	srl	%x4, 3, %x5		/* IEU0 Group */
	xor	%x4, ASI_BLK_XOR1, %x4	/* IEU1 */
	wr	%x4, %x5, %asi		/* LSU Group+4bubbles */
#else
	wr	%x4, ASI_BLK_XOR, %asi	/* LSU Group+4bubbles */
#endif
	andcc	%x3, 7, %x3		/* IEU1 Group */
	add	%dst, 8, %dst		/* IEU0 */
	bne,pn	%icc, 1f		/* CTI */
	 fzero	%f10			/* FPA */
	brz,a,pn	%len, 2f	/* CTI+IEU1 Group */
	 std	%f6, [%dst - 8]		/* Store */
1:	cmp	%len, 8			/* IEU1 */
	blu,pn	%icc, 3f		/* CTI */
	 sub	%src, 64, %src		/* IEU0 Group */
	/* 8 bytes per iteration while %len >= 8; %f10 accumulates the sum. */
1:	ldda	[%src] %asi, %f2	/* Load Group */
	fpadd32	%f10, %f2, %f12		/* FPA Group+load stall*/
	add	%src, 8, %src		/* IEU0 */
	add	%dst, 8, %dst		/* IEU1 */
	faligndata	%f6, %f2, %f14	/* FPA Group */
	fcmpgt32	%f10, %f12, %x5	/* FPM Group */
	std	%f14, [%dst - 16]	/* Store */
	fmovd	%f2, %f6		/* FPA */
	fmovd	%f12, %f10		/* FPA Group */
	sub	%len, 8, %len		/* IEU1 */
	fzero	%f16			/* FPA Group - FPU nop */
	fzero	%f18			/* FPA Group - FPU nop */
	inc	%x5			/* IEU0 */
	srl	%x5, 1, %x5		/* IEU0 Group (regdep) */
	cmp	%len, 8			/* IEU1 */
	bgeu,pt	%icc, 1b		/* CTI */
	 add	%x5, %sum, %sum		/* IEU0 Group */
	/* Flush the pending doubleword: all of %f6 when the source was
	 * 8-byte aligned (%x3 == 0), otherwise only its low word %f7.
	 */
3:	brz,a,pt	%x3, 2f		/* CTI+IEU1 */
	 std	%f6, [%dst - 8]		/* Store Group */
	st	%f7, [%dst - 8]		/* Store Group */
	sub	%dst, 4, %dst		/* IEU0 */
	add	%len, 4, %len		/* IEU1 */
2:
#ifdef __KERNEL__
	sub	%sp, 8, %sp		/* IEU0 Group */
#endif
	END_THE_TRICK2(	f48,f50,f52,f54,f56,f58,f60,f10,f12,f62)
	membar	#Sync			/* LSU Group */
#ifdef __KERNEL__
	VISExit
	add	%sp, 8, %sp		/* IEU0 Group */
#endif
	/* Fold the 64-bit %sum to 32 bits: add its two halves with end-around
	 * carry and return the result in %o0.
	 */
23:	brnz,pn	%len, 26f		/* CTI+IEU1 Group */
24:	 sllx	%sum, 32, %g1		/* IEU0 */
25:	addcc	%sum, %g1, %src		/* IEU1 Group */
	srlx	%src, 32, %src		/* IEU0 Group (regdep) */
	bcs,a,pn	%xcc, 1f	/* CTI */
	 add	%src, 1, %src		/* IEU1 */
#ifndef __KERNEL__
1:	retl				/* CTI Group brk forced*/
	 srl	%src, 0, %src		/* IEU0 */
#else
1:	sethi	%uhi(PAGE_OFFSET), %g4	/* IEU0 Group */
	retl				/* CTI Group brk forced*/
	 sllx	%g4, 32, %g4		/* IEU0 */
#endif
	/* Integer tail for the remaining bytes (%len bits 8, 4, 2, 1), then
	 * back to the fold at 25.
	 */
26:	andcc	%len, 8, %g0		/* IEU1 Group */
	be,pn	%icc, 1f		/* CTI */
	 lduwa	[%src] %asi, %o4	/* Load */
	lduwa	[%src+4] %asi, %g2	/* Load Group */
	add	%src, 8, %src		/* IEU0 */
	add	%dst, 8, %dst		/* IEU1 */
	sllx	%o4, 32, %g5		/* IEU0 Group */
	stw	%o4, [%dst - 8]		/* Store */
	or	%g5, %g2, %g5		/* IEU0 Group */
	stw	%g2, [%dst - 4]		/* Store */
	addcc	%g5, %sum, %sum		/* IEU1 Group */
	bcs,a,pn	%xcc, 1f	/* CTI */
	 add	%sum, 1, %sum		/* IEU0 */
1:	andcc	%len, 4, %g0		/* IEU1 Group */
	be,a,pn	%icc, 1f		/* CTI */
	 clr	%g2			/* IEU0 */
	lduwa	[%src] %asi, %g7	/* Load */
	add	%src, 4, %src		/* IEU0 Group */
	add	%dst, 4, %dst		/* IEU1 */
	sllx	%g7, 32, %g2		/* IEU0 Group */
	stw	%g7, [%dst - 4]		/* Store */
1:	andcc	%len, 2, %g0		/* IEU1 */
	be,a,pn	%icc, 1f		/* CTI */
	 clr	%g3			/* IEU0 Group */
	lduha	[%src] %asi, %g7	/* Load */
	add	%src, 2, %src		/* IEU1 */
	add	%dst, 2, %dst		/* IEU0 Group */
	sll	%g7, 16, %g3		/* IEU0 Group */
	sth	%g7, [%dst - 2]		/* Store */
1:	andcc	%len, 1, %g0		/* IEU1 */
	be,a,pn	%icc, 1f		/* CTI */
	 clr	%o5			/* IEU0 Group */
	lduba	[%src] %asi, %g7	/* Load */
	sll	%g7, 8, %o5		/* IEU0 Group */
	stb	%g7, [%dst]		/* Store */
1:	or	%g2, %g3, %g3		/* IEU1 */
	or	%o5, %g3, %g3		/* IEU0 Group (regdep) */
	addcc	%g3, %sum, %sum		/* IEU1 Group (regdep) */
	bcs,a,pn	%xcc, 1f	/* CTI */
	 add	%sum, 1, %sum		/* IEU0 */
1:	ba,pt	%xcc, 25b		/* CTI Group */
	 sllx	%sum, 32, %g1		/* IEU0 */

#ifdef __KERNEL__
end:

	/* Exception-table entry naming the range csum_partial_copy_vis..end
	 * and the external symbol cpc_handler.  NOTE(review): presumably the
	 * fault fixup for user-space source accesses - confirm against the
	 * handler's definition.
	 */
	.section	__ex_table
	.align	4
	.word	csum_partial_copy_vis, 0, end, cpc_handler
#endif