dnl  SPARC v9 64-bit mpn_addmul_2 -- Multiply an n-limb number by a 2-limb
dnl  number and add the result to an n-limb vector.

dnl  Copyright 2002, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C                  cycles/limb
C UltraSPARC 1&2:      9
C UltraSPARC 3:       10

C Algorithm: We use 16 floating-point multiplies per limb product, with the
C 2-limb v operand split into eight 16-bit pieces, and the n-limb u operand
C split into 32-bit pieces.  We sum four 48-bit partial products using
C floating-point add, then convert the resulting four 50-bit quantities and
C transfer them to the integer unit.

C Possible optimizations:
C   1. Align the stack area where we transfer the four 50-bit product-sums
C      to a 32-byte boundary.  That would minimize cache collisions.
C      (UltraSPARC-1/2 use a direct-mapped cache.)  (Perhaps even better would
C      be to align the area to map to the area immediately before up?)
C   2. Perform two of the fp->int conversions with integer instructions.  We
C      can get almost ten free IEU slots, if we clean up bookkeeping and the
C      silly carry-limb code.
C   3. For an mpn_addmul_1 based on this, we need to fix the silly carry-limb
C      code.
C OSP (Overlapping software pipeline) version of mpn_mul_basecase:
C Operand swap will require 8 LDDA and 8 FXTOD, which will mean 8 cycles.
C FI    = 20
C L     =  9 x un * vn
C WDFI  = 10 x vn / 2
C WD    = 4

C Instruction classification (as per UltraSPARC functional units).
C Assuming silly carry code is fixed.  Includes bookkeeping.
C
C               mpn_addmul_X     mpn_mul_X
C                1       2       1       2
C               ==========      ==========
C      FM        8      16       8      16
C      FA       10      18      10      18
C     MEM       12      12      10      10
C  ISHIFT        6       6       6       6
C IADDLOG       11      11      10      10
C  BRANCH        1       1       1       1
C
C TOTAL IEU     17      17      16      16
C TOTAL         48      64      45      61
C
C IEU cycles     8.5     8.5     8       8
C MEM cycles    12      12      10      10
C ISSUE cycles  12      16      11.25   15.25
C FPU cycles    10      18      10      18
C cycles/loop   12      18      12      18
C cycles/limb   12       9      12       9


C INPUT PARAMETERS
C rp[n + 1]     i0
C up[n]         i1
C n             i2
C vp[2]         i3


ASM_START()
C Declare %g2 and %g3 as scratch registers.
C NOTE(review): REGISTER is supplied by the m4 support files and presumably
C emits an assembler .register directive -- confirm.
        REGISTER(%g2,#scratch)
        REGISTER(%g3,#scratch)

C Combine registers:
C u00_hi= u32_hi
C u00_lo= u32_lo
C a000  = out000
C a016  = out016
C Free: f52 f54


C Register aliases.  The numeric suffix of each fp alias is the bit position
C of the corresponding 16-bit piece of v (000 = least significant piece).

C Partial products written by fmuld.  The a/b variants of p096 and p112 are
C written by the alternating half-iterations of the pipeline.
define(`p000', `%f8')  define(`p016',`%f10')
define(`p032',`%f12')  define(`p048',`%f14')
define(`p064',`%f16')  define(`p080',`%f18')
define(`p096a',`%f20') define(`p112a',`%f22')
define(`p096b',`%f56') define(`p112b',`%f58')

C Destinations of fdtox; these are stored with std to the stack scratch
C area so that the integer unit can pick them up with ldx.
define(`out000',`%f0') define(`out016',`%f6')

C The eight 16-bit pieces of the 2-limb v operand, held as doubles.
define(`v000',`%f24')  define(`v016',`%f26')
define(`v032',`%f28')  define(`v048',`%f30')
define(`v064',`%f44')  define(`v080',`%f46')
define(`v096',`%f48')  define(`v112',`%f50')

C Low (u00) and high (u32) 32-bit halves of the current up[i], as doubles.
define(`u00',`%f32')   define(`u32', `%f34')

C Product sums written by fmuld/faddd/fmovd and consumed by fdtox.
define(`a000',`%f36')  define(`a016',`%f38')
define(`a032',`%f40')  define(`a048',`%f42')
define(`a064',`%f60')  define(`a080',`%f62')

C Single-precision halves of the double registers %f2 and %f4.  The _hi
C half is zeroed once during initialization and the _lo half receives 32
C bits of up[i], so that fxtod on the pair sees a zero-extended integer.
define(`u00_hi',`%f2') define(`u32_hi',`%f4')
define(`u00_lo',`%f3') define(`u32_lo',`%f5')

C Integer side: cy is the running carry, rlimb the running sum, i00/i16 the
C product sums reloaded from the stack, r00/r32 the low/high 32 bits of
C rp[i], and xffffffff a 32-bit mask.
define(`cy',`%g1')
define(`rlimb',`%g3')
define(`i00',`%l0')    define(`i16',`%l1')
define(`r00',`%l2')    define(`r32',`%l3')
define(`xffffffff',`%l7')
C Mask 0xffff.  It is only set and only used by the non-VIS initialization
C path below.
define(`xffff',`%o0')


C mpn_addmul_2
C
C Inputs (as seen after the save below):
C   %i0 = rp   result vector, read and updated in 32-bit words
C   %i1 = up   n-limb multiplicand
C   %i2 = n    limb count
C   %i3 = vp   2-limb multiplier
C Result: the value placed in %o0 by the delay slot of the final return.
C
C n is decremented before it is first tested, so n = 0 would be handled
C as a large count.  Callers presumably guarantee n >= 1 -- confirm.
C
C Each SPARC branch, and the final return, has a delay slot: the instruction
C written after it executes before control transfers.
C
C Scratch memory: [%sp+2223+0 .. +31] is used throughout to pass fdtox
C results from the fp registers to the integer registers (std then ldx).
C The non-VIS initialization also uses [%sp+2223+32 .. +63].
C NOTE(review): 2223 is presumably the V9 stack bias 2047 plus 176 --
C confirm against the ABI.
PROLOGUE(mpn_addmul_2)

C Initialization.  (1) Split v operand into eight 16-bit chunks and store them
C as IEEE double in fp registers.  (2) Clear upper 32 bits of fp register pairs
C f2 and f4.  (3) Store masks in registers aliased to `xffff' and `xffffffff'.
C This code could be better scheduled.

        save    %sp, -256, %sp

C With HAVE_VIS defined, the 16-bit pieces are loaded straight into fp
C registers with ldda through %asi = 0xD2.
C NOTE(review): 0xD2 is presumably the VIS 16-bit short floating-point load
C ASI -- confirm against the UltraSPARC manual.
C Otherwise the pieces are extracted with shifts and masks in integer
C registers, spilled to the scratch area and reloaded with ldd.
ifdef(`HAVE_VIS',
`       mov     -1, %g4
        wr      %g0, 0xD2, %asi
        srlx    %g4, 32, xffffffff      C store mask in register `xffffffff'
        ldda    [%i3+6] %asi, v000
        ldda    [%i3+4] %asi, v016
        ldda    [%i3+2] %asi, v032
        ldda    [%i3+0] %asi, v048
        fxtod   v000, v000
        ldda    [%i3+14] %asi, v064
        fxtod   v016, v016
        ldda    [%i3+12] %asi, v080
        fxtod   v032, v032
        ldda    [%i3+10] %asi, v096
        fxtod   v048, v048
        ldda    [%i3+8] %asi, v112
        fxtod   v064, v064
        fxtod   v080, v080
        fxtod   v096, v096
        fxtod   v112, v112
        fzero   u00_hi
        fzero   u32_hi
',
`       mov     -1, %g4
        ldx     [%i3+0], %l0            C vp[0]
        srlx    %g4, 48, xffff          C store mask in register `xffff'
        ldx     [%i3+8], %l1            C vp[1]

        and     %l0, xffff, %g2
        stx     %g2, [%sp+2223+0]
        srlx    %l0, 16, %g3
        and     %g3, xffff, %g3
        stx     %g3, [%sp+2223+8]
        srlx    %l0, 32, %g2
        and     %g2, xffff, %g2
        stx     %g2, [%sp+2223+16]
        srlx    %l0, 48, %g3
        stx     %g3, [%sp+2223+24]
        and     %l1, xffff, %g2
        stx     %g2, [%sp+2223+32]
        srlx    %l1, 16, %g3
        and     %g3, xffff, %g3
        stx     %g3, [%sp+2223+40]
        srlx    %l1, 32, %g2
        and     %g2, xffff, %g2
        stx     %g2, [%sp+2223+48]
        srlx    %l1, 48, %g3
        stx     %g3, [%sp+2223+56]

        srlx    %g4, 32, xffffffff      C store mask in register `xffffffff'

        ldd     [%sp+2223+0], v000
        ldd     [%sp+2223+8], v016
        ldd     [%sp+2223+16], v032
        ldd     [%sp+2223+24], v048
        fxtod   v000, v000
        ldd     [%sp+2223+32], v064
        fxtod   v016, v016
        ldd     [%sp+2223+40], v080
        fxtod   v032, v032
        ldd     [%sp+2223+48], v096
        fxtod   v048, v048
        ldd     [%sp+2223+56], v112
        fxtod   v064, v064
        ld      [%sp+2223+0], u00_hi    C zero u00_hi
        fxtod   v080, v080
        ld      [%sp+2223+0], u32_hi    C zero u32_hi
        fxtod   v096, v096
        fxtod   v112, v112
')
C Initialization done.
C Clear the integer carry state (%g2, rlimb, %g4) and bias rp by one limb;
C the pipeline adds 8 to %i0 before each pair of word stores.
        mov     0, %g2
        mov     0, rlimb
        mov     0, %g4
        add     %i0, -8, %i0            C BOOKKEEPING

C Start software pipeline.

        ld      [%i1+4], u00_lo         C read low 32 bits of up[i]
        fxtod   u00_hi, u00
C mid
        ld      [%i1+0], u32_lo         C read high 32 bits of up[i]
        fmuld   u00, v000, a000
        fmuld   u00, v016, a016
        fmuld   u00, v032, a032
        fmuld   u00, v048, a048
        add     %i2, -1, %i2            C BOOKKEEPING
        fmuld   u00, v064, p064
        add     %i1, 8, %i1             C BOOKKEEPING
        fxtod   u32_hi, u32
        fmuld   u00, v080, p080
        fmuld   u00, v096, p096a
        brnz,pt %i2, .L_2_or_more
        fmuld   u00, v112, p112a

C Fall-through: n was 1.  Finish the only limb and join wind-down phase 2.
.L1:    fdtox   a000, out000
        fmuld   u32, v000, p000
        fdtox   a016, out016
        fmuld   u32, v016, p016
        fmovd   p064, a064
        fmuld   u32, v032, p032
        fmovd   p080, a080
        fmuld   u32, v048, p048
        std     out000, [%sp+2223+16]
        faddd   p000, a032, a000
        fmuld   u32, v064, p064
        std     out016, [%sp+2223+24]
        fxtod   u00_hi, u00
        faddd   p016, a048, a016
        fmuld   u32, v080, p080
        faddd   p032, a064, a032
        fmuld   u32, v096, p096b
        faddd   p048, a080, a048
        fmuld   u32, v112, p112b
C mid
        fdtox   a000, out000
        fdtox   a016, out016
        faddd   p064, p096a, a064
        faddd   p080, p112a, a080
        std     out000, [%sp+2223+0]
        b       .L_wd2
        std     out016, [%sp+2223+8]

C n >= 2: same first step as above, overlapped with the loads and first
C multiplies for the next limb.
.L_2_or_more:
        ld      [%i1+4], u00_lo         C read low 32 bits of up[i]
        fdtox   a000, out000
        fmuld   u32, v000, p000
        fdtox   a016, out016
        fmuld   u32, v016, p016
        fmovd   p064, a064
        fmuld   u32, v032, p032
        fmovd   p080, a080
        fmuld   u32, v048, p048
        std     out000, [%sp+2223+16]
        faddd   p000, a032, a000
        fmuld   u32, v064, p064
        std     out016, [%sp+2223+24]
        fxtod   u00_hi, u00
        faddd   p016, a048, a016
        fmuld   u32, v080, p080
        faddd   p032, a064, a032
        fmuld   u32, v096, p096b
        faddd   p048, a080, a048
        fmuld   u32, v112, p112b
C mid
        ld      [%i1+0], u32_lo         C read high 32 bits of up[i]
        fdtox   a000, out000
        fmuld   u00, v000, p000
        fdtox   a016, out016
        fmuld   u00, v016, p016
        faddd   p064, p096a, a064
        fmuld   u00, v032, p032
        faddd   p080, p112a, a080
        fmuld   u00, v048, p048
        add     %i2, -1, %i2            C BOOKKEEPING
        std     out000, [%sp+2223+0]
        faddd   p000, a032, a000
        fmuld   u00, v064, p064
        add     %i1, 8, %i1             C BOOKKEEPING
        std     out016, [%sp+2223+8]
        fxtod   u32_hi, u32
        faddd   p016, a048, a016
        fmuld   u00, v080, p080
        faddd   p032, a064, a032
        fmuld   u00, v096, p096a
        faddd   p048, a080, a048
        brnz,pt %i2, .L_3_or_more
        fmuld   u00, v112, p112a

C n was 2: skip the main loop.
        b       .Lend
        nop

C NOTE(review): the diagram below presumably shows the bit positions and
C widths (right column) of the quantities summed into each 32-bit result
C word -- confirm.
C
C  64      32       0
C   .       .       .
C   .       |__rXXX_|   32
C   .      |___cy___|   34
C   .  |_______i00__|   50
C  |_______i16__|   .   50


C Integer carry handling used in the loop and in every wind-down step:
C   rlimb = i00 + (32-bit word of rp) + cy
C   %l5   = (i16 << 16) + rlimb       low 32 bits are stored with stw
C   %g4   = i16 >> 16
C and at the start of the next step
C   cy    = (((i16 << 16) & xffffffff) + rlimb) >> 32, plus %g4.
C
C Each iteration handles one limb of up in two halves; the first half
C stores to [%i0+4] and the second to [%i0+0].  The bare C lines separate
C groups of up to four instructions.
C NOTE(review): these are presumably the intended issue groups -- confirm.

C BEGIN MAIN LOOP
        .align  16
.L_3_or_more:
.Loop:  ld      [%i1+4], u00_lo         C read low 32 bits of up[i]
        and     %g2, xffffffff, %g2
        fdtox   a000, out000
        fmuld   u32, v000, p000
C
        lduw    [%i0+4+8], r00          C read low 32 bits of rp[i]
        add     %g2, rlimb, %l5
        fdtox   a016, out016
        fmuld   u32, v016, p016
C
        srlx    %l5, 32, cy
        ldx     [%sp+2223+16], i00
        faddd   p064, p096b, a064
        fmuld   u32, v032, p032
C
        add     %g4, cy, cy             C new cy
        ldx     [%sp+2223+24], i16
        faddd   p080, p112b, a080
        fmuld   u32, v048, p048
C
        nop
        std     out000, [%sp+2223+16]
        faddd   p000, a032, a000
        fmuld   u32, v064, p064
C
        add     i00, r00, rlimb
        add     %i0, 8, %i0             C BOOKKEEPING
        std     out016, [%sp+2223+24]
        fxtod   u00_hi, u00
C
        sllx    i16, 16, %g2
        add     cy, rlimb, rlimb
        faddd   p016, a048, a016
        fmuld   u32, v080, p080
C
        srlx    i16, 16, %g4
        add     %g2, rlimb, %l5
        faddd   p032, a064, a032
        fmuld   u32, v096, p096b
C
        stw     %l5, [%i0+4]
        nop
        faddd   p048, a080, a048
        fmuld   u32, v112, p112b
C midloop
        ld      [%i1+0], u32_lo         C read high 32 bits of up[i]
        and     %g2, xffffffff, %g2
        fdtox   a000, out000
        fmuld   u00, v000, p000
C
        lduw    [%i0+0], r32            C read high 32 bits of rp[i]
        add     %g2, rlimb, %l5
        fdtox   a016, out016
        fmuld   u00, v016, p016
C
        srlx    %l5, 32, cy
        ldx     [%sp+2223+0], i00
        faddd   p064, p096a, a064
        fmuld   u00, v032, p032
C
        add     %g4, cy, cy             C new cy
        ldx     [%sp+2223+8], i16
        faddd   p080, p112a, a080
        fmuld   u00, v048, p048
C
        add     %i2, -1, %i2            C BOOKKEEPING
        std     out000, [%sp+2223+0]
        faddd   p000, a032, a000
        fmuld   u00, v064, p064
C
        add     i00, r32, rlimb
        add     %i1, 8, %i1             C BOOKKEEPING
        std     out016, [%sp+2223+8]
        fxtod   u32_hi, u32
C
        sllx    i16, 16, %g2
        add     cy, rlimb, rlimb
        faddd   p016, a048, a016
        fmuld   u00, v080, p080
C
        srlx    i16, 16, %g4
        add     %g2, rlimb, %l5
        faddd   p032, a064, a032
        fmuld   u00, v096, p096a
C
        stw     %l5, [%i0+0]
        faddd   p048, a080, a048
        brnz,pt %i2, .Loop
        fmuld   u00, v112, p112a
C END MAIN LOOP

C WIND-DOWN PHASE 1
C No further limbs of up are loaded.  The multiplies by u32 for the last
C limb are still issued here, alongside the integer work for earlier sums.
.Lend:  and     %g2, xffffffff, %g2
        fdtox   a000, out000
        fmuld   u32, v000, p000
        lduw    [%i0+4+8], r00          C read low 32 bits of rp[i]
        add     %g2, rlimb, %l5
        fdtox   a016, out016
        fmuld   u32, v016, p016
        srlx    %l5, 32, cy
        ldx     [%sp+2223+16], i00
        faddd   p064, p096b, a064
        fmuld   u32, v032, p032
        add     %g4, cy, cy             C new cy
        ldx     [%sp+2223+24], i16
        faddd   p080, p112b, a080
        fmuld   u32, v048, p048
        std     out000, [%sp+2223+16]
        faddd   p000, a032, a000
        fmuld   u32, v064, p064
        add     i00, r00, rlimb
        add     %i0, 8, %i0             C BOOKKEEPING
        std     out016, [%sp+2223+24]
        sllx    i16, 16, %g2
        add     cy, rlimb, rlimb
        faddd   p016, a048, a016
        fmuld   u32, v080, p080
        srlx    i16, 16, %g4
        add     %g2, rlimb, %l5
        faddd   p032, a064, a032
        fmuld   u32, v096, p096b
        stw     %l5, [%i0+4]
        faddd   p048, a080, a048
        fmuld   u32, v112, p112b
C mid
        and     %g2, xffffffff, %g2
        fdtox   a000, out000
        lduw    [%i0+0], r32            C read high 32 bits of rp[i]
        add     %g2, rlimb, %l5
        fdtox   a016, out016
        srlx    %l5, 32, cy
        ldx     [%sp+2223+0], i00
        faddd   p064, p096a, a064
        add     %g4, cy, cy             C new cy
        ldx     [%sp+2223+8], i16
        faddd   p080, p112a, a080
        std     out000, [%sp+2223+0]
        add     i00, r32, rlimb
        std     out016, [%sp+2223+8]
        sllx    i16, 16, %g2
        add     cy, rlimb, rlimb
        srlx    i16, 16, %g4
        add     %g2, rlimb, %l5
        stw     %l5, [%i0+0]

C WIND-DOWN PHASE 2
C No multiplies remain.  Convert the pending sums a032..a080 while the
C integer side finishes the last rp limb that has an addend.  The n = 1
C path enters here.
.L_wd2: and     %g2, xffffffff, %g2
        fdtox   a032, out000
        lduw    [%i0+4+8], r00          C read low 32 bits of rp[i]
        add     %g2, rlimb, %l5
        fdtox   a048, out016
        srlx    %l5, 32, cy
        ldx     [%sp+2223+16], i00
        add     %g4, cy, cy             C new cy
        ldx     [%sp+2223+24], i16
        std     out000, [%sp+2223+16]
        add     i00, r00, rlimb
        add     %i0, 8, %i0             C BOOKKEEPING
        std     out016, [%sp+2223+24]
        sllx    i16, 16, %g2
        add     cy, rlimb, rlimb
        srlx    i16, 16, %g4
        add     %g2, rlimb, %l5
        stw     %l5, [%i0+4]
C mid
        and     %g2, xffffffff, %g2
        fdtox   a064, out000
        lduw    [%i0+0], r32            C read high 32 bits of rp[i]
        add     %g2, rlimb, %l5
        fdtox   a080, out016
        srlx    %l5, 32, cy
        ldx     [%sp+2223+0], i00
        add     %g4, cy, cy             C new cy
        ldx     [%sp+2223+8], i16
        std     out000, [%sp+2223+0]
        add     i00, r32, rlimb
        std     out016, [%sp+2223+8]
        sllx    i16, 16, %g2
        add     cy, rlimb, rlimb
        srlx    i16, 16, %g4
        add     %g2, rlimb, %l5
        stw     %l5, [%i0+0]

C WIND-DOWN PHASE 3
C The two words stored here are written without reading rp first: the sum
C loaded from the scratch area goes straight into rlimb.  Nothing in this
C file branches to .L_wd3.
.L_wd3: and     %g2, xffffffff, %g2
        fdtox   p096b, out000
        add     %g2, rlimb, %l5
        fdtox   p112b, out016
        srlx    %l5, 32, cy
        ldx     [%sp+2223+16], rlimb
        add     %g4, cy, cy             C new cy
        ldx     [%sp+2223+24], i16
        std     out000, [%sp+2223+16]
        add     %i0, 8, %i0             C BOOKKEEPING
        std     out016, [%sp+2223+24]
        sllx    i16, 16, %g2
        add     cy, rlimb, rlimb
        srlx    i16, 16, %g4
        add     %g2, rlimb, %l5
        stw     %l5, [%i0+4]
C mid
        and     %g2, xffffffff, %g2
        add     %g2, rlimb, %l5
        srlx    %l5, 32, cy
        ldx     [%sp+2223+0], rlimb
        add     %g4, cy, cy             C new cy
        ldx     [%sp+2223+8], i16
        sllx    i16, 16, %g2
        add     cy, rlimb, rlimb
        srlx    i16, 16, %g4
        add     %g2, rlimb, %l5
        stw     %l5, [%i0+0]

C Combine the final carry with the last two converted sums to form the
C return value.
        and     %g2, xffffffff, %g2
        add     %g2, rlimb, %l5
        srlx    %l5, 32, cy
        ldx     [%sp+2223+16], i00
        add     %g4, cy, cy             C new cy
        ldx     [%sp+2223+24], i16

        sllx    i16, 16, %g2
        add     i00, cy, cy
C The delay slot runs after return has restored the register window, so
C its sources are the global registers %g2 and cy (%g1) and %o0 is the
C result register seen by the caller.
        return  %i7+8
        add     %g2, cy, %o0
EPILOGUE(mpn_addmul_2)