1dnl SPARC v9 64-bit mpn_addmul_2 -- Multiply an n limb number with 2-limb 2dnl number and add the result to a n limb vector. 3 4dnl Copyright 2002, 2003 Free Software Foundation, Inc. 5 6dnl This file is part of the GNU MP Library. 7dnl 8dnl The GNU MP Library is free software; you can redistribute it and/or modify 9dnl it under the terms of either: 10dnl 11dnl * the GNU Lesser General Public License as published by the Free 12dnl Software Foundation; either version 3 of the License, or (at your 13dnl option) any later version. 14dnl 15dnl or 16dnl 17dnl * the GNU General Public License as published by the Free Software 18dnl Foundation; either version 2 of the License, or (at your option) any 19dnl later version. 20dnl 21dnl or both in parallel, as here. 22dnl 23dnl The GNU MP Library is distributed in the hope that it will be useful, but 24dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 25dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 26dnl for more details. 27dnl 28dnl You should have received copies of the GNU General Public License and the 29dnl GNU Lesser General Public License along with the GNU MP Library. If not, 30dnl see https://www.gnu.org/licenses/. 31 32include(`../config.m4') 33 34C cycles/limb 35C UltraSPARC 1&2: 9 36C UltraSPARC 3: 10 37 38C Algorithm: We use 16 floating-point multiplies per limb product, with the 39C 2-limb v operand split into eight 16-bit pieces, and the n-limb u operand 40C split into 32-bit pieces. We sum four 48-bit partial products using 41C floating-point add, then convert the resulting four 50-bit quantities and 42C transfer them to the integer unit. 43 44C Possible optimizations: 45C 1. Align the stack area where we transfer the four 50-bit product-sums 46C to a 32-byte boundary. That would minimize the cache collision. 47C (UltraSPARC-1/2 use a direct-mapped cache.) (Perhaps even better would 48C be to align the area to map to the area immediately before up?) 
C   2. Perform two of the fp->int conversions with integer instructions.  We
C      can get almost ten free IEU slots, if we clean up bookkeeping and the
C      silly carry-limb code.
C   3. For an mpn_addmul_1 based on this, we need to fix the silly carry-limb
C      code.

C OSP (Overlapping software pipeline) version of mpn_mul_basecase:
C Operand swap will require 8 LDDA and 8 FXTOD, which will mean 8 cycles.
C FI	= 20
C L	=  9 x un * vn
C WDFI = 10 x vn / 2
C WD	= 4

C Instruction classification (as per UltraSPARC functional units).
C Assuming silly carry code is fixed.  Includes bookkeeping.
C
C               mpn_addmul_X     mpn_mul_X
C                1       2       1       2
C               ==========      ==========
C      FM        8      16       8      16
C      FA       10      18      10      18
C     MEM       12      12      10      10
C  ISHIFT        6       6       6       6
C IADDLOG       11      11      10      10
C  BRANCH        1       1       1       1
C
C TOTAL IEU     17      17      16      16
C    TOTAL      48      64      45      61
C
C IEU cycles     8.5     8.5     8       8
C MEM cycles    12      12      10      10
C ISSUE cycles  12      16      11.25   15.25
C FPU cycles    10      18      10      18
C cycles/loop   12      18      12      18
C cycles/limb   12       9      12       9


C INPUT PARAMETERS
C rp[n + 1]	i0
C up[n]		i1
C n		i2
C vp[2]		i3
C
C NOTE(review): the limb count is decremented before it is first tested and
C every test is brnz, so n = 0 would never reach a zero count.  The code
C appears to assume n >= 1 -- confirm against the callers.


ASM_START()
	REGISTER(%g2,#scratch)
	REGISTER(%g3,#scratch)

C Combine registers:
C u00_hi= u32_hi
C u00_lo= u32_lo
C a000  = out000
C a016  = out016
C Free: f52 f54

C Floating-point register map.  The numeric suffix is the bit weight of the
C quantity held, as used by the code below:
C   pNNN   products from fmuld (the a/b variants of p096 and p112 belong to
C          the two halves of the pipeline)
C   vNNN   the eight 16-bit chunks of the v operand, converted to double
C   u00    low 32 bits of the current up limb as a double, u32 the high 32
C   aNNN   running sums produced by faddd
C   outNNN fdtox results that are stored to the stack scratch slots

define(`p000', `%f8') define(`p016',`%f10')
define(`p032',`%f12') define(`p048',`%f14')
define(`p064',`%f16') define(`p080',`%f18')
define(`p096a',`%f20') define(`p112a',`%f22')
define(`p096b',`%f56') define(`p112b',`%f58')

define(`out000',`%f0') define(`out016',`%f6')

define(`v000',`%f24') define(`v016',`%f26')
define(`v032',`%f28') define(`v048',`%f30')
define(`v064',`%f44') define(`v080',`%f46')
define(`v096',`%f48') define(`v112',`%f50')

define(`u00',`%f32') define(`u32', `%f34')

define(`a000',`%f36') define(`a016',`%f38')
define(`a032',`%f40') define(`a048',`%f42')
define(`a064',`%f60') define(`a080',`%f62')

C The _hi/_lo names are the two single-precision halves of the register pairs
C f2:f3 and f4:f5.  The _hi half is zeroed once during initialization and the
C _lo half is reloaded with 32 bits of up[i], so that fxtod on the pair sees
C a non-negative 64-bit integer.

define(`u00_hi',`%f2') define(`u32_hi',`%f4')
define(`u00_lo',`%f3') define(`u32_lo',`%f5')

C Integer register map:
C   cy         carry into the next 32-bit result word
C   rlimb      partial sum for the 32-bit result word being formed
C   i00, i16   product sums read back from the stack scratch slots
C   r00, r32   low and high 32 bits of the rp limb being updated
C   xffffffff  mask of 32 one bits
C   xffff      mask of 16 one bits, used only by the non-VIS initialization

define(`cy',`%g1')
define(`rlimb',`%g3')
define(`i00',`%l0') define(`i16',`%l1')
define(`r00',`%l2') define(`r32',`%l3')
define(`xffffffff',`%l7')
define(`xffff',`%o0')


C mpn_addmul_2 (rp, up, n, vp)
C
C Per the file header: multiply the n-limb number at up by the 2-limb number
C at vp and add the result to the vector at rp.  As coded below, limbs
C rp[0] .. rp[n-1] are read and rewritten, rp[n] is written without being
C read first (wind-down phase 3 has no lduw), and one further limb is
C returned in the out register %o0.
C
C Work registers, beyond the names defined above:
C   %i0  rp minus 8; advanced by 8 just before each pair of 32-bit stores
C   %i1  up; advanced by 8 per limb
C   %i2  number of up limbs still to be loaded
C   %g2  i16 shifted left 16 bits, %g4  i16 shifted right 16 bits
C   %l5  the 32-bit word about to be stored with stw
C
C Stack scratch: fdtox results are moved to the integer unit by std to and
C ldx from 8-byte slots at %sp+2223+0 .. +24.  The non-VIS initialization
C additionally uses slots +32 .. +56.
C NOTE(review): 2223 is presumably the SPARC V9 stack bias 2047 plus a
C 176-byte minimum frame area -- confirm against the ABI.

PROLOGUE(mpn_addmul_2)

C Initialization.  (1) Split v operand into eight 16-bit chunks and store them
C as IEEE double in fp registers.  (2) Clear upper 32 bits of fp register pairs
C f2 and f4.  (3) Store masks in registers aliased to `xffff' and `xffffffff'.
C This code could be better scheduled.

	save	%sp, -256, %sp

ifdef(`HAVE_VIS',
`	mov	-1, %g4
	wr	%g0, 0xD2, %asi		C NOTE(review): presumably the VIS 16-bit fp load ASI - confirm
	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff'
	ldda	[%i3+6] %asi, v000	C 16-bit chunks of vp[0] at byte offsets 6 4 2 0
	ldda	[%i3+4] %asi, v016
	ldda	[%i3+2] %asi, v032
	ldda	[%i3+0] %asi, v048
	fxtod	v000, v000
	ldda	[%i3+14] %asi, v064	C 16-bit chunks of vp[1] at byte offsets 14 12 10 8
	fxtod	v016, v016
	ldda	[%i3+12] %asi, v080
	fxtod	v032, v032
	ldda	[%i3+10] %asi, v096
	fxtod	v048, v048
	ldda	[%i3+8] %asi, v112
	fxtod	v064, v064
	fxtod	v080, v080
	fxtod	v096, v096
	fxtod	v112, v112
	fzero	u00_hi
	fzero	u32_hi
',
`	mov	-1, %g4
	ldx	[%i3+0], %l0		C vp[0]
	srlx	%g4, 48, xffff		C store mask in register `xffff'
	ldx	[%i3+8], %l1		C vp[1]

	and	%l0, xffff, %g2		C bits 0-15 of vp[0]
	stx	%g2, [%sp+2223+0]
	srlx	%l0, 16, %g3
	and	%g3, xffff, %g3		C bits 16-31 of vp[0]
	stx	%g3, [%sp+2223+8]
	srlx	%l0, 32, %g2
	and	%g2, xffff, %g2		C bits 32-47 of vp[0]
	stx	%g2, [%sp+2223+16]
	srlx	%l0, 48, %g3		C bits 48-63 of vp[0]
	stx	%g3, [%sp+2223+24]
	and	%l1, xffff, %g2		C bits 0-15 of vp[1]
	stx	%g2, [%sp+2223+32]
	srlx	%l1, 16, %g3
	and	%g3, xffff, %g3		C bits 16-31 of vp[1]
	stx	%g3, [%sp+2223+40]
	srlx	%l1, 32, %g2
	and	%g2, xffff, %g2		C bits 32-47 of vp[1]
	stx	%g2, [%sp+2223+48]
	srlx	%l1, 48, %g3		C bits 48-63 of vp[1]
	stx	%g3, [%sp+2223+56]

	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff'

	ldd	[%sp+2223+0], v000
	ldd	[%sp+2223+8], v016
	ldd	[%sp+2223+16], v032
	ldd	[%sp+2223+24], v048
	fxtod	v000, v000
	ldd	[%sp+2223+32], v064
	fxtod	v016, v016
	ldd	[%sp+2223+40], v080
	fxtod	v032, v032
	ldd	[%sp+2223+48], v096
	fxtod	v048, v048
	ldd	[%sp+2223+56], v112
	fxtod	v064, v064
	ld	[%sp+2223+0], u00_hi	C zero u00_hi
	fxtod	v080, v080
	ld	[%sp+2223+0], u32_hi	C zero u32_hi
	fxtod	v096, v096
	fxtod	v112, v112
')
C Initialization done.
C Clear the integer carry state so that the first pass through the carry
C code adds nothing.
	mov	0, %g2
	mov	0, rlimb
	mov	0, %g4
	add	%i0, -8, %i0		C BOOKKEEPING

C Start software pipeline.
C First stage: products of the low 32 bits of up[0] with all eight v chunks.
C The four lowest products go straight into a000 .. a048 since there is
C nothing to add to them yet.

	ld	[%i1+4], u00_lo		C read low 32 bits of up[i]
	fxtod	u00_hi, u00
C mid
	ld	[%i1+0], u32_lo		C read high 32 bits of up[i]
	fmuld	u00, v000, a000
	fmuld	u00, v016, a016
	fmuld	u00, v032, a032
	fmuld	u00, v048, a048
	add	%i2, -1, %i2		C BOOKKEEPING
	fmuld	u00, v064, p064
	add	%i1, 8, %i1		C BOOKKEEPING
	fxtod	u32_hi, u32
	fmuld	u00, v080, p080
	fmuld	u00, v096, p096a
	brnz,pt	%i2, .L_2_or_more
	fmuld	u00, v112, p112a	C delay slot

C n = 1: there is no further up limb to load.  Finish the products of the
C high half of up[0], park the first four fdtox results in the scratch slots
C and join the wind-down at phase 2.
.L1:	fdtox	a000, out000
	fmuld	u32, v000, p000
	fdtox	a016, out016
	fmuld	u32, v016, p016
	fmovd	p064, a064
	fmuld	u32, v032, p032
	fmovd	p080, a080
	fmuld	u32, v048, p048
	std	out000, [%sp+2223+16]
	faddd	p000, a032, a000
	fmuld	u32, v064, p064
	std	out016, [%sp+2223+24]
	fxtod	u00_hi, u00
	faddd	p016, a048, a016
	fmuld	u32, v080, p080
	faddd	p032, a064, a032
	fmuld	u32, v096, p096b
	faddd	p048, a080, a048
	fmuld	u32, v112, p112b
C mid
	fdtox	a000, out000
	fdtox	a016, out016
	faddd	p064, p096a, a064
	faddd	p080, p112a, a080
	std	out000, [%sp+2223+0]
	b	.L_wd2
	std	out016, [%sp+2223+8]	C delay slot

C n >= 2: second start-up stage.  Same floating-point work as a loop pass,
C but with fmovd in place of the p096b/p112b additions (no earlier limb has
C produced them yet) and without any integer carry code, since no product
C sum has been stored before this point.
.L_2_or_more:
	ld	[%i1+4], u00_lo		C read low 32 bits of up[i]
	fdtox	a000, out000
	fmuld	u32, v000, p000
	fdtox	a016, out016
	fmuld	u32, v016, p016
	fmovd	p064, a064
	fmuld	u32, v032, p032
	fmovd	p080, a080
	fmuld	u32, v048, p048
	std	out000, [%sp+2223+16]
	faddd	p000, a032, a000
	fmuld	u32, v064, p064
	std	out016, [%sp+2223+24]
	fxtod	u00_hi, u00
	faddd	p016, a048, a016
	fmuld	u32, v080, p080
	faddd	p032, a064, a032
	fmuld	u32, v096, p096b
	faddd	p048, a080, a048
	fmuld	u32, v112, p112b
C mid
	ld	[%i1+0], u32_lo		C read high 32 bits of up[i]
	fdtox	a000, out000
	fmuld	u00, v000, p000
	fdtox	a016, out016
	fmuld	u00, v016, p016
	faddd	p064, p096a, a064
	fmuld	u00, v032, p032
	faddd	p080, p112a, a080
	fmuld	u00, v048, p048
	add	%i2, -1, %i2		C BOOKKEEPING
	std	out000, [%sp+2223+0]
	faddd	p000, a032, a000
	fmuld	u00, v064, p064
	add	%i1, 8, %i1		C BOOKKEEPING
	std	out016, [%sp+2223+8]
	fxtod	u32_hi, u32
	faddd	p016, a048, a016
	fmuld	u00, v080, p080
	faddd	p032, a064, a032
	fmuld	u00, v096, p096a
	faddd	p048, a080, a048
	brnz,pt	%i2, .L_3_or_more
	fmuld	u00, v112, p112a	C delay slot

C n = 2: skip the main loop.
	b	.Lend
	nop

C  64      32       0
C   .       .       .
C   .       |__rXXX_|	32
C   .      |___cy___|	34
C   .  |_______i00__|	50
C |_______i16__|    .	50

C Integer carry scheme used in the loop and in every wind-down phase, per
C 32-bit result word:
C   %l5   = (%g2 masked to 32 bits) + rlimb	from the previous word
C   cy    = (%l5 >> 32) + %g4			carry out of the previous word
C   rlimb = i00 + rp word + cy
C   %g2   = i16 << 16,  %g4 = i16 >> 16
C   %l5   = %g2 + rlimb, and stw stores its low 32 bits


C BEGIN MAIN LOOP
C Each pass loads one up limb and stores one rp limb as two 32-bit words:
C the low word at offset 4 before midloop, the high word at offset 0 after.
C NOTE(review): the bare C lines appear to separate groups of up to four
C instructions, presumably the intended issue groups -- confirm.
	.align	16
.L_3_or_more:
.Loop:	ld	[%i1+4], u00_lo		C read low 32 bits of up[i]
	and	%g2, xffffffff, %g2
	fdtox	a000, out000
	fmuld	u32, v000, p000
C
	lduw	[%i0+4+8], r00		C read low 32 bits of rp[i]
	add	%g2, rlimb, %l5
	fdtox	a016, out016
	fmuld	u32, v016, p016
C
	srlx	%l5, 32, cy
	ldx	[%sp+2223+16], i00
	faddd	p064, p096b, a064
	fmuld	u32, v032, p032
C
	add	%g4, cy, cy		C new cy
	ldx	[%sp+2223+24], i16
	faddd	p080, p112b, a080
	fmuld	u32, v048, p048
C
	nop
	std	out000, [%sp+2223+16]
	faddd	p000, a032, a000
	fmuld	u32, v064, p064
C
	add	i00, r00, rlimb
	add	%i0, 8, %i0		C BOOKKEEPING
	std	out016, [%sp+2223+24]
	fxtod	u00_hi, u00
C
	sllx	i16, 16, %g2
	add	cy, rlimb, rlimb
	faddd	p016, a048, a016
	fmuld	u32, v080, p080
C
	srlx	i16, 16, %g4
	add	%g2, rlimb, %l5
	faddd	p032, a064, a032
	fmuld	u32, v096, p096b
C
	stw	%l5, [%i0+4]
	nop
	faddd	p048, a080, a048
	fmuld	u32, v112, p112b
C midloop
	ld	[%i1+0], u32_lo		C read high 32 bits of up[i]
	and	%g2, xffffffff, %g2
	fdtox	a000, out000
	fmuld	u00, v000, p000
C
	lduw	[%i0+0], r32		C read high 32 bits of rp[i]
	add	%g2, rlimb, %l5
	fdtox	a016, out016
	fmuld	u00, v016, p016
C
	srlx	%l5, 32, cy
	ldx	[%sp+2223+0], i00
	faddd	p064, p096a, a064
	fmuld	u00, v032, p032
C
	add	%g4, cy, cy		C new cy
	ldx	[%sp+2223+8], i16
	faddd	p080, p112a, a080
	fmuld	u00, v048, p048
C
	add	%i2, -1, %i2		C BOOKKEEPING
	std	out000, [%sp+2223+0]
	faddd	p000, a032, a000
	fmuld	u00, v064, p064
C
	add	i00, r32, rlimb
	add	%i1, 8, %i1		C BOOKKEEPING
	std	out016, [%sp+2223+8]
	fxtod	u32_hi, u32
C
	sllx	i16, 16, %g2
	add	cy, rlimb, rlimb
	faddd	p016, a048, a016
	fmuld	u00, v080, p080
C
	srlx	i16, 16, %g4
	add	%g2, rlimb, %l5
	faddd	p032, a064, a032
	fmuld	u00, v096, p096a
C
	stw	%l5, [%i0+0]
	faddd	p048, a080, a048
	brnz,pt	%i2, .Loop
	fmuld	u00, v112, p112a	C delay slot
C END MAIN LOOP

C WIND-DOWN PHASE 1
C Same integer work as a loop pass, but no further up limb is loaded.  The
C first half still multiplies by u32, the high half of the last up limb; the
C part after mid issues no more fmuld.
.Lend:	and	%g2, xffffffff, %g2
	fdtox	a000, out000
	fmuld	u32, v000, p000
	lduw	[%i0+4+8], r00		C read low 32 bits of rp[i]
	add	%g2, rlimb, %l5
	fdtox	a016, out016
	fmuld	u32, v016, p016
	srlx	%l5, 32, cy
	ldx	[%sp+2223+16], i00
	faddd	p064, p096b, a064
	fmuld	u32, v032, p032
	add	%g4, cy, cy		C new cy
	ldx	[%sp+2223+24], i16
	faddd	p080, p112b, a080
	fmuld	u32, v048, p048
	std	out000, [%sp+2223+16]
	faddd	p000, a032, a000
	fmuld	u32, v064, p064
	add	i00, r00, rlimb
	add	%i0, 8, %i0		C BOOKKEEPING
	std	out016, [%sp+2223+24]
	sllx	i16, 16, %g2
	add	cy, rlimb, rlimb
	faddd	p016, a048, a016
	fmuld	u32, v080, p080
	srlx	i16, 16, %g4
	add	%g2, rlimb, %l5
	faddd	p032, a064, a032
	fmuld	u32, v096, p096b
	stw	%l5, [%i0+4]
	faddd	p048, a080, a048
	fmuld	u32, v112, p112b
C mid
	and	%g2, xffffffff, %g2
	fdtox	a000, out000
	lduw	[%i0+0], r32		C read high 32 bits of rp[i]
	add	%g2, rlimb, %l5
	fdtox	a016, out016
	srlx	%l5, 32, cy
	ldx	[%sp+2223+0], i00
	faddd	p064, p096a, a064
	add	%g4, cy, cy		C new cy
	ldx	[%sp+2223+8], i16
	faddd	p080, p112a, a080
	std	out000, [%sp+2223+0]
	add	i00, r32, rlimb
	std	out016, [%sp+2223+8]
	sllx	i16, 16, %g2
	add	cy, rlimb, rlimb
	srlx	i16, 16, %g4
	add	%g2, rlimb, %l5
	stw	%l5, [%i0+0]

C WIND-DOWN PHASE 2
C No floating-point multiplies or adds remain.  Convert the sums a032/a048
C and then a064/a080 while finishing the last rp limb that is both read and
C written.  The n = 1 path enters here directly from .L1.
.L_wd2:	and	%g2, xffffffff, %g2
	fdtox	a032, out000
	lduw	[%i0+4+8], r00		C read low 32 bits of rp[i]
	add	%g2, rlimb, %l5
	fdtox	a048, out016
	srlx	%l5, 32, cy
	ldx	[%sp+2223+16], i00
	add	%g4, cy, cy		C new cy
	ldx	[%sp+2223+24], i16
	std	out000, [%sp+2223+16]
	add	i00, r00, rlimb
	add	%i0, 8, %i0		C BOOKKEEPING
	std	out016, [%sp+2223+24]
	sllx	i16, 16, %g2
	add	cy, rlimb, rlimb
	srlx	i16, 16, %g4
	add	%g2, rlimb, %l5
	stw	%l5, [%i0+4]
C mid
	and	%g2, xffffffff, %g2
	fdtox	a064, out000
	lduw	[%i0+0], r32		C read high 32 bits of rp[i]
	add	%g2, rlimb, %l5
	fdtox	a080, out016
	srlx	%l5, 32, cy
	ldx	[%sp+2223+0], i00
	add	%g4, cy, cy		C new cy
	ldx	[%sp+2223+8], i16
	std	out000, [%sp+2223+0]
	add	i00, r32, rlimb
	std	out016, [%sp+2223+8]
	sllx	i16, 16, %g2
	add	cy, rlimb, rlimb
	srlx	i16, 16, %g4
	add	%g2, rlimb, %l5
	stw	%l5, [%i0+0]

C WIND-DOWN PHASE 3
C Convert the two remaining products p096b/p112b.  The limb stored here is
C formed from product sums and carry only: rp is not read, and the stack
C values are loaded straight into rlimb.
.L_wd3:	and	%g2, xffffffff, %g2
	fdtox	p096b, out000
	add	%g2, rlimb, %l5
	fdtox	p112b, out016
	srlx	%l5, 32, cy
	ldx	[%sp+2223+16], rlimb
	add	%g4, cy, cy		C new cy
	ldx	[%sp+2223+24], i16
	std	out000, [%sp+2223+16]
	add	%i0, 8, %i0		C BOOKKEEPING
	std	out016, [%sp+2223+24]
	sllx	i16, 16, %g2
	add	cy, rlimb, rlimb
	srlx	i16, 16, %g4
	add	%g2, rlimb, %l5
	stw	%l5, [%i0+4]
C mid
	and	%g2, xffffffff, %g2
	add	%g2, rlimb, %l5
	srlx	%l5, 32, cy
	ldx	[%sp+2223+0], rlimb
	add	%g4, cy, cy		C new cy
	ldx	[%sp+2223+8], i16
	sllx	i16, 16, %g2
	add	cy, rlimb, rlimb
	srlx	i16, 16, %g4
	add	%g2, rlimb, %l5
	stw	%l5, [%i0+0]

C Final step: fold the last carry with the final pair of product sums to
C form the return value.  Nothing is stored to rp here.
	and	%g2, xffffffff, %g2
	add	%g2, rlimb, %l5
	srlx	%l5, 32, cy
	ldx	[%sp+2223+16], i00
	add	%g4, cy, cy		C new cy
	ldx	[%sp+2223+24], i16

	sllx	i16, 16, %g2
	add	i00, cy, cy
C NOTE(review): the add below sits in the delay slot of return and reads
C only global registers (%g2 and cy); presumably it executes after the
C register window has been restored, so %o0 is the out register seen by the
C caller -- confirm against the SPARC V9 manual.
	return	%i7+8
	add	%g2, cy, %o0
EPILOGUE(mpn_addmul_2)