dnl  SPARC v9 32-bit mpn_sqr_diagonal.

dnl  Copyright 2001, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.


include(`../config.m4')

C INPUT PARAMETERS
C rp	i0
C up	i1
C n	i2

C This code uses a very deep software pipeline, due to the need for moving data
C back and forth between the integer registers and floating-point registers.
C
C A VIS variant of this code would make the pipeline less deep, since the
C masking now done in the integer unit could take place in the floating-point
C unit using the FAND instruction.  It would be possible to save several cycles
C too.
C
C On UltraSPARC 1 and 2, this code runs at 11 cycles/limb from the Dcache and
C not much slower from the Ecache.  It would perhaps be possible to shave off
C one cycle, but not easily.  We cannot do better than 10 cycles/limb with the
C used instructions, since we have 10 memory operations per limb.  But a VIS
C variant could run three cycles faster than the corresponding non-VIS code.

C This is non-pipelined code showing the algorithm:
C
C .Loop:
C	lduw	[up+0],%g4		C 00000000hhhhllll
C	sllx	%g4,16,%g3		C 0000hhhhllll0000
C	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
C	andn	%g2,%g5,%g2		C 0000hhhh0000llll
C	stx	%g2,[%fp+80]
C	ldd	[%fp+80],%f0
C	fitod	%f0,%f4			C hi16
C	fitod	%f1,%f6			C lo16
C	ld	[up+0],%f9
C	fxtod	%f8,%f2
C	fmuld	%f2,%f4,%f4
C	fmuld	%f2,%f6,%f6
C	fdtox	%f4,%f4
C	fdtox	%f6,%f6
C	std	%f4,[%fp-24]
C	std	%f6,[%fp-16]
C	ldx	[%fp-24],%g2
C	ldx	[%fp-16],%g1
C	sllx	%g2,16,%g2
C	add	%g2,%g1,%g1
C	stw	%g1,[rp+0]
C	srlx	%g1,32,%l0
C	stw	%l0,[rp+4]
C	add	up,4,up
C	subcc	n,1,n
C	bne,pt	%icc,.Loop
C	add	rp,8,rp

C Summary of the algorithm above.  Each 32-bit limb u is split in the integer
C unit into its 16-bit halves hi16 and lo16.  The full limb and both halves
C are converted to double, and the two products u*hi16 and u*lo16 are formed
C with fmuld.  Each product has at most 48 significant bits, so it fits the
C 53-bit mantissa of a double without rounding.  The products are converted
C back to 64-bit integers (p16 and p0) and combined as p16*2^16 + p0, which
C is the 64-bit square.  Its low 32 bits are stored at rp+0 and its high 32
C bits at rp+4.
C
C Scratch memory used by the pipelined code below:
C   [%fp+80] and [%fp+72]	alternating slots for the split operand
C   [%fp-24] and [%fp-16]	p16 and p0 for one limb
C   [%fp-40] and [%fp-32]	p16 and p0 for the following limb
C NOTE(review): the two operand slots are at positive offsets from %fp and
C so lie outside the 256 bytes allocated by the save below.  Presumably that
C area is reserved by the ABI for use by the callee - confirm.
C
C The integer load (lduw) is issued from [%i1+8] while the floating-point
C load (ld) of the same limb is issued later from [%i1], which is why up is
C first biased by -8.
C
C There is no check for n = 0; the first limb is loaded unconditionally.

define(`fanop',`fitod %f12,%f10')	dnl  A quasi nop running in the FA pipe

ASM_START()

	TEXT
	ALIGN(4)
C A zero word.  It is loaded once into %f8 and each limb is loaded into %f9,
C so that the double register %f8 holds the limb zero-extended to 64 bits
C for fxtod.
.Lnoll:
	.word	0

PROLOGUE(mpn_sqr_diagonal)
	save	%sp,-256,%sp

C Load the zero word into %f8, PC-relative when PIC is defined and through
C an absolute sethi/lo address otherwise.
ifdef(`PIC',
`.Lpc:	rd	%pc,%o7
	ld	[%o7+.Lnoll-.Lpc],%f8',
`	sethi	%hi(.Lnoll),%g1
	ld	[%g1+%lo(.Lnoll)],%f8')

	sethi	%hi(0xffff0000),%g5	C mask of bits 16-31, cleared by the andn
	add	%i1,-8,%i1		C bias up, see the note on lduw and ld above

C Pipeline wind-up.  The limb count is decremented after each limb is
C fetched.  When it reaches zero the code falls through, finishes the stages
C already started, sets up %l3-%l6 for the shared tail, and branches to the
C tail entry point matching the original n: .L1 for n = 1 up to .L4 for
C n = 4.  For n = 5 the tail is entered through .L5, and for larger n through
C the main loop.  The andn in each branch delay slot is executed on both
C paths.
	lduw	[%i1+8],%g4
	add	%i1,4,%i1		C s1_ptr++
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	bne,pt	%icc,.L_grt_1
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

C n = 1: compute the single pair of products, then go to .L1.
	add	%i1,4,%i1		C s1_ptr++
	stx	%g2,[%fp+80]
	ld	[%i1],%f9
	ldd	[%fp+80],%f0
	fxtod	%f8,%f2
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	fmuld	%f2,%f6,%f6
	fdtox	%f4,%f4
	fdtox	%f6,%f6
	std	%f4,[%fp-24]
	std	%f6,[%fp-16]

	add	%fp, 80, %l3
	add	%fp, -24, %l4
	add	%fp, 72, %l5
	b	.L1
	add	%fp, -40, %l6

.L_grt_1:
	stx	%g2,[%fp+80]
	lduw	[%i1+8],%g4
	add	%i1,4,%i1		C s1_ptr++
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	bne,pt	%icc,.L_grt_2
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

C n = 2: the products of limb 0 are stored, those of limb 1 are left in
C %f4 (already converted) and %f6 (converted at .L2).
	stx	%g2,[%fp+72]
	ld	[%i1],%f9
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	fxtod	%f8,%f2
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	ldd	[%fp+72],%f0
	fdtox	%f4,%f4
	fdtox	%f6,%f6
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
	std	%f6,[%fp-16]
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	fmuld	%f2,%f6,%f6
	fdtox	%f4,%f4

	add	%fp, 72, %l3
	add	%fp, -40, %l4
	add	%fp, 80, %l5
	b	.L2
	add	%fp, -24, %l6

.L_grt_2:
	stx	%g2,[%fp+72]
	lduw	[%i1+8],%g4
	ld	[%i1],%f9
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	fxtod	%f8,%f2
	bne,pt	%icc,.L_grt_3
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

C n = 3: the setup of %l3-%l6 is interleaved with the remaining stages.
	stx	%g2,[%fp+80]
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+72],%f0
	fdtox	%f4,%f4
	fdtox	%f6,%f6
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
	std	%f6,[%fp-16]
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	add	%fp, 80, %l3
	fmuld	%f2,%f6,%f6
	add	%fp, -24, %l4
	ldd	[%fp+80],%f0
	add	%fp, 72, %l5
	fdtox	%f4,%f4
	b	.L3
	add	%fp, -40, %l6

.L_grt_3:
	stx	%g2,[%fp+80]
	fitod	%f0,%f4
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+72],%f0
	fdtox	%f4,%f4
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	fdtox	%f6,%f6
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
	std	%f6,[%fp-16]
	bne,pt	%icc,.L_grt_4
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

C n = 4
	stx	%g2,[%fp+72]
	fitod	%f0,%f4
	fitod	%f1,%f6
	add	%fp, 72, %l3
	fmuld	%f2,%f4,%f4
	add	%fp, -40, %l4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	add	%fp, 80, %l5
	fdtox	%f4,%f4
	b	.L4
	add	%fp, -24, %l6

.L_grt_4:
	stx	%g2,[%fp+72]
	fitod	%f0,%f4
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	fdtox	%f4,%f4
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	fdtox	%f6,%f6
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-40]
	fxtod	%f8,%f2
	std	%f6,[%fp-32]
	be,pn	%icc,.L5		C n = 5: pipeline is full, go wind down
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

	b,a	.Loop			C annulled, so no delay slot instruction runs

C Main loop, unrolled twice.  The first half stores the split operand at
C [%fp+80], consumes the products at [%fp-24] and [%fp-16] and then refills
C them.  The second half does the same with [%fp+72], [%fp-40] and
C [%fp-32].  Each half stores one 64-bit square and decrements n.  The loop
C is left to .Lend from the middle or by falling through to .L5 at the end.
C Each group between the --- separators holds four instructions, padded
C with nop and fanop.  Presumably each group is meant to issue together in
C one cycle on UltraSPARC 1 and 2 - TODO confirm.
	.align	16
C --- LOOP BEGIN
.Loop:	nop
	nop
	stx	%g2,[%fp+80]
	fitod	%f0,%f4
C ---
	nop
	nop
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
C ---
	nop
	nop
	ldx	[%fp-24],%g2		C p16
	fanop
C ---
	nop
	nop
	ldx	[%fp-16],%g1		C p0
	fmuld	%f2,%f4,%f4
C ---
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
C ---
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+72],%f0
	fanop
C ---
	srlx	%g1,32,%l0
	nop
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
C ---
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	nop
	stw	%l0,[%i0-4]
	fdtox	%f6,%f6
C ---
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
C ---
	std	%f6,[%fp-16]
	andn	%g2,%g5,%g2		C 0000hhhh0000llll
	be,pn	%icc,.Lend
	fanop
C --- LOOP MIDDLE
	nop
	nop
	stx	%g2,[%fp+72]
	fitod	%f0,%f4
C ---
	nop
	nop
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
C ---
	nop
	nop
	ldx	[%fp-40],%g2		C p16
	fanop
C ---
	nop
	nop
	ldx	[%fp-32],%g1		C p0
	fmuld	%f2,%f4,%f4
C ---
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
C ---
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	fanop
C ---
	srlx	%g1,32,%l0
	nop
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
C ---
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	nop
	stw	%l0,[%i0-4]
	fdtox	%f6,%f6
C ---
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-40]
	fxtod	%f8,%f2
C ---
	std	%f6,[%fp-32]
	andn	%g2,%g5,%g2		C 0000hhhh0000llll
	bne,pt	%icc,.Loop
	fanop
C --- LOOP END

C Pipeline wind-down.  The tail addresses the scratch slots through
C registers so that one copy of the code serves every entry point:
C   %l3  operand slot written at .Ltail and reloaded before .L3
C   %l5  operand slot reloaded before .L4
C   %l4  product slots used at .Ltail, refilled at .L4, used after .L3,
C        refilled at .L2 and used at .L1
C   %l6  product slots used after .L4, refilled at .L3 and used after .L2
C .L5 is reached when the pipeline ends after the second loop half (or
C directly from the wind-up), .Lend when it ends after the first half.
.L5:	add	%fp, 80, %l3
	add	%fp, -24, %l4
	add	%fp, 72, %l5
	b	.Ltail
	add	%fp, -40, %l6

.Lend:	add	%fp, 72, %l3
	add	%fp, -40, %l4
	add	%fp, 80, %l5
	add	%fp, -24, %l6
.Ltail:	stx	%g2,[%l3]
	fitod	%f0,%f4
	fitod	%f1,%f6
	ldx	[%l4],%g2		C p16
	ldx	[%l4+8],%g1		C p0
	fmuld	%f2,%f4,%f4
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%l5],%f0
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
	stw	%l0,[%i0-4]
.L4:	fdtox	%f6,%f6
	std	%f4,[%l4]
	fxtod	%f8,%f2
	std	%f6,[%l4+8]

	fitod	%f0,%f4
	fitod	%f1,%f6
	ldx	[%l6],%g2		C p16
	ldx	[%l6+8],%g1		C p0
	fmuld	%f2,%f4,%f4
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	ldd	[%l3],%f0
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
	stw	%l0,[%i0-4]
.L3:	fdtox	%f6,%f6
	std	%f4,[%l6]
	fxtod	%f8,%f2
	std	%f6,[%l6+8]

	fitod	%f0,%f4
	fitod	%f1,%f6
	ldx	[%l4],%g2		C p16
	ldx	[%l4+8],%g1		C p0
	fmuld	%f2,%f4,%f4
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	fmuld	%f2,%f6,%f6
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
	stw	%l0,[%i0-4]
.L2:	fdtox	%f6,%f6
	std	%f4,[%l4]
	std	%f6,[%l4+8]

	ldx	[%l6],%g2		C p16
	ldx	[%l6+8],%g1		C p0
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	stw	%l0,[%i0-4]

C Last limb: only the integer combine and the two stores remain.
.L1:	ldx	[%l4],%g2		C p16
	ldx	[%l4+8],%g1		C p0
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	stw	%l0,[%i0-4]

	ret
	restore	%g0,%g0,%o0		C pop the window, leaving 0 in %o0 of the caller

EPILOGUE(mpn_sqr_diagonal)