dnl  IA-64 mpn_divrem_1 and mpn_preinv_divrem_1 -- Divide an mpn number by an
dnl  unnormalized limb.

dnl  Copyright 2002, 2004, 2005 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C         cycles/limb
C Itanium:    40-42
C Itanium 2:  29-30

C This was generated by gcc, then the loops were optimized.  The preinv entry
C point was shoehorned into the file.  Lots of things outside the loops could
C be streamlined.  It would probably be a good idea to merge the loops for
C normalized and unnormalized divisor, since the shifting stuff is done for
C free in parallel with other operations.  It would even be possible to merge
C all loops, if the ld8 were made conditional.

C TODO
C  * Consider delaying inversion for normalized mpn_divrem_1 entry till after
C    computing leading limb.
C  * Inline and interleave limb inversion code with loop setup code.

ASM_START()

C HP's assembler requires these declarations for importing mpn_invert_limb
	.global	mpn_invert_limb
	.type	mpn_invert_limb,@function

C INPUT PARAMETERS
C rp    = r32
C qxn   = r33
C up    = r34
C n     = r35
C vl    = r36
C vlinv = r37  (preinv only)
C cnt   = r38  (preinv only)

C REGISTER USE IN THE SHARED CODE (both entry points)
C r32      quotient store pointer; starts at the top limb of the
C          n + qxn limb quotient area and is post-decremented by each st8
C r34      source pointer, walks down from the high limb of up
C r35      remaining-limb count, later the value loaded into ar.lc
C r36      divisor, shifted left by r40 on the unnormalized paths
C r38      running remainder (kept in f7 as well inside the loops)
C r40      shift count; the final remainder is shifted right by it
C r8       inverse of the divisor on entry to the loops, return value at exit
C f6       inverse    f10  divisor    f12  the constant 1
C r41/r42/r44  saved b0 / ar.pfs / ar.lc, restored at .Lret

C mpn_preinv_divrem_1
C Entry point for callers that supply the inverse (vlinv) and the shift
C count (cnt) themselves.  It only handles the leading limb and then branches
C into the code following PROLOGUE(mpn_divrem_1): .Lpn for a divisor with the
C top bit set, .Lpu otherwise, or .L435 when only fraction limbs remain.
C NOTE(review): up[n-1] is loaded unconditionally, so n >= 1 is assumed here;
C confirm against the callers.
PROLOGUE(mpn_preinv_divrem_1)
	.prologue
	.save	ar.pfs, r42
	alloc		r42 = ar.pfs, 7, 8, 1, 0
	.save	ar.lc, r44
	mov		r44 = ar.lc
	.save	rp, r41
	mov		r41 = b0
	.body
C With HAVE_ABI_32 the pointer arguments go through addp4 and the two counts
C are sign-extended from 32 bits.
ifdef(`HAVE_ABI_32',
`	addp4		r32 = 0, r32
	sxt4		r33 = r33
	addp4		r34 = 0, r34
	sxt4		r35 = r35
	;;
')
	mov		r40 = r38		C keep cnt in r40; r38 becomes the remainder
	shladd		r34 = r35, 3, r34
	;;
	adds		r34 = -8, r34		C r34 = address of up[n-1]
	;;
	ld8		r39 = [r34], -8		C r39 = high limb, r34 steps down one limb
	;;

	add		r15 = r35, r33
	;;
	mov		r8 = r37		C the loops read the inverse from r8
	shladd		r32 = r15, 3, r32	C r32 = rp + n + qxn
	cmp.le		p8, p0 = 0, r36		C signed test: p8 when the top bit of vl is clear
	;;
	adds		r32 = -8, r32		C r32 = rp + n + qxn - 1
	cmp.leu		p6, p7 = r36, r39	C p6 when vl <= high limb (vl not yet shifted)
   (p8)	br.cond.dpnt	.Lpunnorm
	;;

C Top bit of vl set: the top quotient limb is 1 or 0, by one compare/subtract.
   (p6)	addl		r15 = 1, r0
   (p7)	mov		r15 = r0
	;;
   (p6)	sub		r38 = r39, r36
   (p7)	mov		r38 = r39
	st8		[r32] = r15, -8
	adds		r35 = -2, r35		C un -= 2
	br	.Lpn

C Top bit of vl clear: shift the divisor by cnt and prime the remainder.
.Lpunnorm:
   (p6)	add		r34 = 8, r34		C undo the post-decrement so .Lpu reloads the high limb
	mov		r38 = 0			C r = 0
	shl		r36 = r36, r40
   (p6)	br.cond.dptk	.Lpu
	;;
	shl		r38 = r39, r40		C r = ahigh << cnt
	cmp.ne		p8, p0 = 1, r35		C p8 when more than one source limb
	st8		[r32] = r0, -8		C top quotient limb is 0 on this path
	adds		r35 = -1, r35		C un--
   (p8)	br.cond.dpnt	.Lpu

C n was 1: no integer limbs left, set up f6/f12 and go to the fraction loop.
	mov		r23 = 1
	;;
	setf.sig	f6 = r8
	setf.sig	f12 = r23
	br		.L435
EPILOGUE()


C mpn_divrem_1
C Divides the n limbs at up by vl, storing n + qxn quotient limbs downwards
C from rp + n + qxn - 1, and leaves the remainder in r8.  Returns 0 at once
C when n + qxn is zero.  The inverse of the divisor is obtained by calling
C mpn_invert_limb with the (shifted) divisor in r45; the value it leaves in r8
C is used as the inverse.
PROLOGUE(mpn_divrem_1)
	.prologue
	.save	ar.pfs, r42
	alloc		r42 = ar.pfs, 5, 8, 1, 0
	.save	ar.lc, r44
	mov		r44 = ar.lc
	.save	rp, r41
	mov		r41 = b0
	.body
C With HAVE_ABI_32 the pointer arguments go through addp4 and the two counts
C are sign-extended from 32 bits.
ifdef(`HAVE_ABI_32',
`	addp4		r32 = 0, r32
	sxt4		r33 = r33
	addp4		r34 = 0, r34
	sxt4		r35 = r35
	;;
')
	mov		r38 = r0		C r = 0
	add		r15 = r35, r33		C r15 = n + qxn
	;;
	cmp.ne		p6, p7 = 0, r15
	;;
   (p7)	mov		r8 = r0
   (p7)	br.cond.dpnt	.Lret			C nothing to do, return 0
	shladd		r14 = r15, 3, r32	C r14 = rp + n + qxn
	cmp.le		p6, p7 = 0, r36		C signed test: p6 when the top bit of vl is clear
	;;
	adds		r32 = -8, r14		C r32 = rp + n + qxn - 1
   (p6)	br.cond.dpnt	.Lunnorm
	cmp.eq		p6, p7 = 0, r35
   (p6)	br.cond.dpnt	.L179			C n == 0: fraction limbs only
	shladd		r14 = r35, 3, r34
	;;
	adds		r14 = -8, r14		C r14 = address of up[n-1]
	adds		r35 = -1, r35
	;;
	ld8		r38 = [r14]
	;;
	cmp.leu		p6, p7 = r36, r38	C p6 when vl <= high limb
	;;
   (p6)	addl		r15 = 1, r0
   (p7)	mov		r15 = r0
	;;
	st8		[r32] = r15, -8		C top quotient limb, 1 or 0
   (p6)	sub		r38 = r38, r36

.L179:
	mov		r45 = r36		C argument for mpn_invert_limb
	adds		r35 = -1, r35
	br.call.sptk.many b0 = mpn_invert_limb
	;;
	shladd		r34 = r35, 3, r34	C r34 = address of the next source limb
C Shared entry for a divisor with the top bit set (also reached from
C mpn_preinv_divrem_1).  Expects r8 = inverse, r38 = remainder, r35 = number
C of remaining source limbs minus 1.
.Lpn:
	mov		r23 = 1
	;;
	setf.sig	f6 = r8
	setf.sig	f12 = r23
	cmp.le		p6, p7 = 0, r35
	mov		r40 = 0			C no shift to undo at .Lend4
   (p7)	br.cond.dpnt	.L435			C no integer limbs left
	setf.sig	f10 = r36
	mov		ar.lc = r35
	setf.sig	f7 = r38
	;;
	sub		r28 = -1, r36		C r28 is rewritten in the loop before it is read
C Develop quotient limbs for normalized divisor
.Loop1:		C 00				C q=r18 nh=r38/f7
	ld8		r20 = [r34], -8
	xma.hu		f11 = f7, f6, f0
	;;	C 04
	xma.l		f8 = f11, f12, f7	C q = q + nh
	;;	C 08
	getf.sig	r18 = f8
	xma.hu		f9 = f8, f10, f0
	xma.l		f8 = f8, f10, f0
	;;	C 12
	getf.sig	r16 = f9
		C 13
	getf.sig	r15 = f8
	;;	C 18
	cmp.ltu		p6, p7 = r20, r15
	sub		r15 = r20, r15
	sub		r16 = r38, r16
	;;	C 19
   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0?
   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0?
   (p6)	add		r16 = -1, r16
   (p0)	cmp.ne.unc	p6, p7 = r0, r0
	;;	C 20
   (p8)	cmp.ltu		p6, p7 = r15, r36
   (p8)	sub		r15 = r15, r36
   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
	;;	C 21
	.pred.rel "mutex",p6,p7
   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0 still?
   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0 still?
	cmp.ltu		p6, p7 = r15, r36	C speculative
	sub		r28 = r15, r36		C speculative, just for cmp
	;;	C 22
   (p8)	cmp.ltu		p6, p7 = r28, r36	C redo last cmp if needed
   (p8)	mov		r15 = r28
   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
	;;	C 23
   (p6)	setf.sig	f7 = r15
   (p7)	sub		r15 = r15, r36
   (p7)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
	;;	C 24
   (p7)	setf.sig	f7 = r15
	st8		[r32] = r18, -8
	mov		r38 = r15
	br.cloop.dptk	.Loop1
		C 29/30
	br.sptk		.L435
	;;
C Divisor with the top bit clear (mpn_divrem_1 entry only).  r15 still holds
C n + qxn here.
.Lunnorm:
	mux1		r16 = r36, @rev		C byte-reversed divisor, used at .L322
	cmp.eq		p6, p7 = 0, r35
   (p6)	br.cond.dpnt	.L322			C n == 0
	shladd		r34 = r35, 3, r34
	;;
	adds		r34 = -8, r34		C r34 = address of up[n-1]
	;;
	ld8		r39 = [r34]
	;;
	cmp.leu		p6, p7 = r36, r39
   (p6)	br.cond.dptk	.L322			C vl <= high limb: keep r = 0, .Lpu reloads this limb
	adds		r34 = -8, r34
	;;
	mov		r38 = r39		C high limb < vl: it becomes the remainder
	;;
	cmp.ne		p6, p7 = 1, r15
	st8		[r32] = r0, -8		C and the top quotient limb is 0
	;;
   (p7)	mov		r8 = r38
   (p7)	br.cond.dpnt	.Lret			C n + qxn == 1: that limb is the result
	adds		r35 = -1, r35
C Compute r40 = number of leading zero bits of the divisor: czx1.l on
C (x | -x) of the byte-reversed divisor locates the top nonzero byte, then
C that byte is narrowed by 4, 2 and 1 bits.
.L322:
	sub		r14 = r0, r16
	;;
	or		r14 = r16, r14
	;;
	mov		r16 = -8
	czx1.l		r14 = r14
	;;
	shladd		r16 = r14, 3, r16	C r16 = bit offset of the top nonzero byte
	;;
	shr.u		r14 = r36, r16		C r14 = that byte
	;;
	cmp.geu		p6, p7 = 15, r14
	;;
   (p7)	shr.u		r14 = r14, 4
   (p7)	adds		r16 = 4, r16
	;;
	cmp.geu		p6, p7 = 3, r14
	;;
   (p7)	shr.u		r14 = r14, 2
   (p7)	adds		r16 = 2, r16
	;;
	tbit.nz		p6, p7 = r14, 1
	;;
	.pred.rel "mutex",p6,p7
  (p6)	sub		r40 = 62, r16
  (p7)	sub		r40 = 63, r16
	;;
	shl		r45 = r36, r40		C argument for mpn_invert_limb: shifted divisor
	shl		r36 = r36, r40
	shl		r38 = r38, r40
	br.call.sptk.many b0 = mpn_invert_limb
	;;
C Shared entry for a shifted divisor (also reached from mpn_preinv_divrem_1).
C Expects r8 = inverse, r36 = shifted divisor, r38 = shifted remainder,
C r40 = shift count, r35 = number of source limbs still to read.
.Lpu:
	mov		r23 = 1
	;;
	setf.sig	f6 = r8
	setf.sig	f12 = r23
	cmp.eq		p6, p7 = 0, r35
   (p6)	br.cond.dpnt	.L435			C no integer limbs left
	sub		r16 = 64, r40		C complementary shift count
	adds		r35 = -2, r35
	;;
	ld8		r39 = [r34], -8
	cmp.le		p6, p7 = 0, r35
	;;
	shr.u		r14 = r39, r16
	;;
	or		r38 = r14, r38		C bring the top bits of the limb into r
   (p7)	br.cond.dpnt	.Lend3			C only this one limb left
	;;
	mov		r22 = r16
	setf.sig	f10 = r36
	setf.sig	f7 = r38
	mov		ar.lc = r35
	;;
C Develop quotient limbs for unnormalized divisor
.Loop3:
	ld8		r14 = [r34], -8
	xma.hu		f11 = f7, f6, f0
	;;
	xma.l		f8 = f11, f12, f7	C q = q + nh
	;;
	getf.sig	r18 = f8
	xma.hu		f9 = f8, f10, f0
	shl		r20 = r39, r40
	xma.l		f8 = f8, f10, f0
	shr.u		r24 = r14, r22
	;;
	getf.sig	r16 = f9
	getf.sig	r15 = f8
	or		r20 = r24, r20		C r20 = shifted limb spanning r39 and r14
	;;
	cmp.ltu		p6, p7 = r20, r15
	sub		r15 = r20, r15
	sub		r16 = r38, r16
	;;
   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0?
   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0?
   (p6)	add		r16 = -1, r16
   (p0)	cmp.ne.unc	p6, p7 = r0, r0
	;;
   (p8)	cmp.ltu		p6, p7 = r15, r36
   (p8)	sub		r15 = r15, r36
   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
	;;
	.pred.rel "mutex",p6,p7
   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0 still?
   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0 still?
	cmp.ltu		p6, p7 = r15, r36	C speculative
	sub		r28 = r15, r36		C speculative, just for cmp
	;;
   (p8)	cmp.ltu		p6, p7 = r28, r36	C redo last cmp if needed
   (p8)	mov		r15 = r28
   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
	;;
   (p6)	setf.sig	f7 = r15
   (p7)	sub		r15 = r15, r36
   (p7)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
	;;
   (p7)	setf.sig	f7 = r15
	st8		[r32] = r18, -8
	mov		r39 = r14
	mov		r38 = r15
	br.cloop.dptk	.Loop3
	;;
C Last integer limb of the unnormalized case: same step as .Loop3 but with no
C further source limb to load or shift in.
.Lend3:
	setf.sig	f10 = r36
	setf.sig	f7 = r38
	;;
	xma.hu		f11 = f7, f6, f0
	;;
	xma.l		f8 = f11, f12, f7	C q = q + nh
	;;
	getf.sig	r18 = f8
	xma.hu		f9 = f8, f10, f0
	shl		r20 = r39, r40
	xma.l		f8 = f8, f10, f0
	;;
	getf.sig	r16 = f9
	getf.sig	r15 = f8
	;;
	cmp.ltu		p6, p7 = r20, r15
	sub		r15 = r20, r15
	sub		r16 = r38, r16
	;;
   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0?
   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0?
   (p6)	add		r16 = -1, r16
   (p0)	cmp.ne.unc	p6, p7 = r0, r0
	;;
   (p8)	cmp.ltu		p6, p7 = r15, r36
   (p8)	sub		r15 = r15, r36
   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
	;;
	.pred.rel "mutex",p6,p7
   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0 still?
   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0 still?
	;;
   (p8)	sub		r15 = r15, r36
   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
	;;
	cmp.ltu		p6, p7 = r15, r36
	;;
   (p7)	sub		r15 = r15, r36
   (p7)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
	;;
	st8		[r32] = r18, -8
	mov		r38 = r15
C Fraction limbs: qxn further quotient limbs are developed with a zero low
C limb in place of a source limb.  Skipped when qxn < 1.
.L435:
	adds		r35 = -1, r33
	cmp.le		p6, p7 = 1, r33
   (p7)	br.cond.dpnt	.Lend4
	;;
	setf.sig	f7 = r38
	setf.sig	f10 = r36
	mov		ar.lc = r35
	;;
.Loop4:
	xma.hu		f11 = f7, f6, f0
	;;
	xma.l		f8 = f11, f12, f7	C q = q + nh
	;;
	getf.sig	r18 = f8
	xma.hu		f9 = f8, f10, f0
	xma.l		f8 = f8, f10, f0
	;;
	getf.sig	r16 = f9
	getf.sig	r15 = f8
	;;
	cmp.ltu		p6, p7 = 0, r15
	sub		r15 = 0, r15
	sub		r16 = r38, r16
	;;
   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0?
   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0?
   (p6)	add		r16 = -1, r16
   (p0)	cmp.ne.unc	p6, p7 = r0, r0
	;;
   (p8)	cmp.ltu		p6, p7 = r15, r36
   (p8)	sub		r15 = r15, r36
   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
	;;
	.pred.rel "mutex",p6,p7
   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0 still?
   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0 still?
	cmp.ltu		p6, p7 = r15, r36	C speculative
	sub		r28 = r15, r36		C speculative, just for cmp
	;;
   (p8)	cmp.ltu		p6, p7 = r28, r36	C redo last cmp if needed
   (p8)	mov		r15 = r28
   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
	;;
   (p6)	setf.sig	f7 = r15
   (p7)	sub		r15 = r15, r36
   (p7)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
	;;
   (p7)	setf.sig	f7 = r15
	st8		[r32] = r18, -8
	mov		r38 = r15
	br.cloop.dptk	.Loop4
	;;
.Lend4:
	shr.u		r8 = r38, r40		C return value: remainder with the shift undone
C Common exit: restore the registers saved in the prologue and return.
.Lret:
	mov		ar.pfs = r42
	mov		ar.lc = r44
	mov		b0 = r41
	br.ret.sptk.many b0
EPILOGUE()
ASM_END()