dnl  IA-64 mpn_bdiv_dbm1c.

dnl  Copyright 2008, 2009 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C         cycles/limb
C Itanium:    4
C Itanium 2:  2

C TODO
C  * Optimize feed-in and wind-down code, both for speed and code size.
C INPUT PARAMETERS
C   rp   result pointer; one 8-byte limb is stored per source limb
C   up   source pointer; limbs are fetched with ldf8
C   n    limb count; n = 0 is not handled, since (n - 1) >> 2 becomes the
C        loop count
C   bd   multiplier applied to every source limb
C   r36  (no m4 alias) seeds the running value h kept in r15; presumably the
C        fifth C argument -- TODO confirm against the C prototype
C
C RESULT
C   r8 receives the final value of h.
C
C PER-LIMB STEP, with lo:hi the two halves of the product up[i] * bd
C   rp[i] = h - lo
C   h     = h - lo - hi - borrow      borrow = 1 when h < lo, unsigned
define(`rp', `r32')
define(`up', `r33')
define(`n',  `r34')
define(`bd', `r35')

ASM_START()
PROLOGUE(mpn_bdiv_dbm1c)
        .prologue
        .save           ar.lc, r2
        .body

C With the 32-bit ABI, widen both pointers and the count to 64 bits first.
ifdef(`HAVE_ABI_32',
`       addp4           rp = 0, rp              C M I
        addp4           up = 0, up              C M I
        zxt4            n = n                   C I
        ;;
')
{.mmb
        mov             r15 = r36               C M I   h = seed value
        ldf8            f9 = [up], 8            C M     first source limb
        nop.b           0                       C B
}
.Lcommon:
{.mii
        adds            r16 = -1, n             C M I   n - 1
        mov             r2 = ar.lc              C I0    keep incoming ar.lc
        and             r14 = 3, n              C M I   n mod 4
        ;;
}
{.mii
        setf.sig        f6 = bd                 C M2 M3 multiplier for xma
        shr.u           r31 = r16, 2            C I0    (n - 1) >> 2
        cmp.eq          p10, p0 = 0, r14        C M I   n mod 4 = 0
}
{.mii
        nop.m           0                       C M
        cmp.eq          p11, p0 = 2, r14        C M I   n mod 4 = 2
        cmp.eq          p12, p0 = 3, r14        C M I   n mod 4 = 3
        ;;
}
{.mii
        cmp.ne          p6, p7 = r0, r0         C M I   p6 = 0, p7 = 1
        mov.i           ar.lc = r31             C I0    loop count
        cmp.ne          p8, p9 = r0, r0         C M I   p8, p9 not read below
}
{.bbb
  (p10) br.dptk         .Lb00                   C B
  (p11) br.dptk         .Lb10                   C B
  (p12) br.dptk         .Lb11                   C B
        ;;
}

C Feed-in for n mod 4 = 1 (fall-through of the dispatch above).
C Not taking the first br.cloop means ar.lc was 0, so f9 is the only limb.
.Lb01:
        br.cloop.dptk   .grt1
        ;;
        xma.l           f38 = f9, f6, f0
        xma.hu          f39 = f9, f6, f0
        ;;
        getf.sig        r26 = f38
        getf.sig        r27 = f39
        br              .Lcj1

C More limbs follow: fetch the next four while the first product is formed.
.grt1:
        ldf8            f10 = [up], 8
        ;;
        ldf8            f11 = [up], 8
        ;;
        ldf8            f12 = [up], 8
        ;;
        xma.l           f38 = f9, f6, f0
        xma.hu          f39 = f9, f6, f0
        ;;
        ldf8            f13 = [up], 8
        ;;
        xma.l           f32 = f10, f6, f0
        xma.hu          f33 = f10, f6, f0
        br.cloop.dptk   .grt5

        ;;
        getf.sig        r26 = f38
        xma.l           f34 = f11, f6, f0
        xma.hu          f35 = f11, f6, f0
        ;;
        getf.sig        r27 = f39
        ;;
        getf.sig        r20 = f32
        xma.l           f36 = f12, f6, f0
        xma.hu          f37 = f12, f6, f0
        ;;
        getf.sig        r21 = f33
        ;;
        getf.sig        r22 = f34
        xma.l           f38 = f13, f6, f0
        xma.hu          f39 = f13, f6, f0
        br              .Lcj5

C Enough limbs remain to enter the unrolled loop at .LL01.
.grt5:
        ldf8            f10 = [up], 8
        ;;
        getf.sig        r26 = f38
        xma.l           f34 = f11, f6, f0
        xma.hu          f35 = f11, f6, f0
        ;;
        getf.sig        r27 = f39
        ldf8            f11 = [up], 8
        ;;
        getf.sig        r20 = f32
        xma.l           f36 = f12, f6, f0
        xma.hu          f37 = f12, f6, f0
        ;;
        getf.sig        r21 = f33
        ldf8            f12 = [up], 8
        ;;
        getf.sig        r22 = f34
        xma.l           f38 = f13, f6, f0
        xma.hu          f39 = f13, f6, f0
        br              .LL01

C Feed-in for n mod 4 = 2.
.Lb10:
        ldf8            f13 = [up], 8
        br.cloop.dptk   .grt2
        ;;

        xma.l           f36 = f9, f6, f0
        xma.hu          f37 = f9, f6, f0
        ;;
        xma.l           f38 = f13, f6, f0
        xma.hu          f39 = f13, f6, f0
        ;;
        getf.sig        r24 = f36
        ;;
        getf.sig        r25 = f37
        ;;
        getf.sig        r26 = f38
        ;;
        getf.sig        r27 = f39
        br              .Lcj2

.grt2:
        ldf8            f10 = [up], 8
        ;;
        ldf8            f11 = [up], 8
        ;;
        xma.l           f36 = f9, f6, f0
        xma.hu          f37 = f9, f6, f0
        ;;
        ldf8            f12 = [up], 8
        ;;
        xma.l           f38 = f13, f6, f0
        xma.hu          f39 = f13, f6, f0
        ;;
        ldf8            f13 = [up], 8
        ;;
        getf.sig        r24 = f36
        xma.l           f32 = f10, f6, f0
        xma.hu          f33 = f10, f6, f0
        br.cloop.dptk   .grt6

        getf.sig        r25 = f37
        ;;
        getf.sig        r26 = f38
        xma.l           f34 = f11, f6, f0
        xma.hu          f35 = f11, f6, f0
        ;;
        getf.sig        r27 = f39
        ;;
        getf.sig        r20 = f32
        xma.l           f36 = f12, f6, f0
        xma.hu          f37 = f12, f6, f0
        br              .Lcj6

C Enough limbs remain to enter the unrolled loop at .LL10.
.grt6:
        getf.sig        r25 = f37
        ldf8            f10 = [up], 8
        ;;
        getf.sig        r26 = f38
        xma.l           f34 = f11, f6, f0
        xma.hu          f35 = f11, f6, f0
        ;;
        getf.sig        r27 = f39
        ldf8            f11 = [up], 8
        ;;
        getf.sig        r20 = f32
        xma.l           f36 = f12, f6, f0
        xma.hu          f37 = f12, f6, f0
        br              .LL10


C Feed-in for n mod 4 = 3.
.Lb11:
        ldf8            f12 = [up], 8
        ;;
        ldf8            f13 = [up], 8
        br.cloop.dptk   .grt3
        ;;

        xma.l           f34 = f9, f6, f0
        xma.hu          f35 = f9, f6, f0
        ;;
        xma.l           f36 = f12, f6, f0
        xma.hu          f37 = f12, f6, f0
        ;;
        getf.sig        r22 = f34
        xma.l           f38 = f13, f6, f0
        xma.hu          f39 = f13, f6, f0
        ;;
        getf.sig        r23 = f35
        ;;
        getf.sig        r24 = f36
        ;;
        getf.sig        r25 = f37
        ;;
        getf.sig        r26 = f38
        br              .Lcj3

.grt3:
        ldf8            f10 = [up], 8
        ;;
        xma.l           f34 = f9, f6, f0
        xma.hu          f35 = f9, f6, f0
        ;;
        ldf8            f11 = [up], 8
        ;;
        xma.l           f36 = f12, f6, f0
        xma.hu          f37 = f12, f6, f0
        ;;
        ldf8            f12 = [up], 8
        ;;
        getf.sig        r22 = f34
        xma.l           f38 = f13, f6, f0
        xma.hu          f39 = f13, f6, f0
        ;;
        getf.sig        r23 = f35
        ldf8            f13 = [up], 8
        ;;
        getf.sig        r24 = f36
        xma.l           f32 = f10, f6, f0
        xma.hu          f33 = f10, f6, f0
        br.cloop.dptk   .grt7

        getf.sig        r25 = f37
        ;;
        getf.sig        r26 = f38
        xma.l           f34 = f11, f6, f0
        xma.hu          f35 = f11, f6, f0
        br              .Lcj7

C Enough limbs remain to enter the unrolled loop at .LL11.
.grt7:
        getf.sig        r25 = f37
        ldf8            f10 = [up], 8
        ;;
        getf.sig        r26 = f38
        xma.l           f34 = f11, f6, f0
        xma.hu          f35 = f11, f6, f0
        br              .LL11


C Feed-in for n mod 4 = 0.
.Lb00:
        ldf8            f11 = [up], 8
        ;;
        ldf8            f12 = [up], 8
        ;;
        ldf8            f13 = [up], 8
        br.cloop.dptk   .grt4
        ;;

        xma.l           f32 = f9, f6, f0
        xma.hu          f33 = f9, f6, f0
        ;;
        xma.l           f34 = f11, f6, f0
        xma.hu          f35 = f11, f6, f0
        ;;
        getf.sig        r20 = f32
        xma.l           f36 = f12, f6, f0
        xma.hu          f37 = f12, f6, f0
        ;;
        getf.sig        r21 = f33
        ;;
        getf.sig        r22 = f34
        xma.l           f38 = f13, f6, f0
        xma.hu          f39 = f13, f6, f0
        ;;
        getf.sig        r23 = f35
        ;;
        getf.sig        r24 = f36
        br              .Lcj4

C Either enter the unrolled loop at .LL00 or go straight to the 8-limb tail.
.grt4:
        xma.l           f32 = f9, f6, f0
        xma.hu          f33 = f9, f6, f0
        ;;
        ldf8            f10 = [up], 8
        ;;
        xma.l           f34 = f11, f6, f0
        xma.hu          f35 = f11, f6, f0
        ;;
        ldf8            f11 = [up], 8
        ;;
        getf.sig        r20 = f32
        xma.l           f36 = f12, f6, f0
        xma.hu          f37 = f12, f6, f0
        ;;
        getf.sig        r21 = f33
        ldf8            f12 = [up], 8
        ;;
        getf.sig        r22 = f34
        xma.l           f38 = f13, f6, f0
        xma.hu          f39 = f13, f6, f0
        ;;
        getf.sig        r23 = f35
        ldf8            f13 = [up], 8
        ;;
        getf.sig        r24 = f36
        xma.l           f32 = f10, f6, f0
        xma.hu          f33 = f10, f6, f0
        br.cloop.dptk   .LL00
        br              .Lcj8

C *** MAIN LOOP START ***
C Four limbs per iteration: four loads from up, four stores to rp.
C Product halves rotate through r20..r27 (low half in the even register,
C high half in the odd one); r16..r19 hold h - lo for the limb being stored.
C The .LLxx labels are the entry points used by the feed-in blocks above.
        ALIGN(32)
.Ltop:
C p6 and p7 come from one cmp.ltu, so exactly one of the paired r15
C updates in each group executes.
        .pred.rel "mutex",p6,p7
C       .mfi
        getf.sig        r24 = f36
        xma.l           f32 = f10, f6, f0
  (p6)  sub             r15 = r19, r27, 1       C h = h - lo - hi - 1
C       .mfi
        st8             [rp] = r19, 8
        xma.hu          f33 = f10, f6, f0
  (p7)  sub             r15 = r19, r27          C h = h - lo - hi
        ;;
.LL00:
C       .mfi
        getf.sig        r25 = f37
        nop.f           0
        cmp.ltu         p6, p7 = r15, r20       C borrow when h < lo
C       .mib
        ldf8            f10 = [up], 8
        sub             r16 = r15, r20          C h - lo
        nop.b           0
        ;;

C       .mfi
        getf.sig        r26 = f38
        xma.l           f34 = f11, f6, f0
  (p6)  sub             r15 = r16, r21, 1
C       .mfi
        st8             [rp] = r16, 8
        xma.hu          f35 = f11, f6, f0
  (p7)  sub             r15 = r16, r21
        ;;
.LL11:
C       .mfi
        getf.sig        r27 = f39
        nop.f           0
        cmp.ltu         p6, p7 = r15, r22
C       .mib
        ldf8            f11 = [up], 8
        sub             r17 = r15, r22
        nop.b           0
        ;;

C       .mfi
        getf.sig        r20 = f32
        xma.l           f36 = f12, f6, f0
  (p6)  sub             r15 = r17, r23, 1
C       .mfi
        st8             [rp] = r17, 8
        xma.hu          f37 = f12, f6, f0
  (p7)  sub             r15 = r17, r23
        ;;
.LL10:
C       .mfi
        getf.sig        r21 = f33
        nop.f           0
        cmp.ltu         p6, p7 = r15, r24
C       .mib
        ldf8            f12 = [up], 8
        sub             r18 = r15, r24
        nop.b           0
        ;;

C       .mfi
        getf.sig        r22 = f34
        xma.l           f38 = f13, f6, f0
  (p6)  sub             r15 = r18, r25, 1
C       .mfi
        st8             [rp] = r18, 8
        xma.hu          f39 = f13, f6, f0
  (p7)  sub             r15 = r18, r25
        ;;
.LL01:
C       .mfi
        getf.sig        r23 = f35
        nop.f           0
        cmp.ltu         p6, p7 = r15, r26
C       .mib
        ldf8            f13 = [up], 8
        sub             r19 = r15, r26
        br.cloop.sptk.few .Ltop
C *** MAIN LOOP END ***
        ;;

C Wind-down.  The digit in each .LcjK label equals the number of result
C limbs still to be stored from that label to the end.  No loads are issued
C from here on; the remaining products are formed from f10..f13.
        getf.sig        r24 = f36
        xma.l           f32 = f10, f6, f0
  (p6)  sub             r15 = r19, r27, 1
        st8             [rp] = r19, 8
        xma.hu          f33 = f10, f6, f0
  (p7)  sub             r15 = r19, r27
        ;;
.Lcj8:
        getf.sig        r25 = f37
        cmp.ltu         p6, p7 = r15, r20
        sub             r16 = r15, r20
        ;;
        getf.sig        r26 = f38
        xma.l           f34 = f11, f6, f0
  (p6)  sub             r15 = r16, r21, 1
        st8             [rp] = r16, 8
        xma.hu          f35 = f11, f6, f0
  (p7)  sub             r15 = r16, r21
        ;;
.Lcj7:
        getf.sig        r27 = f39
        cmp.ltu         p6, p7 = r15, r22
        sub             r17 = r15, r22
        ;;
        getf.sig        r20 = f32
        xma.l           f36 = f12, f6, f0
  (p6)  sub             r15 = r17, r23, 1
        st8             [rp] = r17, 8
        xma.hu          f37 = f12, f6, f0
  (p7)  sub             r15 = r17, r23
        ;;
.Lcj6:
        getf.sig        r21 = f33
        cmp.ltu         p6, p7 = r15, r24
        sub             r18 = r15, r24
        ;;
        getf.sig        r22 = f34
        xma.l           f38 = f13, f6, f0
  (p6)  sub             r15 = r18, r25, 1
        st8             [rp] = r18, 8
        xma.hu          f39 = f13, f6, f0
  (p7)  sub             r15 = r18, r25
        ;;
.Lcj5:
        getf.sig        r23 = f35
        cmp.ltu         p6, p7 = r15, r26
        sub             r19 = r15, r26
        ;;
        getf.sig        r24 = f36
  (p6)  sub             r15 = r19, r27, 1
        st8             [rp] = r19, 8
  (p7)  sub             r15 = r19, r27
        ;;
.Lcj4:
        getf.sig        r25 = f37
        cmp.ltu         p6, p7 = r15, r20
        sub             r16 = r15, r20
        ;;
        getf.sig        r26 = f38
  (p6)  sub             r15 = r16, r21, 1
        st8             [rp] = r16, 8
  (p7)  sub             r15 = r16, r21
        ;;
.Lcj3:
        getf.sig        r27 = f39
        cmp.ltu         p6, p7 = r15, r22
        sub             r17 = r15, r22
        ;;
  (p6)  sub             r15 = r17, r23, 1
        st8             [rp] = r17, 8
  (p7)  sub             r15 = r17, r23
        ;;
.Lcj2:
        cmp.ltu         p6, p7 = r15, r24
        sub             r18 = r15, r24
        ;;
  (p6)  sub             r15 = r18, r25, 1
        st8             [rp] = r18, 8
  (p7)  sub             r15 = r18, r25
        ;;
C Last limb: the final h goes to r8 instead of r15, the store does not
C advance rp, and ar.lc is restored from r2 before returning.
.Lcj1:
        cmp.ltu         p6, p7 = r15, r26
        sub             r19 = r15, r26
        ;;
  (p6)  sub             r8 = r19, r27, 1
        st8             [rp] = r19
  (p7)  sub             r8 = r19, r27
        mov             ar.lc = r2
        br.ret.sptk.many b0
EPILOGUE()
ASM_END()