mul_basecase.asm revision 1.1.1.1
dnl  x86 mpn_mul_basecase -- Multiply two limb vectors and store the result in
dnl  a third limb vector.

dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
dnl
dnl  Copyright 2011 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C TODO
C  * Check if 'jmp N(%esp)' is well-predicted enough to allow us to combine the
C    4 large loops into one; we could use it for the outer loop branch.
C  * Optimise code outside of inner loops.
C  * Write combined addmul_1 feed-in and wind-down code, and use it when
C    iterating each outer loop.  ("Overlapping software pipelining")
C  * Postpone push of ebx until we know vn > 1.  Perhaps use caller-saves regs
C    for inlined mul_1, allowing us to postpone all pushes.
C  * Perhaps write special code for vn <= un < M, for some small M.

C void mpn_mul_basecase (mp_ptr wp,
C                        mp_srcptr xp, mp_size_t xn,
C                        mp_srcptr yp, mp_size_t yn);
C
C Multiplies the xn-limb vector at xp by the yn-limb vector at yp and writes
C the xn+yn limb product at wp.  Limbs are 32 bits wide.
C
C All five arguments are read from the stack.  After the four pushes in the
C prologue they are found at 20(%esp) wp, 24(%esp) xp, 28(%esp) xn,
C 32(%esp) yp and 36(%esp) yn.  The code below calls them rp, up, un, vp, vn.
C
C Structure
C  1. First pass (labels lm0-lm3, of0-of2, m3):  rp[0..un] = up[] * vp[0].
C     There are four copies of a 4-way unrolled loop, one per value of
C     un mod 4, each entered in the middle so that no separate remainder
C     loop is needed.  mm6 carries the running 64-bit sum; its low half is
C     stored and the high half is shifted down as the carry into the next limb.
C  2. Outer passes (labels ol0-ol3, inner loops la0-la3):  for each further
C     limb of vp, up[] times that limb is added into rp one limb position
C     higher than in the previous pass, and one new top limb is stored.
C     Products are formed in mm0/mm1 and moved to eax/ebx (low halves) and
C     edx (high halves); the carry is propagated with an add/adc chain.
C
C Register use
C   rp  %edi   destination pointer
C   up  %esi   pointer into the first source operand
C   un  %ecx   limb count, later a negative loop counter
C   vp  %ebp   pointer to the current limb of the second source operand
C   %eax, %ebx, %edx   scratch for the add/adc chain
C   %mm7   current limb of vp;  %mm0, %mm1  products;  %mm6  first pass sum
C
C Notes
C  * edi, esi, ebx and ebp are pushed on entry and popped at L(done).
C  * vn is counted down in place in its argument slot at 36(%esp).
C  * up and un are reloaded from 24(%esp) and 28(%esp) at the start of every
C    outer pass, so those argument slots are left unmodified.
C  * No operand sizes are checked: the first limb of each operand is loaded
C    unconditionally and vn is decremented before it is tested.
C  * NOTE(review): presumably callers guarantee un >= vn >= 1.  With un = 1
C    and vn > 1 the L(ol1) path would not terminate.  Confirm against the
C    documented operand-size requirements.

define(`rp', `%edi')
define(`up', `%esi')
define(`un', `%ecx')
define(`vp', `%ebp')
define(`vn', `36(%esp)')

	TEXT
	ALIGN(16)
PROLOGUE(mpn_mul_basecase)
	push	%edi
	push	%esi
	push	%ebx
	push	%ebp
	mov	20(%esp), rp
	mov	24(%esp), up
	mov	28(%esp), un
	mov	32(%esp), vp

	movd	(up), %mm0
	movd	(vp), %mm7		C mm7 = vp[0]
	pmuludq	%mm7, %mm0		C mm0 = up[0] * vp[0], full 64 bits
	pxor	%mm6, %mm6		C mm6 = 0, running sum of the first pass

	mov	un, %eax
	and	$3, %eax		C eax = un mod 4 selects the code path
	jz	L(of0)			C un mod 4 = 0
	cmp	$2, %eax
	jc	L(of1)			C un mod 4 = 1
	jz	L(of2)			C un mod 4 = 2

C ================================================================
C un mod 4 = 3
	jmp	L(m3)
	ALIGN(16)
L(lm3):	movd	-4(up), %mm0
	pmuludq	%mm7, %mm0
	psrlq	$32, %mm6
	lea	16(rp), rp
	paddq	%mm0, %mm6
	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -4(rp)
	psrlq	$32, %mm6
L(m3):	paddq	%mm0, %mm6
	movd	4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, (rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, 4(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	sub	$4, un			C sets the flags tested by ja below
	movd	%mm6, 8(rp)
	lea	16(up), up		C movd and lea leave the flags alone
	ja	L(lm3)			C loop while more than 4 limbs were left

	psrlq	$32, %mm6
	movd	%mm6, 12(rp)		C top limb of the first pass

	decl	vn
	jz	L(done)			C vp had a single limb
	lea	-8(rp), rp		C bias rp for the addressing in L(ol3)

L(ol3):	mov	28(%esp), un
	neg	un
	lea	4(vp), vp
	movd	(vp), %mm7		C read next V limb
	mov	24(%esp), up
	lea	16(rp,un,4), rp		C rewind rp, one limb above previous pass

	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	sar	$2, un			C negative count of inner loop passes
	movd	4(up), %mm1
	movd	%mm0, %ebx
	pmuludq	%mm7, %mm1
	lea	-8(up), up
	xor	%edx, %edx	C zero edx and CF
	jmp	L(a3)

L(la3):	movd	4(up), %mm1
	adc	$0, %edx
	add	%eax, 12(rp)
	movd	%mm0, %ebx
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %ebx
	movd	%mm0, %edx
	movd	%mm1, %eax
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	adc	$0, %edx
	add	%ebx, (rp)
	psrlq	$32, %mm1
	adc	%edx, %eax
	movd	%mm1, %edx
	movd	%mm0, %ebx
	movd	12(up), %mm1
	pmuludq	%mm7, %mm1
	adc	$0, %edx
	add	%eax, 4(rp)
L(a3):	psrlq	$32, %mm0
	adc	%edx, %ebx
	movd	%mm0, %edx
	movd	%mm1, %eax
	lea	16(up), up
	movd	(up), %mm0
	adc	$0, %edx
	add	%ebx, 8(rp)
	psrlq	$32, %mm1
	adc	%edx, %eax
	movd	%mm1, %edx
	pmuludq	%mm7, %mm0
	inc	un			C inc keeps CF for the adc that follows
	jnz	L(la3)

C Wind-down: add the last two products and store the new top limb.
	adc	un, %edx	C un is zero here
	add	%eax, 12(rp)
	movd	%mm0, %ebx
	psrlq	$32, %mm0
	adc	%edx, %ebx
	movd	%mm0, %eax
	adc	un, %eax
	add	%ebx, 16(rp)
	adc	un, %eax
	mov	%eax, 20(rp)

	decl	vn
	jnz	L(ol3)
	jmp	L(done)

C ================================================================
C un mod 4 = 0
	ALIGN(16)
L(lm0):	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	psrlq	$32, %mm6
	lea	16(rp), rp
L(of0):	paddq	%mm0, %mm6
	movd	4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, (rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, 4(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	12(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, 8(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	sub	$4, un			C sets the flags tested by ja below
	movd	%mm6, 12(rp)
	lea	16(up), up
	ja	L(lm0)

	psrlq	$32, %mm6
	movd	%mm6, 16(rp)		C top limb of the first pass

	decl	vn
	jz	L(done)			C vp had a single limb
	lea	-4(rp), rp		C bias rp for the addressing in L(ol0)

L(ol0):	mov	28(%esp), un
	neg	un
	lea	4(vp), vp
	movd	(vp), %mm7		C read next V limb
	mov	24(%esp), up
	lea	20(rp,un,4), rp		C rewind rp, one limb above previous pass

	movd	(up), %mm1
	pmuludq	%mm7, %mm1
	sar	$2, un			C negative count of inner loop passes
	movd	4(up), %mm0
	lea	-4(up), up
	movd	%mm1, %eax
	pmuludq	%mm7, %mm0
	xor	%edx, %edx	C zero edx and CF
	jmp	L(a0)

L(la0):	movd	4(up), %mm1
	adc	$0, %edx
	add	%eax, 12(rp)
	movd	%mm0, %ebx
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %ebx
	movd	%mm0, %edx
	movd	%mm1, %eax
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	adc	$0, %edx
	add	%ebx, (rp)
L(a0):	psrlq	$32, %mm1
	adc	%edx, %eax
	movd	%mm1, %edx
	movd	%mm0, %ebx
	movd	12(up), %mm1
	pmuludq	%mm7, %mm1
	adc	$0, %edx
	add	%eax, 4(rp)
	psrlq	$32, %mm0
	adc	%edx, %ebx
	movd	%mm0, %edx
	movd	%mm1, %eax
	lea	16(up), up
	movd	(up), %mm0
	adc	$0, %edx
	add	%ebx, 8(rp)
	psrlq	$32, %mm1
	adc	%edx, %eax
	movd	%mm1, %edx
	pmuludq	%mm7, %mm0
	inc	un			C inc keeps CF for the adc that follows
	jnz	L(la0)

C Wind-down: add the last two products and store the new top limb.
	adc	un, %edx	C un is zero here
	add	%eax, 12(rp)
	movd	%mm0, %ebx
	psrlq	$32, %mm0
	adc	%edx, %ebx
	movd	%mm0, %eax
	adc	un, %eax
	add	%ebx, 16(rp)
	adc	un, %eax
	mov	%eax, 20(rp)

	decl	vn
	jnz	L(ol0)
	jmp	L(done)

C ================================================================
C un mod 4 = 1
	ALIGN(16)
L(lm1):	movd	-12(up), %mm0
	pmuludq	%mm7, %mm0
	psrlq	$32, %mm6
	lea	16(rp), rp
	paddq	%mm0, %mm6
	movd	-8(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -12(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	-4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -8(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -4(rp)
	psrlq	$32, %mm6
L(of1):	paddq	%mm0, %mm6
	sub	$4, un			C sets the flags tested by ja below
	movd	%mm6, (rp)
	lea	16(up), up
	ja	L(lm1)

	psrlq	$32, %mm6
	movd	%mm6, 4(rp)		C top limb of the first pass

	decl	vn
	jz	L(done)			C vp had a single limb
	lea	-16(rp), rp		C bias rp for the addressing in L(ol1)

L(ol1):	mov	28(%esp), un
	neg	un
	lea	4(vp), vp
	movd	(vp), %mm7		C read next V limb
	mov	24(%esp), up
	lea	24(rp,un,4), rp		C rewind rp, one limb above previous pass

	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	sar	$2, un
	movd	%mm0, %ebx
	movd	4(up), %mm1
	pmuludq	%mm7, %mm1
	xor	%edx, %edx	C zero edx and CF
	inc	un			C adjust loop count; inc keeps CF clear
	jmp	L(a1)

L(la1):	movd	4(up), %mm1
	adc	$0, %edx
	add	%eax, 12(rp)
	movd	%mm0, %ebx
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
L(a1):	psrlq	$32, %mm0
	adc	%edx, %ebx
	movd	%mm0, %edx
	movd	%mm1, %eax
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	adc	$0, %edx
	add	%ebx, (rp)
	psrlq	$32, %mm1
	adc	%edx, %eax
	movd	%mm1, %edx
	movd	%mm0, %ebx
	movd	12(up), %mm1
	pmuludq	%mm7, %mm1
	adc	$0, %edx
	add	%eax, 4(rp)
	psrlq	$32, %mm0
	adc	%edx, %ebx
	movd	%mm0, %edx
	movd	%mm1, %eax
	lea	16(up), up
	movd	(up), %mm0
	adc	$0, %edx
	add	%ebx, 8(rp)
	psrlq	$32, %mm1
	adc	%edx, %eax
	movd	%mm1, %edx
	pmuludq	%mm7, %mm0
	inc	un			C inc keeps CF for the adc that follows
	jnz	L(la1)

C Wind-down: add the last two products and store the new top limb.
	adc	un, %edx	C un is zero here
	add	%eax, 12(rp)
	movd	%mm0, %ebx
	psrlq	$32, %mm0
	adc	%edx, %ebx
	movd	%mm0, %eax
	adc	un, %eax
	add	%ebx, 16(rp)
	adc	un, %eax
	mov	%eax, 20(rp)

	decl	vn
	jnz	L(ol1)
	jmp	L(done)

C ================================================================
C un mod 4 = 2
	ALIGN(16)
L(lm2):	movd	-8(up), %mm0
	pmuludq	%mm7, %mm0
	psrlq	$32, %mm6
	lea	16(rp), rp
	paddq	%mm0, %mm6
	movd	-4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -8(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -4(rp)
	psrlq	$32, %mm6
L(of2):	paddq	%mm0, %mm6
	movd	4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, (rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	sub	$4, un			C sets the flags tested by ja below
	movd	%mm6, 4(rp)
	lea	16(up), up
	ja	L(lm2)

	psrlq	$32, %mm6
	movd	%mm6, 8(rp)		C top limb of the first pass

	decl	vn
	jz	L(done)			C vp had a single limb
	lea	-12(rp), rp		C bias rp for the addressing in L(ol2)

L(ol2):	mov	28(%esp), un
	neg	un
	lea	4(vp), vp
	movd	(vp), %mm7		C read next V limb
	mov	24(%esp), up
	lea	12(rp,un,4), rp		C rewind rp, one limb above previous pass

	movd	(up), %mm1
	pmuludq	%mm7, %mm1
	sar	$2, un			C negative count of inner loop passes
	movd	4(up), %mm0		C multiplied by mm7 after L(lo2)
	lea	4(up), up
	movd	%mm1, %eax
	xor	%edx, %edx	C zero edx and CF
	jmp	L(lo2)

L(la2):	movd	4(up), %mm1
	adc	$0, %edx
	add	%eax, 12(rp)
	movd	%mm0, %ebx
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %ebx
	movd	%mm0, %edx
	movd	%mm1, %eax
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	adc	$0, %edx
	add	%ebx, (rp)
	psrlq	$32, %mm1
	adc	%edx, %eax
	movd	%mm1, %edx
	movd	%mm0, %ebx
	movd	12(up), %mm1
	pmuludq	%mm7, %mm1
	adc	$0, %edx
	add	%eax, 4(rp)
	psrlq	$32, %mm0
	adc	%edx, %ebx
	movd	%mm0, %edx
	movd	%mm1, %eax
	lea	16(up), up
	movd	(up), %mm0
	adc	$0, %edx
	add	%ebx, 8(rp)
L(lo2):	psrlq	$32, %mm1
	adc	%edx, %eax
	movd	%mm1, %edx
	pmuludq	%mm7, %mm0
	inc	un			C inc keeps CF for the adc that follows
	jnz	L(la2)

C Wind-down: add the last two products and store the new top limb.
	adc	un, %edx	C un is zero here
	add	%eax, 12(rp)
	movd	%mm0, %ebx
	psrlq	$32, %mm0
	adc	%edx, %ebx
	movd	%mm0, %eax
	adc	un, %eax
	add	%ebx, 16(rp)
	adc	un, %eax
	mov	%eax, 20(rp)

	decl	vn
	jnz	L(ol2)
C	jmp	L(done)

C ================================================================
C Common exit: leave MMX state and restore the registers pushed on entry.
L(done):
	emms
	pop	%ebp
	pop	%ebx
	pop	%esi
	pop	%edi
	ret
EPILOGUE()