dnl  mpn_sqr_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).

dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C TODO:
C  * Improve ad-hoc outer loop code and register handling.  Some feed-in
C    scheduling could improve things by several cycles per outer iteration.
C  * In the Lam3...Lam1 code, keep accumulation operands in registers, without
C    storing intermediates to rp.
C  * We might want to keep 32 in a free mm register, since the register form is
C    3 bytes and the immediate form is 4 bytes.  About 80 bytes to save.
C  * Look into different loop alignment, we now expand the code about 50 bytes
C    with possibly needless alignment.
C  * Use OSP, should solve feed-in latency problems.
C  * Address relative slowness for un<=3 for Pentium M.  The old code is
C    considerably faster there.  (1:20/14, 2:34:32, 3:66/57)

C INPUT PARAMETERS
C rp		sp + 4
C up		sp + 8
C un		sp + 12

C mpn_sqr_basecase (rp, up, un)
C
C Writes the 2*un limb square of the un limb operand at up to rp.  Limbs are
C 32 bits wide.  Everything except the un = 1 path works in MMX registers with
C pmuludq and ends with emms.
C
C Outline:
C  * un = 1 and un = 2 are handled by dedicated straight-line code.
C  * For un >= 3 the cross products up[i]*up[j], i < j, are accumulated at
C    rp[1...] one row at a time.  The first row (multiplier up[0]) is stored
C    directly, every later row is added to the limbs already stored.
C  * The diagonal pass at the end doubles every stored limb and adds in the
C    squares up[i]^2, propagating carries; it also writes rp[0] and
C    rp[2*un-1].
C
C NOTE(review): un = 0 takes the same branch as un = 1 (jc below), so callers
C presumably guarantee un >= 1 -- confirm against the mpn interface rules.

	TEXT
	ALIGN(16)
PROLOGUE(mpn_sqr_basecase)
	mov	4(%esp), %edx		C rp
	mov	8(%esp), %eax		C up
	mov	12(%esp), %ecx		C un

C Dispatch on un: 1, 2, 3, 4 have their own code, anything larger goes to
C L(big).  Each cmp feeds two branches (below / equal).
	cmp	$2, %ecx
	jc	L(un1)
	jz	L(un2)
	cmp	$4, %ecx
	jc	L(un3)
	jz	L(un4)
	jmp	L(big)

C un = 1: plain integer code, rp[0..1] = up[0]^2.  rp is moved to ecx first
C because mul overwrites edx with the high half of the product.
L(un1):	mov	(%eax), %eax
	mov	%edx, %ecx
	mul	%eax
	mov	%eax, (%ecx)
	mov	%edx, 4(%ecx)
	ret

C un = 2: rp[0..3] = up[0]^2 + 2*up[0]*up[1]*2^32 + up[1]^2*2^64.  The cross
C product is split with a 31-bit mask: the low 31 bits are shifted left by one
C and the remaining bits are shifted right by 31, which doubles the product
C without needing more than 64 bits in any register.
L(un2):	movd	(%eax), %mm0		C un=2
	movd	(%eax), %mm2		C un=2
	movd	4(%eax), %mm1		C un=2
	pmuludq	%mm0, %mm0		C 64b weight 0		un=2
	pmuludq	%mm1, %mm2		C 64b weight 32		un=2
	pmuludq	%mm1, %mm1		C 64b weight 64		un=2
	movd	%mm0, (%edx)		C un=2
	psrlq	$32, %mm0		C 32b weight 32		un=2
	pcmpeqd	%mm7, %mm7		C un=2
	psrlq	$33, %mm7		C 0x000000007FFFFFFF	un=2
	pand	%mm2, %mm7		C 31b weight 32		un=2
	psrlq	$31, %mm2		C 33b weight 65		un=2
	psllq	$1, %mm7		C 31b weight 33		un=2
	paddq	%mm7, %mm0		C un=2
	movd	%mm0, 4(%edx)		C un=2
	psrlq	$32, %mm0		C un=2
	paddq	%mm2, %mm1		C un=2
	paddq	%mm0, %mm1		C un=2
	movd	%mm1, 8(%edx)		C un=2
	psrlq	$32, %mm1		C un=2
	movd	%mm1, 12(%edx)		C un=2
	emms
	ret

C un = 3: store the first row up[0]*up[1..2] at rp[1..3], advance both
C pointers by one limb and let L(am1) add the remaining product up[1]*up[2]
C before falling into the diagonal pass.
L(un3):	movd	(%eax), %mm7		C un=3
	movd	4(%eax), %mm6		C un=3
	pmuludq	%mm7, %mm6		C un=3
	movd	8(%eax), %mm2		C un=3
	pmuludq	%mm7, %mm2		C un=3
	movd	%mm6, 4(%edx)		C un=3
	psrlq	$32, %mm6		C un=3
	paddq	%mm2, %mm6		C un=3
	movd	%mm6, 8(%edx)		C un=3
	psrlq	$32, %mm6		C un=3
	movd	%mm6, 12(%edx)		C un=3
	lea	4(%edx), %edx		C un=3
	lea	4(%eax), %eax		C un=3
	jmp	L(am1)

C un = 4: store the first row up[0]*up[1..3] at rp[1..4], advance both
C pointers by one limb and let L(am2) and L(am1) add the two remaining rows.
L(un4):	movd	(%eax), %mm7		C un=4
	movd	4(%eax), %mm6		C un=4
	pmuludq	%mm7, %mm6		C un=4
	movd	8(%eax), %mm0		C un=4
	pmuludq	%mm7, %mm0		C un=4
	movd	12(%eax), %mm1		C un=4
	pmuludq	%mm7, %mm1		C un=4
	movd	%mm6, 4(%edx)		C un=4
	psrlq	$32, %mm6		C un=4
	paddq	%mm0, %mm6		C un=4
	movd	%mm6, 8(%edx)		C un=4
	psrlq	$32, %mm6		C un=4
	paddq	%mm1, %mm6		C un=4
	movd	%mm6, 12(%edx)		C un=4
	psrlq	$32, %mm6		C un=4
	movd	%mm6, 16(%edx)		C un=4
	lea	4(%edx), %edx		C un=4
	lea	4(%eax), %eax		C un=4
	jmp	L(am2)

C un >= 5.  Register roles from here until the pops before L(am3):
C   esi   up pointer of the current row, advanced one limb per row
C   edi   rp pointer of the current row, advanced two limbs per row
C   eax   running up pointer inside a row (up2 in the comments)
C   edx   running rp pointer inside a row (rp2 in the comments)
C   ebx   un - 4 initially, reduced by 4 at the end of each L(outer) round;
C	  copied to ecx as the inner loop count of every row
C   mm7   multiplier limb of the current row
C   mm6   running carry / accumulator
C The first row up[0]*up[1..un-1] is produced by one of the four entry points
C L(0m)...L(3m), selected by un mod 4.  Each of them ends by jumping into the
C tail (L(0)...L(3)) of one of the addmul passes further down.
L(big):	push	%esi
	push	%ebx
	push	%edi
	pxor	%mm6, %mm6
	movd	(%eax), %mm7		C
	lea	4(%eax), %esi		C init up, up++
	lea	4(%eax), %eax		C up2++  FIXME: should fix offsets
	lea	4(%edx), %edi		C init rp, rp++
	lea	4(%edx), %edx		C rp2++
	lea	-4(%ecx), %ebx		C loop count
	and	$3, %ecx
	jz	L(3m)			C un mod 4 = 0
	cmp	$2, %ecx
	ja	L(2m)			C un mod 4 = 3
	jb	L(0m)			C un mod 4 = 1
					C un mod 4 = 2 falls through

L(1m):
	movd	(%eax), %mm4		C m 1
	lea	(%ebx), %ecx		C inner loop count	m 1
	pmuludq	%mm7, %mm4		C m 1
	movd	4(%eax), %mm3		C m 1
	pmuludq	%mm7, %mm3		C m 1
	movd	8(%eax), %mm0		C m 1
	jmp	L(m01)			C m 1
	ALIGN(16)			C m 1
L(lpm1):
	pmuludq	%mm7, %mm4		C m 1
	paddq	%mm0, %mm6		C m 1
	movd	4(%eax), %mm3		C m 1
	movd	%mm6, -8(%edx)		C m 1
	psrlq	$32, %mm6		C m 1
	pmuludq	%mm7, %mm3		C m 1
	paddq	%mm1, %mm6		C m 1
	movd	8(%eax), %mm0		C m 1
	movd	%mm6, -4(%edx)		C m 1
	psrlq	$32, %mm6		C m 1
L(m01):	pmuludq	%mm7, %mm0		C m 1
	paddq	%mm4, %mm6		C m 1
	movd	12(%eax), %mm1		C m 1
	movd	%mm6, (%edx)		C m 1
	psrlq	$32, %mm6		C m 1
	pmuludq	%mm7, %mm1		C m 1
	paddq	%mm3, %mm6		C m 1
	movd	16(%eax), %mm4		C m 1
	movd	%mm6, 4(%edx)		C m 1
	psrlq	$32, %mm6		C m 1
	lea	16(%eax), %eax		C m 1
	lea	16(%edx), %edx		C m 1
	sub	$4, %ecx		C m 1
	ja	L(lpm1)			C m 1
	pmuludq	%mm7, %mm4		C m 1
	paddq	%mm0, %mm6		C m 1
	movd	%mm6, -8(%edx)		C m 1
	psrlq	$32, %mm6		C m 1
	paddq	%mm1, %mm6		C m 1
	jmp	L(0)

L(2m):
	movd	(%eax), %mm1		C m 2
	lea	(%ebx), %ecx		C inner loop count	m 2
	pmuludq	%mm7, %mm1		C m 2
	movd	4(%eax), %mm4		C m 2
	pmuludq	%mm7, %mm4		C m 2
	movd	8(%eax), %mm3		C m 2
	jmp	L(m10)			C m 2
	ALIGN(16)			C m 2
L(lpm2):
	pmuludq	%mm7, %mm4		C m 2
	paddq	%mm0, %mm6		C m 2
	movd	8(%eax), %mm3		C m 2
	movd	%mm6, -4(%edx)		C m 2
	psrlq	$32, %mm6		C m 2
L(m10):	pmuludq	%mm7, %mm3		C m 2
	paddq	%mm1, %mm6		C m 2
	movd	12(%eax), %mm0		C m 2
	movd	%mm6, (%edx)		C m 2
	psrlq	$32, %mm6		C m 2
	pmuludq	%mm7, %mm0		C m 2
	paddq	%mm4, %mm6		C m 2
	movd	16(%eax), %mm1		C m 2
	movd	%mm6, 4(%edx)		C m 2
	psrlq	$32, %mm6		C m 2
	pmuludq	%mm7, %mm1		C m 2
	paddq	%mm3, %mm6		C m 2
	movd	20(%eax), %mm4		C m 2
	movd	%mm6, 8(%edx)		C m 2
	psrlq	$32, %mm6		C m 2
	lea	16(%eax), %eax		C m 2
	lea	16(%edx), %edx		C m 2
	sub	$4, %ecx		C m 2
	ja	L(lpm2)			C m 2
	pmuludq	%mm7, %mm4		C m 2
	paddq	%mm0, %mm6		C m 2
	movd	%mm6, -4(%edx)		C m 2
	psrlq	$32, %mm6		C m 2
	paddq	%mm1, %mm6		C m 2
	jmp	L(1)

L(3m):
	movd	(%eax), %mm0		C m 3
	lea	(%ebx), %ecx		C inner loop count	m 3
	pmuludq	%mm7, %mm0		C m 3
	movd	4(%eax), %mm1		C m 3
	pmuludq	%mm7, %mm1		C m 3
	movd	8(%eax), %mm4		C m 3
	jmp	L(lpm3)			C m 3
	ALIGN(16)			C m 3
L(lpm3):
	pmuludq	%mm7, %mm4		C m 3
	paddq	%mm0, %mm6		C m 3
	movd	12(%eax), %mm3		C m 3
	movd	%mm6, (%edx)		C m 3
	psrlq	$32, %mm6		C m 3
	pmuludq	%mm7, %mm3		C m 3
	paddq	%mm1, %mm6		C m 3
	movd	16(%eax), %mm0		C m 3
	movd	%mm6, 4(%edx)		C m 3
	psrlq	$32, %mm6		C m 3
	pmuludq	%mm7, %mm0		C m 3
	paddq	%mm4, %mm6		C m 3
	movd	20(%eax), %mm1		C m 3
	movd	%mm6, 8(%edx)		C m 3
	psrlq	$32, %mm6		C m 3
	pmuludq	%mm7, %mm1		C m 3
	paddq	%mm3, %mm6		C m 3
	movd	24(%eax), %mm4		C m 3
	movd	%mm6, 12(%edx)		C m 3
	psrlq	$32, %mm6		C m 3
	lea	16(%eax), %eax		C m 3
	lea	16(%edx), %edx		C m 3
	sub	$4, %ecx		C m 3
	ja	L(lpm3)			C m 3
	pmuludq	%mm7, %mm4		C m 3
	paddq	%mm0, %mm6		C m 3
	movd	%mm6, (%edx)		C m 3
	psrlq	$32, %mm6		C m 3
	paddq	%mm1, %mm6		C m 3
	jmp	L(2)

L(0m):
	movd	(%eax), %mm3		C m 0
	lea	(%ebx), %ecx		C inner loop count	m 0
	pmuludq	%mm7, %mm3		C m 0
	movd	4(%eax), %mm0		C m 0
	pmuludq	%mm7, %mm0		C m 0
	movd	8(%eax), %mm1		C m 0
	jmp	L(m00)			C m 0
	ALIGN(16)			C m 0
L(lpm0):
	pmuludq	%mm7, %mm4		C m 0
	paddq	%mm0, %mm6		C m 0
	movd	(%eax), %mm3		C m 0
	movd	%mm6, -12(%edx)		C m 0
	psrlq	$32, %mm6		C m 0
	pmuludq	%mm7, %mm3		C m 0
	paddq	%mm1, %mm6		C m 0
	movd	4(%eax), %mm0		C m 0
	movd	%mm6, -8(%edx)		C m 0
	psrlq	$32, %mm6		C m 0
	pmuludq	%mm7, %mm0		C m 0
	paddq	%mm4, %mm6		C m 0
	movd	8(%eax), %mm1		C m 0
	movd	%mm6, -4(%edx)		C m 0
	psrlq	$32, %mm6		C m 0
L(m00):	pmuludq	%mm7, %mm1		C m 0
	paddq	%mm3, %mm6		C m 0
	movd	12(%eax), %mm4		C m 0
	movd	%mm6, (%edx)		C m 0
	psrlq	$32, %mm6		C m 0
	lea	16(%eax), %eax		C m 0
	lea	16(%edx), %edx		C m 0
	sub	$4, %ecx		C m 0
	ja	L(lpm0)			C m 0
	pmuludq	%mm7, %mm4		C m 0
	paddq	%mm0, %mm6		C m 0
	movd	%mm6, -12(%edx)		C m 0
	psrlq	$32, %mm6		C m 0
	paddq	%mm1, %mm6		C m 0
	jmp	L(3)

C Addmul passes.  Each pass loads the next multiplier limb from (esi) into
C mm7, steps rp by two limbs and up by one limb, and adds mm7 * up2[...] to
C the limbs already stored at rp2.  The four copies (am 3, am 2, am 1, am 0)
C enter the 4-way unrolled loop at different points; control falls from one
C copy into the next, and only the last copy updates ebx and branches back
C to L(outer).
L(outer):
	lea	8(%edi), %edi		C rp += 2
	movd	(%esi), %mm7		C am 3
	mov	%edi, %edx		C rp2 = rp		am 3
	lea	4(%esi), %esi		C up++			am 3
	lea	(%esi), %eax		C up2 = up		am 3
	movd	(%eax), %mm0		C am 3
	lea	(%ebx), %ecx		C inner loop count	am 3
	pxor	%mm6, %mm6		C am 3
	pmuludq	%mm7, %mm0		C am 3
	movd	4(%eax), %mm1		C am 3
	movd	(%edx), %mm4		C am 3
	pmuludq	%mm7, %mm1		C am 3
	movd	8(%eax), %mm2		C am 3
	paddq	%mm0, %mm4		C am 3
	movd	4(%edx), %mm5		C am 3
	jmp	L(lam3)			C am 3
	ALIGN(16)			C am 3
L(lam3):
	pmuludq	%mm7, %mm2		C am 3
	paddq	%mm4, %mm6		C am 3
	movd	12(%eax), %mm3		C am 3
	paddq	%mm1, %mm5		C am 3
	movd	8(%edx), %mm4		C am 3
	movd	%mm6, (%edx)		C am 3
	psrlq	$32, %mm6		C am 3
	pmuludq	%mm7, %mm3		C am 3
	paddq	%mm5, %mm6		C am 3
	movd	16(%eax), %mm0		C am 3
	paddq	%mm2, %mm4		C am 3
	movd	12(%edx), %mm5		C am 3
	movd	%mm6, 4(%edx)		C am 3
	psrlq	$32, %mm6		C am 3
	pmuludq	%mm7, %mm0		C am 3
	paddq	%mm4, %mm6		C am 3
	movd	20(%eax), %mm1		C am 3
	paddq	%mm3, %mm5		C am 3
	movd	16(%edx), %mm4		C am 3
	movd	%mm6, 8(%edx)		C am 3
	psrlq	$32, %mm6		C am 3
	pmuludq	%mm7, %mm1		C am 3
	paddq	%mm5, %mm6		C am 3
	movd	24(%eax), %mm2		C am 3
	paddq	%mm0, %mm4		C am 3
	movd	20(%edx), %mm5		C am 3
	movd	%mm6, 12(%edx)		C am 3
	psrlq	$32, %mm6		C am 3
	lea	16(%eax), %eax		C am 3
	lea	16(%edx), %edx		C am 3
	sub	$4, %ecx		C am 3
	ja	L(lam3)			C am 3
	pmuludq	%mm7, %mm2		C am 3
	paddq	%mm4, %mm6		C am 3
	paddq	%mm1, %mm5		C am 3
	movd	8(%edx), %mm4		C am 3
	movd	%mm6, (%edx)		C am 3
	psrlq	$32, %mm6		C am 3
	paddq	%mm5, %mm6		C am 3
	paddq	%mm2, %mm4		C am 3
L(2):	movd	%mm6, 4(%edx)		C am 3
	psrlq	$32, %mm6		C am 3
	paddq	%mm4, %mm6		C am 3
	movd	%mm6, 8(%edx)		C am 3
	psrlq	$32, %mm6		C am 3
	movd	%mm6, 12(%edx)		C am 3

	lea	8(%edi), %edi		C rp += 2
	movd	(%esi), %mm7		C am 2
	mov	%edi, %edx		C rp2 = rp		am 2
	lea	4(%esi), %esi		C up++			am 2
	lea	(%esi), %eax		C up2 = up		am 2
	movd	(%eax), %mm1		C am 2
	lea	(%ebx), %ecx		C inner loop count	am 2
	pxor	%mm6, %mm6		C am 2
	pmuludq	%mm7, %mm1		C am 2
	movd	4(%eax), %mm2		C am 2
	movd	(%edx), %mm5		C am 2
	pmuludq	%mm7, %mm2		C am 2
	movd	8(%eax), %mm3		C am 2
	paddq	%mm1, %mm5		C am 2
	movd	4(%edx), %mm4		C am 2
	jmp	L(am10)			C am 2
	ALIGN(16)			C am 2
L(lam2):
	pmuludq	%mm7, %mm2		C am 2
	paddq	%mm4, %mm6		C am 2
	movd	8(%eax), %mm3		C am 2
	paddq	%mm1, %mm5		C am 2
	movd	4(%edx), %mm4		C am 2
	movd	%mm6, -4(%edx)		C am 2
	psrlq	$32, %mm6		C am 2
L(am10):
	pmuludq	%mm7, %mm3		C am 2
	paddq	%mm5, %mm6		C am 2
	movd	12(%eax), %mm0		C am 2
	paddq	%mm2, %mm4		C am 2
	movd	8(%edx), %mm5		C am 2
	movd	%mm6, (%edx)		C am 2
	psrlq	$32, %mm6		C am 2
	pmuludq	%mm7, %mm0		C am 2
	paddq	%mm4, %mm6		C am 2
	movd	16(%eax), %mm1		C am 2
	paddq	%mm3, %mm5		C am 2
	movd	12(%edx), %mm4		C am 2
	movd	%mm6, 4(%edx)		C am 2
	psrlq	$32, %mm6		C am 2
	pmuludq	%mm7, %mm1		C am 2
	paddq	%mm5, %mm6		C am 2
	movd	20(%eax), %mm2		C am 2
	paddq	%mm0, %mm4		C am 2
	movd	16(%edx), %mm5		C am 2
	movd	%mm6, 8(%edx)		C am 2
	psrlq	$32, %mm6		C am 2
	lea	16(%eax), %eax		C am 2
	lea	16(%edx), %edx		C am 2
	sub	$4, %ecx		C am 2
	ja	L(lam2)			C am 2
	pmuludq	%mm7, %mm2		C am 2
	paddq	%mm4, %mm6		C am 2
	paddq	%mm1, %mm5		C am 2
	movd	4(%edx), %mm4		C am 2
	movd	%mm6, -4(%edx)		C am 2
	psrlq	$32, %mm6		C am 2
	paddq	%mm5, %mm6		C am 2
	paddq	%mm2, %mm4		C am 2
L(1):	movd	%mm6, (%edx)		C am 2
	psrlq	$32, %mm6		C am 2
	paddq	%mm4, %mm6		C am 2
	movd	%mm6, 4(%edx)		C am 2
	psrlq	$32, %mm6		C am 2
	movd	%mm6, 8(%edx)		C am 2

	lea	8(%edi), %edi		C rp += 2
	movd	(%esi), %mm7		C am 1
	mov	%edi, %edx		C rp2 = rp		am 1
	lea	4(%esi), %esi		C up++			am 1
	lea	(%esi), %eax		C up2 = up		am 1
	movd	(%eax), %mm2		C am 1
	lea	(%ebx), %ecx		C inner loop count	am 1
	pxor	%mm6, %mm6		C am 1
	pmuludq	%mm7, %mm2		C am 1
	movd	4(%eax), %mm3		C am 1
	movd	(%edx), %mm4		C am 1
	pmuludq	%mm7, %mm3		C am 1
	movd	8(%eax), %mm0		C am 1
	paddq	%mm2, %mm4		C am 1
	movd	4(%edx), %mm5		C am 1
	jmp	L(am01)			C am 1
	ALIGN(16)			C am 1
L(lam1):
	pmuludq	%mm7, %mm2		C am 1
	paddq	%mm4, %mm6		C am 1
	movd	4(%eax), %mm3		C am 1
	paddq	%mm1, %mm5		C am 1
	movd	(%edx), %mm4		C am 1
	movd	%mm6, -8(%edx)		C am 1
	psrlq	$32, %mm6		C am 1
	pmuludq	%mm7, %mm3		C am 1
	paddq	%mm5, %mm6		C am 1
	movd	8(%eax), %mm0		C am 1
	paddq	%mm2, %mm4		C am 1
	movd	4(%edx), %mm5		C am 1
	movd	%mm6, -4(%edx)		C am 1
	psrlq	$32, %mm6		C am 1
L(am01):
	pmuludq	%mm7, %mm0		C am 1
	paddq	%mm4, %mm6		C am 1
	movd	12(%eax), %mm1		C am 1
	paddq	%mm3, %mm5		C am 1
	movd	8(%edx), %mm4		C am 1
	movd	%mm6, (%edx)		C am 1
	psrlq	$32, %mm6		C am 1
	pmuludq	%mm7, %mm1		C am 1
	paddq	%mm5, %mm6		C am 1
	movd	16(%eax), %mm2		C am 1
	paddq	%mm0, %mm4		C am 1
	movd	12(%edx), %mm5		C am 1
	movd	%mm6, 4(%edx)		C am 1
	psrlq	$32, %mm6		C am 1
	lea	16(%eax), %eax		C am 1
	lea	16(%edx), %edx		C am 1
	sub	$4, %ecx		C am 1
	ja	L(lam1)			C am 1
	pmuludq	%mm7, %mm2		C am 1
	paddq	%mm4, %mm6		C am 1
	paddq	%mm1, %mm5		C am 1
	movd	(%edx), %mm4		C am 1
	movd	%mm6, -8(%edx)		C am 1
	psrlq	$32, %mm6		C am 1
	paddq	%mm5, %mm6		C am 1
	paddq	%mm2, %mm4		C am 1
L(0):	movd	%mm6, -4(%edx)		C am 1
	psrlq	$32, %mm6		C am 1
	paddq	%mm4, %mm6		C am 1
	movd	%mm6, (%edx)		C am 1
	psrlq	$32, %mm6		C am 1
	movd	%mm6, 4(%edx)		C am 1

	lea	8(%edi), %edi		C rp += 2
	movd	(%esi), %mm7		C am 0
	mov	%edi, %edx		C rp2 = rp		am 0
	lea	4(%esi), %esi		C up++			am 0
	lea	(%esi), %eax		C up2 = up		am 0
	movd	(%eax), %mm3		C am 0
	lea	(%ebx), %ecx		C inner loop count	am 0
	pxor	%mm6, %mm6		C am 0
	pmuludq	%mm7, %mm3		C am 0
	movd	4(%eax), %mm0		C am 0
	movd	(%edx), %mm5		C am 0
	pmuludq	%mm7, %mm0		C am 0
	movd	8(%eax), %mm1		C am 0
	paddq	%mm3, %mm5		C am 0
	movd	4(%edx), %mm4		C am 0
	jmp	L(am00)			C am 0
	ALIGN(16)			C am 0
L(lam0):
	pmuludq	%mm7, %mm2		C am 0
	paddq	%mm4, %mm6		C am 0
	movd	(%eax), %mm3		C am 0
	paddq	%mm1, %mm5		C am 0
	movd	-4(%edx), %mm4		C am 0
	movd	%mm6, -12(%edx)		C am 0
	psrlq	$32, %mm6		C am 0
	pmuludq	%mm7, %mm3		C am 0
	paddq	%mm5, %mm6		C am 0
	movd	4(%eax), %mm0		C am 0
	paddq	%mm2, %mm4		C am 0
	movd	(%edx), %mm5		C am 0
	movd	%mm6, -8(%edx)		C am 0
	psrlq	$32, %mm6		C am 0
	pmuludq	%mm7, %mm0		C am 0
	paddq	%mm4, %mm6		C am 0
	movd	8(%eax), %mm1		C am 0
	paddq	%mm3, %mm5		C am 0
	movd	4(%edx), %mm4		C am 0
	movd	%mm6, -4(%edx)		C am 0
	psrlq	$32, %mm6		C am 0
L(am00):
	pmuludq	%mm7, %mm1		C am 0
	paddq	%mm5, %mm6		C am 0
	movd	12(%eax), %mm2		C am 0
	paddq	%mm0, %mm4		C am 0
	movd	8(%edx), %mm5		C am 0
	movd	%mm6, (%edx)		C am 0
	psrlq	$32, %mm6		C am 0
	lea	16(%eax), %eax		C am 0
	lea	16(%edx), %edx		C am 0
	sub	$4, %ecx		C am 0
	ja	L(lam0)			C am 0
	pmuludq	%mm7, %mm2		C am 0
	paddq	%mm4, %mm6		C am 0
	paddq	%mm1, %mm5		C am 0
	movd	-4(%edx), %mm4		C am 0
	movd	%mm6, -12(%edx)		C am 0
	psrlq	$32, %mm6		C am 0
	paddq	%mm5, %mm6		C am 0
	paddq	%mm2, %mm4		C am 0
L(3):	movd	%mm6, -8(%edx)		C am 0
	psrlq	$32, %mm6		C am 0
	paddq	%mm4, %mm6		C am 0
	movd	%mm6, -4(%edx)		C am 0
	psrlq	$32, %mm6		C am 0
	movd	%mm6, (%edx)		C am 0
	sub	$4, %ebx		C am 0
	ja	L(outer)		C am 0

C Outer loop finished.  Hand the current row pointers to the straight-line
C tail code and restore the registers pushed at L(big).  After the pops the
C stack pointer is back at its entry value.
	mov	%edi, %edx
	mov	%esi, %eax
	pop	%edi
	pop	%ebx
	pop	%esi

C Tail rows with three, two and one product, each adding into the limbs at
C rp2.  L(un4) enters at L(am2) and L(un3) enters at L(am1).
L(am3):				C up[un-1..un-3] x up[un-4]
	lea	8(%edx), %edx		C rp2 += 2
	movd	(%eax), %mm7
	movd	4(%eax), %mm1
	movd	8(%eax), %mm2
	movd	12(%eax), %mm3
	movd	(%edx), %mm4
	pmuludq	%mm7, %mm1
	movd	4(%edx), %mm5
	pmuludq	%mm7, %mm2
	movd	8(%edx), %mm6
	pmuludq	%mm7, %mm3
	paddq	%mm1, %mm4
	paddq	%mm2, %mm5
	paddq	%mm3, %mm6
	movd	%mm4, (%edx)
	psrlq	$32, %mm4
	paddq	%mm5, %mm4
	movd	%mm4, 4(%edx)
	psrlq	$32, %mm4
	paddq	%mm6, %mm4
	movd	%mm4, 8(%edx)
	psrlq	$32, %mm4
	movd	%mm4, 12(%edx)		C FIXME feed through!
	lea	4(%eax), %eax

L(am2):				C up[un-1..un-2] x up[un-3]
	lea	8(%edx), %edx		C rp2 += 2
	movd	(%eax), %mm7
	movd	4(%eax), %mm1
	movd	8(%eax), %mm2
	movd	(%edx), %mm4
	movd	4(%edx), %mm5
	pmuludq	%mm7, %mm1
	pmuludq	%mm7, %mm2
	paddq	%mm1, %mm4
	paddq	%mm2, %mm5
	movd	%mm4, (%edx)
	psrlq	$32, %mm4
	paddq	%mm5, %mm4
	movd	%mm4, 4(%edx)
	psrlq	$32, %mm4
	movd	%mm4, 8(%edx)		C FIXME feed through!
	lea	4(%eax), %eax

L(am1):				C up[un-1] x up[un-2]
	lea	8(%edx), %edx		C rp2 += 2
	movd	(%eax), %mm7
	movd	4(%eax), %mm2
	movd	(%edx), %mm4
	pmuludq	%mm7, %mm2
	paddq	%mm2, %mm4
	movd	%mm4, (%edx)
	psrlq	$32, %mm4
	movd	%mm4, 4(%edx)

C *** diag stuff, use elementary code for now
C
C Reached for every un >= 3.  The arguments are reloaded from the stack with
C the same offsets as at entry.  Every limb stored so far is doubled and the
C squares of the source limbs are added in: mm2 carries from limb to limb and
C mm7 holds the mask 0x00000000FFFFFFFF used to split each square into its
C low and high limb.

	mov	4(%esp), %edx		C rp
	mov	8(%esp), %eax		C up
	mov	12(%esp), %ecx		C un

	movd	(%eax), %mm2
	pmuludq	%mm2, %mm2		C src[0]^2

	pcmpeqd	%mm7, %mm7
	psrlq	$32, %mm7

	movd	4(%edx), %mm3		C dst[1]

	movd	%mm2, (%edx)
	psrlq	$32, %mm2

	psllq	$1, %mm3		C 2*dst[1]
	paddq	%mm3, %mm2
	movd	%mm2, 4(%edx)
	psrlq	$32, %mm2

	sub	$2, %ecx		C first and last limb are done outside the loop

L(diag):
	movd	4(%eax), %mm0		C src limb
	add	$4, %eax
	pmuludq	%mm0, %mm0
	movq	%mm7, %mm1
	pand	%mm0, %mm1		C diagonal low
	psrlq	$32, %mm0		C diagonal high

	movd	8(%edx), %mm3
	psllq	$1, %mm3		C 2*dst[i]
	paddq	%mm3, %mm1
	paddq	%mm1, %mm2
	movd	%mm2, 8(%edx)
	psrlq	$32, %mm2

	movd	12(%edx), %mm3
	psllq	$1, %mm3		C 2*dst[i+1]
	paddq	%mm3, %mm0
	paddq	%mm0, %mm2
	movd	%mm2, 12(%edx)
	add	$8, %edx
	psrlq	$32, %mm2

	sub	$1, %ecx
	jnz	L(diag)

C Last source limb: only its low half meets a stored limb to be doubled, the
C high half plus carry becomes the top limb of the result.
	movd	4(%eax), %mm0		C src[size-1]
	pmuludq	%mm0, %mm0
	pand	%mm0, %mm7		C diagonal low
	psrlq	$32, %mm0		C diagonal high

	movd	8(%edx), %mm3		C dst[2*size-2]
	psllq	$1, %mm3
	paddq	%mm3, %mm7
	paddq	%mm7, %mm2
	movd	%mm2, 8(%edx)
	psrlq	$32, %mm2

	paddq	%mm0, %mm2
	movd	%mm2, 12(%edx)		C dst[2*size-1]

	emms
	ret

EPILOGUE()