;******************************************************************************
;* 36 point SSE-optimized IMDCT transform
;* Copyright (c) 2011 Vitor Sessak
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; Syntax: NASM/Yasm (Intel operand order).  Function prologues/epilogues and
; the m0..m15 / argument-name register aliases come from the cglobal / RET
; macros of x86inc.asm, so the calling convention is handled there.

%include "libavutil/x86/x86inc.asm"
%include "libavutil/x86/x86util.asm"

SECTION_RODATA

align 16
; Lane masks for andps: ~0 keeps a lane, 0 clears it.
ps_mask:  dd 0, ~0, ~0, ~0
ps_mask2: dd 0, ~0,  0, ~0
ps_mask3: dd 0,  0,  0, ~0
ps_mask4: dd 0, ~0,  0,  0

; Per-lane multipliers used while populating tmp[] in imdct36_float.
ps_val1:  dd          -0.5,          -0.5, -0.8660254038, -0.8660254038
ps_val2:  dd           1.0,           1.0,  0.8660254038,  0.8660254038
ps_val3:  dd  0.1736481777,  0.1736481777,  0.3420201433,  0.3420201433
ps_val4:  dd -0.7660444431, -0.7660444431,  0.8660254038,  0.8660254038
ps_val5:  dd -0.9396926208, -0.9396926208, -0.9848077530, -0.9848077530
ps_val6:  dd           0.5,           0.5, -0.6427876097, -0.6427876097
ps_val7:  dd           1.0,           1.0, -0.6427876097, -0.6427876097

; Sign-flip masks for xorps (0x80000000 is the float sign bit).
ps_p1p1m1m1: dd 0,          0, 0x80000000, 0x80000000
ps_p1m1p1m1: dd 0, 0x80000000,          0, 0x80000000

; Multiplier rows for BUTTERF, one 16-byte row per call (byte offsets
; 0/16/32/48); row 64 is used directly by imdct36_float.
ps_cosh:  dd 1.0, 0.50190991877167369479,  1.0,  5.73685662283492756461
          dd 1.0, 0.51763809020504152469,  1.0,  1.93185165257813657349
          dd 1.0, 0.55168895948124587824, -1.0, -1.18310079157624925896
          dd 1.0, 0.61038729438072803416, -1.0, -0.87172339781054900991
          dd 1.0, 0.70710678118654752439,  0.0,  0.0

; Same as ps_cosh but with lanes 1 and 3 of the first four rows negated; used
; by the addsubps (SSE3) path of BUTTERF, which skips the explicit sign flip.
ps_cosh_sse3: dd 1.0, -0.50190991877167369479,  1.0, -5.73685662283492756461
              dd 1.0, -0.51763809020504152469,  1.0, -1.93185165257813657349
              dd 1.0, -0.55168895948124587824, -1.0,  1.18310079157624925896
              dd 1.0, -0.61038729438072803416, -1.0,  0.87172339781054900991
              dd 1.0,  0.70710678118654752439,  0.0,  0.0

; Constants for four_imdct36_float, each replicated across all four lanes.
; Row n is addressed as [costabs + 16*n].
costabs:  times 4 dd  0.98480773
          times 4 dd  0.93969262
          times 4 dd  0.86602539
          times 4 dd -0.76604444
          times 4 dd -0.64278764
          times 4 dd  0.50000000
          times 4 dd -0.50000000
          times 4 dd -0.34202015
          times 4 dd -0.17364818
          times 4 dd  0.50190992
          times 4 dd  0.51763808
          times 4 dd  0.55168896
          times 4 dd  0.61038726
          times 4 dd  0.70710677
          times 4 dd  0.87172341
          times 4 dd  1.18310082
          times 4 dd  1.93185163
          times 4 dd  5.73685646

%define SBLIMIT 32
SECTION_TEXT

; PSHUFD dst, src, imm8
; dst = lanes of src permuted by imm8.  Uses the integer pshufd when SSE2 is
; available (and AVX is not), otherwise shufps with src as both inputs.
%macro PSHUFD 3
%if cpuflag(sse2) && notcpuflag(avx)
    pshufd %1, %2, %3
%else
    shufps %1, %2, %2, %3
%endif
%endmacro

; input  %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
; output %1={x3,x4,y1,y2}
%macro BUILDINVHIGHLOW 3
%if cpuflag(avx)
    shufps %1, %2, %3, 0x4e
%else
    movlhps %1, %3
    movhlps %1, %2
%endif
%endmacro

; input  %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
; output %1={x4,y1,y2,y3}
%macro ROTLEFT 3
%if cpuflag(ssse3)
    palignr  %1, %3, %2, 12
%else
    BUILDINVHIGHLOW %1, %2, %3
    shufps  %1, %1, %3, 0x99
%endif
%endmacro

; INVERTHL dst, src
; input  %2={x1,x2,x3,x4}
; output %1={x3,x4,x1,x2}  (high and low halves exchanged)
%macro INVERTHL 2
%if cpuflag(sse2)
    PSHUFD  %1, %2, 0x4e
%else
    movhlps %1, %2
    movlhps %1, %2
%endif
%endmacro

; BUTTERF reg, tmp, offset
;   %1 = register transformed in place
;   %2 = scratch register (clobbered)
;   %3 = byte offset of the multiplier row in ps_cosh / ps_cosh_sse3
; Adds the half-swapped copy of %1 to %1 with lanes 2,3 negated, scales by the
; selected row, then combines each lane with its pair-swapped neighbour
; (addsubps on SSE3+, sign flip + addps otherwise).
%macro BUTTERF 3
    INVERTHL %2, %1
    xorps    %1, [ps_p1p1m1m1]
    addps    %1, %2
%if cpuflag(sse3)
    mulps    %1, %1, [ps_cosh_sse3 + %3]
    PSHUFD   %2, %1, 0xb1
    addsubps %1, %1, %2
%else
    mulps    %1, [ps_cosh + %3]
    PSHUFD   %2, %1, 0xb1
    xorps    %1, [ps_p1m1p1m1]
    addps    %1, %2
%endif
%endmacro

; STORE src, tmp, base, stride
; Scatter: writes lane i of %1 as a single float to [%3 + i*%4], i = 0..3.
; Clobbers %2; %1 is left with lanes swapped pairwise.
%macro STORE 4
    movhlps %2, %1
    movss   [%3       ], %1
    movss   [%3 + 2*%4], %2
    shufps  %1, %1, 0xb1
    movss   [%3 +   %4], %1
    movhlps %2, %1
    movss   [%3 + 3*%4], %2
%endmacro

; LOAD dst, tmp, base, stride
; Gather: lane i of %1 = float at [%3 + i*%4], i = 0..3.
; Each access reads 8 bytes (the second float is discarded).  Clobbers %2.
%macro LOAD 4
    movlps  %1, [%3       ]
    movhps  %1, [%3 +   %4]
    movlps  %2, [%3 + 2*%4]
    movhps  %2, [%3 + 3*%4]
    shufps  %1, %2, 0x88
%endmacro

; LOADA64 dst, addr
; Loads 16 bytes from %2: one unaligned load on AVX, otherwise two 8-byte
; halves (presumably because the source is only guaranteed 64-bit aligned).
%macro LOADA64 2
%if cpuflag(avx)
    movu     %1, [%2]
%else
    movlps   %1, [%2]
    movhps   %1, [%2 + 8]
%endif
%endmacro

; imdct36_float(out, buf, in, win)
;   in  : 18 input floats (bytes 0..71 are read)
;   win : window coefficients; win[0..17] scale the values added to the
;         output, win[20..37] scale the values written back to buf
;   buf : overlap buffer, element k lives at buf[4*k] (16-byte stride);
;         read for the output and then overwritten
;   out : result k is written to out[k*SBLIMIT], k = 0..17
; Uses 4 GPRs and up to 9 XMM registers (m8 only on x86-64, where it keeps a
; copy of an intermediate that the 32-bit path recomputes instead).
%macro DEFINE_IMDCT 0
cglobal imdct36_float, 4,4,9, out, buf, in, win

    ; for(i=17;i>=1;i--) in[i] += in[i-1];
    LOADA64 m0, inq
    LOADA64 m1, inq + 16

    ROTLEFT m5, m0, m1

    PSHUFD  m6, m0, 0x93
    andps   m6, m6, [ps_mask]
    addps   m0, m0, m6

    LOADA64 m2, inq + 32

    ROTLEFT m7, m1, m2

    addps   m1, m1, m5
    LOADA64 m3, inq + 48

    ROTLEFT m5, m2, m3

    xorps   m4, m4, m4
    movlps  m4, [inq+64]
    BUILDINVHIGHLOW m6, m3, m4
    shufps  m6, m6, m4, 0xa9

    addps   m4, m4, m6
    addps   m2, m2, m7
    addps   m3, m3, m5

    ; for(i=17;i>=3;i-=2) in[i] += in[i-2];
    movlhps m5, m5, m0
    andps   m5, m5, [ps_mask3]

    BUILDINVHIGHLOW m7, m0, m1
    andps   m7, m7, [ps_mask2]

    addps   m0, m0, m5

    BUILDINVHIGHLOW m6, m1, m2
    andps   m6, m6, [ps_mask2]

    addps   m1, m1, m7

    BUILDINVHIGHLOW m7, m2, m3
    andps   m7, m7, [ps_mask2]

    addps   m2, m2, m6

    movhlps m6, m6, m3
    andps   m6, m6, [ps_mask4]

    addps   m3, m3, m7
    addps   m4, m4, m6

    ; Populate tmp[]
    movlhps m6, m1, m5    ; zero out high values
    subps   m6, m6, m4

    subps   m5, m0, m3

    ; x86-64: park m0 - m3 in m8 so it does not have to be recomputed below
%ifdef ARCH_X86_64
    SWAP    m5, m8
%endif

    mulps   m7, m2, [ps_val1]

%ifdef ARCH_X86_64
    mulps   m5, m8, [ps_val2]
%else
    mulps   m5, m5, [ps_val2]
%endif
    addps   m7, m7, m5

    mulps   m5, m6, [ps_val1]
    subps   m7, m7, m5

    ; restore m5 = m0 - m3 (from m8 on x86-64, recomputed on x86-32)
%ifdef ARCH_X86_64
    SWAP    m5, m8
%else
    subps   m5, m0, m3
%endif

    subps   m5, m5, m6
    addps   m5, m5, m2

    shufps  m6, m4, m3, 0xe4
    subps   m6, m6, m2
    mulps   m6, m6, [ps_val3]

    addps   m4, m4, m1
    mulps   m4, m4, [ps_val4]

    shufps  m1, m1, m0, 0xe4
    addps   m1, m1, m2
    mulps   m1, m1, [ps_val5]

    mulps   m3, m3, [ps_val6]
    mulps   m0, m0, [ps_val7]
    addps   m0, m0, m3

    xorps   m2, m1, [ps_p1p1m1m1]
    subps   m2, m2, m4
    addps   m2, m2, m0

    addps   m3, m4, m0
    subps   m3, m3, m6
    xorps   m3, m3, [ps_p1p1m1m1]

    shufps  m0, m0, m4, 0xe4
    subps   m0, m0, m1
    addps   m0, m0, m6

    BUILDINVHIGHLOW m4, m2, m3
    shufps  m3, m3, m2, 0x4e

    ; we have tmp = {SWAPLH(m0), SWAPLH(m7), m3, m4, m5}

    BUTTERF m0, m1, 0
    BUTTERF m7, m2, 16
    BUTTERF m3, m6, 32
    BUTTERF m4, m1, 48

    ; last (partial) row: same scheme as BUTTERF but with the 0xe1 shuffle
    mulps   m5, m5, [ps_cosh + 64]
    PSHUFD  m1, m5, 0xe1
    xorps   m5, m5, [ps_p1m1p1m1]
    addps   m5, m5, m1

    ; permutation:
    ; m0    0  1  2  3     =>     2  6 10 14   m1
    ; m7    4  5  6  7     =>     3  7 11 15   m2
    ; m3    8  9 10 11     =>    17 13  9  5   m3
    ; m4   12 13 14 15     =>    16 12  8  4   m5
    ; m5   16 17 xx xx     =>     0  1 xx xx   m0

    unpckhps m1, m0, m7
    unpckhps m6, m3, m4
    movhlps  m2, m6, m1
    movlhps  m1, m1, m6

    unpcklps m5, m5, m4
    unpcklps m3, m3, m7
    movhlps  m4, m3, m5
    movlhps  m5, m5, m3
    SWAP      m4, m3
    ; permutation done

    ; out = value * win + buf, for the first 18 results
    PSHUFD  m6, m2, 0xb1
    movss   m4, [bufq + 4*68]
    movss   m7, [bufq + 4*64]
    unpcklps m7, m7, m4
    mulps   m6, m6, [winq + 16*4]
    addps   m6, m6, m7
    movss   [outq + 64*SBLIMIT], m6
    shufps  m6, m6, m6, 0xb1
    movss   [outq + 68*SBLIMIT], m6

    mulps   m6, m3, [winq + 4*4]
    LOAD    m4, m7, bufq + 4*16, 16
    addps   m6, m6, m4
    STORE   m6, m7, outq + 16*SBLIMIT, 4*SBLIMIT

    shufps  m4, m0, m3, 0xb5
    mulps   m4, m4, [winq + 8*4]
    LOAD    m7, m6, bufq + 4*32, 16
    addps   m4, m4, m7
    STORE   m4, m6, outq + 32*SBLIMIT, 4*SBLIMIT

    shufps  m3, m3, m2, 0xb1
    mulps   m3, m3, [winq + 12*4]
    LOAD    m7, m6, bufq + 4*48, 16
    addps   m3, m3, m7
    STORE   m3, m7, outq + 48*SBLIMIT, 4*SBLIMIT

    mulps   m2, m2, [winq]
    LOAD    m6, m7, bufq, 16
    addps   m2, m2, m6
    STORE   m2, m7, outq, 4*SBLIMIT

    ; buf = value * win[20 + k], written back for later use by the caller
    mulps   m4, m1, [winq + 20*4]
    STORE   m4, m7, bufq, 16

    mulps   m3, m5, [winq + 24*4]
    STORE   m3, m7, bufq + 4*16, 16

    shufps  m0, m0, m5, 0xb0
    mulps   m0, m0, [winq + 28*4]
    STORE   m0, m7, bufq + 4*32, 16

    shufps  m5, m5, m1, 0xb1
    mulps   m5, m5, [winq + 32*4]
    STORE   m5, m7, bufq + 4*48, 16

    shufps  m1, m1, m1, 0xb1
    mulps   m1, m1, [winq + 36*4]
    movss   [bufq + 4*64], m1
    shufps  m1, m1, 0xb1
    movss   [bufq + 4*68], m1
    RET
%endmacro

INIT_XMM sse
DEFINE_IMDCT

INIT_XMM sse2
DEFINE_IMDCT

INIT_XMM sse3
DEFINE_IMDCT

INIT_XMM ssse3
DEFINE_IMDCT

INIT_XMM avx
DEFINE_IMDCT

INIT_XMM sse

; Register spilling for four_imdct36_float.  Slots are numbered 8..15.
;   x86-64: a slot is simply one of the extra registers m8..m15, and
;           SPILL/UNSPILL are register renames (SWAP), i.e. free.
;   x86-32: a slot is 16 bytes of the tmp buffer, starting 32*4 bytes in
;           (just past the part of tmp the function addresses directly).
%ifdef ARCH_X86_64
%define SPILL SWAP
%define UNSPILL SWAP
%define SPILLED(x) m %+ x
%else
%define SPILLED(x) [tmpq+(x-8)*16 + 32*4]
%macro SPILL 2 ; xmm#, mempos
    movaps SPILLED(%2), m%1
%endmacro
%macro UNSPILL 2
    movaps m%1, SPILLED(%2)
%endmacro
%endif

; four_imdct36_float(out, buf, in, win, tmp)
; Processes four 18-float input blocks at once.  Block j starts at byte offset
; 72*j from in; the loads are transposed so that lane j of every register
; holds data belonging to block j.
;   out : result row k (4 floats, one per block) is stored at byte offset
;         k * 4*SBLIMIT, k = 0..17
;   buf : overlap rows, read for the output and then overwritten
;   win : window rows, 4 floats each
;   tmp : scratch memory accessed with aligned moves; bytes 0..127 are used
;         on all targets, and on x86-32 the spill slots occupy bytes 128..255
;
; 16 XMM registers are declared because on x86-64 the SPILL/UNSPILL renames
; make the code write xmm8-xmm15, which are callee-saved on Win64 and must be
; preserved by the prologue/epilogue.  The count has no effect on targets
; where these registers are volatile or do not exist.
%macro DEFINE_FOUR_IMDCT 0
cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp
    ; elements 16 and 17 of each block: m0 = in[16], m5 = in[17] (per lane)
    movlps  m0, [inq+64]
    movhps  m0, [inq+64 + 72]
    movlps  m3, [inq+64 + 2*72]
    movhps  m3, [inq+64 + 3*72]

    shufps  m5, m0, m3, 0xdd
    shufps  m0, m0, m3, 0x88

    ; elements 12..15; blocks 1 and 3 sit at odd multiples of 72 bytes, which
    ; are not multiples of 16, hence movu for those and mova for blocks 0 and 2
    mova     m1, [inq+48]
    movu     m6, [inq+48 + 72]
    mova     m7, [inq+48 + 2*72]
    movu     m3, [inq+48 + 3*72]

    TRANSPOSE4x4PS 1, 6, 7, 3, 4

    addps   m4, m6, m7
    mova     [tmpq+4*28], m4

    addps    m7, m3
    addps    m6, m1
    addps    m3, m0
    addps    m0, m5
    addps    m0, m7
    addps    m7, m6
    mova     [tmpq+4*12], m7
    SPILL    3, 12

    ; elements 8..11
    mova     m4, [inq+32]
    movu     m5, [inq+32 + 72]
    mova     m2, [inq+32 + 2*72]
    movu     m7, [inq+32 + 3*72]

    TRANSPOSE4x4PS 4, 5, 2, 7, 3

    addps    m1, m7
    SPILL    1, 11

    addps    m3, m5, m2
    SPILL    3, 13

    addps    m7, m2
    addps    m5, m4
    addps    m6, m7
    mova     [tmpq], m6
    addps    m7, m5
    mova     [tmpq+4*16], m7

    ; elements 4..7
    mova     m2, [inq+16]
    movu     m7, [inq+16 + 72]
    mova     m1, [inq+16 + 2*72]
    movu     m6, [inq+16 + 3*72]

    TRANSPOSE4x4PS 2, 7, 1, 6, 3

    addps    m4, m6
    addps    m6, m1
    addps    m1, m7
    addps    m7, m2
    addps    m5, m6
    SPILL    5, 15
    addps    m6, m7
    mulps    m6, [costabs + 16*2]
    mova     [tmpq+4*8], m6
    SPILL    1, 10
    SPILL    0, 14

    ; elements 0..3
    mova     m1, [inq]
    movu     m6, [inq + 72]
    mova     m3, [inq + 2*72]
    movu     m5, [inq + 3*72]

    TRANSPOSE4x4PS 1, 6, 3, 5, 0

    addps    m2, m5
    addps    m5, m3
    addps    m7, m5
    addps    m3, m6
    addps    m6, m1
    SPILL    7, 8
    addps    m5, m6
    SPILL    6, 9
    addps    m6, m4, SPILLED(12)
    subps    m6, m2
    UNSPILL  7, 11
    SPILL    5, 11
    subps    m5, m1, m7
    mulps    m7, [costabs + 16*5]
    addps    m7, m1
    mulps    m0, m6, [costabs + 16*6]
    addps    m0, m5
    mova     [tmpq+4*24], m0
    addps    m6, m5
    mova     [tmpq+4*4], m6
    addps    m6, m4, m2
    mulps    m6, [costabs + 16*1]
    subps    m4, SPILLED(12)
    mulps    m4, [costabs + 16*8]
    addps    m2, SPILLED(12)
    mulps    m2, [costabs + 16*3]
    subps    m5, m7, m6
    subps    m5, m2
    addps    m6, m7
    addps    m6, m4
    addps    m7, m2
    subps    m7, m4
    mova     [tmpq+4*20], m7
    mova     m2, [tmpq+4*28]
    mova     [tmpq+4*28], m5
    UNSPILL  7, 13
    subps    m5, m7, m2
    mulps    m5, [costabs + 16*7]
    UNSPILL  1, 10
    mulps    m1, [costabs + 16*2]
    addps    m4, m3, m2
    mulps    m4, [costabs + 16*4]
    addps    m2, m7
    addps    m7, m3
    mulps    m7, [costabs]
    subps    m3, m2
    mulps    m3, [costabs + 16*2]
    addps    m2, m7, m5
    addps    m2, m1
    SPILL    2, 10
    addps    m7, m4
    subps    m7, m1
    SPILL    7, 12
    subps    m5, m4
    subps    m5, m1
    UNSPILL  0, 14
    SPILL    5, 13
    addps    m1, m0, SPILLED(15)
    subps    m1, SPILLED(8)
    mova     m4, [costabs + 16*5]
    mulps    m4, [tmpq]
    UNSPILL  2, 9
    addps    m4, m2
    subps    m2, [tmpq]
    mulps    m5, m1, [costabs + 16*6]
    addps    m5, m2
    SPILL    5, 9
    addps    m2, m1
    SPILL    2, 14
    UNSPILL  5, 15
    subps    m7, m5, m0
    addps    m5, SPILLED(8)
    mulps    m5, [costabs + 16*1]
    mulps    m7, [costabs + 16*8]
    addps    m0, SPILLED(8)
    mulps    m0, [costabs + 16*3]
    subps    m2, m4, m5
    subps    m2, m0
    SPILL    2, 15
    addps    m5, m4
    addps    m5, m7
    addps    m4, m0
    subps    m4, m7
    SPILL    4, 8
    mova     m7, [tmpq+4*16]
    mova     m2, [tmpq+4*12]
    addps    m0, m7, m2
    subps    m0, SPILLED(11)
    mulps    m0, [costabs + 16*2]
    addps    m4, m7, SPILLED(11)
    mulps    m4, [costabs]
    subps    m7, m2
    mulps    m7, [costabs + 16*7]
    addps    m2, SPILLED(11)
    mulps    m2, [costabs + 16*4]
    addps    m1, m7, [tmpq+4*8]
    addps    m1, m4
    addps    m4, m2
    subps    m4, [tmpq+4*8]
    SPILL    4, 11
    subps    m7, m2
    subps    m7, [tmpq+4*8]
    addps    m4, m6, SPILLED(10)
    subps    m6, SPILLED(10)
    addps    m2, m5, m1
    mulps    m2, [costabs + 16*9]
    subps    m5, m1
    mulps    m5, [costabs + 16*17]
    ; from here on: out row = value * win row + buf row,
    ;               buf row = value * win row (for the next call)
    subps    m1, m4, m2
    addps    m4, m2
    mulps    m2, m1, [winq+4*36]
    addps    m2, [bufq+4*36]
    mova     [outq+1152], m2
    mulps    m1, [winq+4*32]
    addps    m1, [bufq+4*32]
    mova     [outq+1024], m1
    mulps    m1, m4, [winq+4*116]
    mova     [bufq+4*36], m1
    mulps    m4, [winq+4*112]
    mova     [bufq+4*32], m4
    addps    m2, m6, m5
    subps    m6, m5
    mulps    m1, m6, [winq+4*68]
    addps    m1, [bufq+4*68]
    mova     [outq+2176], m1
    mulps    m6, [winq]
    addps    m6, [bufq]
    mova     [outq], m6
    mulps    m1, m2, [winq+4*148]
    mova     [bufq+4*68], m1
    mulps    m2, [winq+4*80]
    mova     [bufq], m2
    addps    m5, m3, [tmpq+4*24]
    mova     m2, [tmpq+4*24]
    subps    m2, m3
    mova     m1, SPILLED(9)
    subps    m1, m0
    mulps    m1, [costabs + 16*10]
    addps    m0, SPILLED(9)
    mulps    m0, [costabs + 16*16]
    addps    m6, m5, m1
    subps    m5, m1
    mulps    m3, m5, [winq+4*40]
    addps    m3, [bufq+4*40]
    mova     [outq+1280], m3
    mulps    m5, [winq+4*28]
    addps    m5, [bufq+4*28]
    mova     [outq+896], m5
    mulps    m1, m6, [winq+4*120]
    mova     [bufq+4*40], m1
    mulps    m6, [winq+4*108]
    mova     [bufq+4*28], m6
    addps    m1, m2, m0
    subps    m2, m0
    mulps    m5, m2, [winq+4*64]
    addps    m5, [bufq+4*64]
    mova     [outq+2048], m5
    mulps    m2, [winq+4*4]
    addps    m2, [bufq+4*4]
    mova     [outq+128], m2
    mulps    m0, m1, [winq+4*144]
    mova     [bufq+4*64], m0
    mulps    m1, [winq+4*84]
    mova     [bufq+4*4], m1
    mova     m1, [tmpq+4*28]
    mova     m5, m1
    addps    m1, SPILLED(13)
    subps    m5, SPILLED(13)
    UNSPILL  3, 15
    addps    m2, m7, m3
    mulps    m2, [costabs + 16*11]
    subps    m3, m7
    mulps    m3, [costabs + 16*15]
    addps    m0, m2, m1
    subps    m1, m2
    SWAP     m0, m2
    mulps    m6, m1, [winq+4*44]
    addps    m6, [bufq+4*44]
    mova     [outq+1408], m6
    mulps    m1, [winq+4*24]
    addps    m1, [bufq+4*24]
    mova     [outq+768], m1
    mulps    m0, m2, [winq+4*124]
    mova     [bufq+4*44], m0
    mulps    m2, [winq+4*104]
    mova     [bufq+4*24], m2
    addps    m0, m5, m3
    subps    m5, m3
    mulps    m1, m5, [winq+4*60]
    addps    m1, [bufq+4*60]
    mova     [outq+1920], m1
    mulps    m5, [winq+4*8]
    addps    m5, [bufq+4*8]
    mova     [outq+256], m5
    mulps    m1, m0, [winq+4*140]
    mova     [bufq+4*60], m1
    mulps    m0, [winq+4*88]
    mova     [bufq+4*8], m0
    mova     m1, [tmpq+4*20]
    addps    m1, SPILLED(12)
    mova     m2, [tmpq+4*20]
    subps    m2, SPILLED(12)
    UNSPILL  7, 8
    subps    m0, m7, SPILLED(11)
    addps    m7, SPILLED(11)
    mulps    m4, m7, [costabs + 16*12]
    mulps    m0, [costabs + 16*14]
    addps    m5, m1, m4
    subps    m1, m4
    mulps    m7, m1, [winq+4*48]
    addps    m7, [bufq+4*48]
    mova     [outq+1536], m7
    mulps    m1, [winq+4*20]
    addps    m1, [bufq+4*20]
    mova     [outq+640], m1
    mulps    m1, m5, [winq+4*128]
    mova     [bufq+4*48], m1
    mulps    m5, [winq+4*100]
    mova     [bufq+4*20], m5
    addps    m6, m2, m0
    subps    m2, m0
    mulps    m1, m2, [winq+4*56]
    addps    m1, [bufq+4*56]
    mova     [outq+1792], m1
    mulps    m2, [winq+4*12]
    addps    m2, [bufq+4*12]
    mova     [outq+384], m2
    mulps    m0, m6, [winq+4*136]
    mova     [bufq+4*56], m0
    mulps    m6, [winq+4*92]
    mova     [bufq+4*12], m6
    UNSPILL  0, 14
    mulps    m0, [costabs + 16*13]
    mova     m3, [tmpq+4*4]
    addps    m2, m0, m3
    subps    m3, m0
    mulps    m0, m3, [winq+4*52]
    addps    m0, [bufq+4*52]
    mova     [outq+1664], m0
    mulps    m3, [winq+4*16]
    addps    m3, [bufq+4*16]
    mova     [outq+512], m3
    mulps    m0, m2, [winq+4*132]
    mova     [bufq+4*52], m0
    mulps    m2, [winq+4*96]
    mova     [bufq+4*16], m2
    RET
%endmacro

INIT_XMM sse
DEFINE_FOUR_IMDCT

INIT_XMM avx
DEFINE_FOUR_IMDCT