;******************************************************************************
;* FFT transform with SSE/3DNow optimizations
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2011 Vitor Sessak
;*
;* This algorithm (though not any of the implementation details) is
;* based on libdjbfft by D. J. Bernstein.
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; These functions are not individually interchangeable with the C versions.
; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
; in blocks as convenient to the vector size.
; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)

%include "libavutil/x86/x86util.asm"

; 'pointer' reserves one native-size pointer slot inside a struc.
%if ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif

SECTION_RODATA 32

; Offset mirror of the C-side FFTContext; only the field offsets are used here,
; so this layout must stay in sync with the C struct definition.
struc FFTContext
    .nbits:    resd 1
    .reverse:  resd 1
    .revtab:   pointer 1
    .tmpbuf:   pointer 1
    .mdctsize: resd 1
    .mdctbits: resd 1
    .tcos:     pointer 1
    .tsin:     pointer 1
    .fftperm:  pointer 1
    .fftcalc:  pointer 1
    .imdctcalc:pointer 1
    .imdcthalf:pointer 1
endstruc

%define M_SQRT1_2    0.70710678118654752440
%define M_COS_PI_1_8 0.923879532511287
%define M_COS_PI_3_8 0.38268343236509

; Twiddle factors for the radix-4 combine step of the FFT16 kernels.
ps_cos16_1:          dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
ps_cos16_2:          dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8

ps_root2:            times 8 dd M_SQRT1_2
ps_root2mppm:        dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
; 1<<31 is the float sign bit: these tables are xor-masks that negate
; selected lanes (the names encode the per-lane sign pattern).
ps_p1p1m1p1:         dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0

; Lane-permutation control words for vpermilps in T8_AVX.
perm1:               dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
perm2:               dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
ps_p1p1m1p1root2:    dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
ps_m1m1m1m1:         times 4 dd 1<<31
ps_m1p1:             dd 1<<31, 0

; Declare the external twiddle tables cos_16, cos_32, ..., cos_65536.
%assign i 16
%rep 13
cextern cos_ %+ i
%assign i i<<1
%endrep

; Redefine 'pointer' to emit a native-size pointer in data sections
; (used below for the dispatch tables).
%if ARCH_X86_64
    %define pointer dq
%else
    %define pointer dd
%endif

; IF0 discards its argument, IF1 emits it; used to parametrize the PASS_*
; macros on whether intermediate results are stored.
%macro IF0 1+
%endmacro
%macro IF1 1+
    %1
%endmacro

SECTION_TEXT

; Radix-2 butterfly on 2-float (3DNow!) registers:
; %1 = %3 + %4, %2 = %3 - %4 (loads %3 from memory first).
%macro T2_3DNOW 4 ; z0, z1, mem0, mem1
    mova     %1, %3
    mova     %2, %1
    pfadd    %1, %4
    pfsub    %2, %4
%endmacro

; Radix-4 combine for the 3DNow! path. Note the bounce of one lane through
; [r0+8..12] to swap halves of %3 (no cheap in-register shuffle in base
; 3DNow!); r0 must point at scratch-safe output memory here.
%macro T4_3DNOW 6 ; z0, z1, z2, z3, tmp0, tmp1
    mova     %5, %3
    pfsub    %3, %4
    pfadd    %5, %4 ; {t6,t5}
    pxor     %3, [ps_m1p1] ; {t8,t7}
    mova     %6, %1
    movd [r0+12], %3
    punpckhdq %3, [r0+8]
    pfadd    %1, %5 ; {r0,i0}
    pfsub    %6, %5 ; {r2,i2}
    mova     %4, %2
    pfadd    %2, %3 ; {r1,i1}
    pfsub    %4, %3 ; {r3,i3}
    SWAP     %3, %6
%endmacro

; Full 8-point transform entirely in ymm registers (AVX).
; in:  %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
;      %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
;      %3, %4, %5 tmp
; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
;      %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
%macro T8_AVX 5
    vsubps     %5, %1, %2       ; v  = %1 - %2
    vaddps     %3, %1, %2       ; w  = %1 + %2
    vmulps     %2, %5, [ps_p1p1m1p1root2] ; v *= vals1
    vpermilps  %2, %2, [perm1]
    vblendps   %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
    vshufps    %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
    vsubps     %4, %5, %1       ; s = r - q
    vaddps     %1, %5, %1       ; u = r + q
    vpermilps  %1, %1, [perm2]  ; k = {u1,u2,u3,u4,u6,u5,u7,u8}
    vshufps    %5, %4, %1, 0xbb
    vshufps    %3, %4, %1, 0xee
    vperm2f128 %3, %3, %5, 0x13
    vxorps     %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {1,1,-1,-1,1,-1,-1,-1}
    vshufps    %2, %1, %4, 0xdd
    vshufps    %1, %1, %4, 0x88
    vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4}
    vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7}
    vsubps     %5, %1, %3
    vblendps   %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
    vsubps     %2, %4, %1       ; %2 = v - w
    vaddps     %1, %4, %1       ; %1 = v + w
%endmacro

; In SSE mode do one fft4 transforms
; in:  %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
;
; In AVX mode do two fft4 transforms
; in:  %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
%macro T4_SSE 3
    subps    %3, %1, %2       ; {t3,t4,-t8,t7}
    addps    %1, %1, %2       ; {t1,t2,t6,t5}
    xorps    %3, %3, [ps_p1p1m1p1]
    shufps   %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
    shufps   %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
    subps    %3, %1, %2       ; {r2,i2,r3,i3}
    addps    %1, %1, %2       ; {r0,i0,r1,i1}
    shufps   %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
    shufps   %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
%endmacro

; In SSE mode do one FFT8
; in:  %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
;
; In AVX mode do two FFT8
; in:  %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
;      %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
;      %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
%macro T8_SSE 6
    addps    %6, %3, %4       ; {t1,t2,t3,t4}
    subps    %3, %3, %4       ; {r5,i5,r7,i7}
    shufps   %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
    mulps    %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
    mulps    %4, %4, [ps_root2]
    addps    %3, %3, %4       ; {t8,t7,ta,t9}
    shufps   %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
    shufps   %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
    subps    %3, %6, %4       ; {t6,t5,tc,tb}
    addps    %6, %6, %4       ; {t1,t2,t9,ta}
    shufps   %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
    shufps   %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
    subps    %3, %1, %6       ; {r4,r5,r6,r7}
    addps    %1, %1, %6       ; {r0,r1,r2,r3}
    subps    %4, %2, %5       ; {i4,i5,i6,i7}
    addps    %2, %2, %5       ; {i0,i1,i2,i3}
%endmacro

; One radix-4 butterfly pass with twiddles %2/%3; %1 selects whether
; m4-m7 are loaded from Z(4..7) (1) or are already live (0).
; scheduled for cpu-bound sizes
%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
IF%1 mova    m4, Z(4)
IF%1 mova    m5, Z(5)
    mova     m0, %2 ; wre
    mova     m1, %3 ; wim
    mulps    m2, m4, m0 ; r2*wre
IF%1 mova    m6, Z2(6)
    mulps    m3, m5, m1 ; i2*wim
IF%1 mova    m7, Z2(7)
    mulps    m4, m4, m1 ; r2*wim
    mulps    m5, m5, m0 ; i2*wre
    addps    m2, m2, m3 ; r2*wre + i2*wim
    mulps    m3, m1, m7 ; i3*wim
    subps    m5, m5, m4 ; i2*wre - r2*wim
    mulps    m1, m1, m6 ; r3*wim
    mulps    m4, m0, m6 ; r3*wre
    mulps    m0, m0, m7 ; i3*wre
    subps    m4, m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
    addps    m0, m0, m1 ; i3*wre + r3*wim
    subps    m1, m4, m2 ; t3
    addps    m4, m4, m2 ; t5
    subps    m3, m3, m4 ; r2
    addps    m4, m4, Z(0) ; r0
    mova     m6, Z(2)
    mova   Z(4), m3
    mova   Z(0), m4
    subps    m3, m5, m0 ; t4
    subps    m4, m6, m3 ; r3
    addps    m3, m3, m6 ; r1
    mova  Z2(6), m4
    mova   Z(2), m3
    mova     m2, Z(3)
    addps    m3, m5, m0 ; t6
    subps    m2, m2, m1 ; i3
    mova     m7, Z(1)
    addps    m1, m1, Z(3) ; i1
    mova  Z2(7), m2
    mova   Z(3), m1
    subps    m4, m7, m3 ; i2
    addps    m3, m3, m7 ; i0
    mova   Z(5), m4
    mova   Z(1), m3
%endmacro

; Same butterfly as PASS_SMALL but with twiddles loaded from [wq]/[wq+o1q];
; when %1==0 the results are interleaved back to {re,im} order via INTERL
; instead of being stored in split-block order.
; scheduled to avoid store->load aliasing
%macro PASS_BIG 1 ; (!interleave)
    mova     m4, Z(4) ; r2
    mova     m5, Z(5) ; i2
    mova     m0, [wq] ; wre
    mova     m1, [wq+o1q] ; wim
    mulps    m2, m4, m0 ; r2*wre
    mova     m6, Z2(6) ; r3
    mulps    m3, m5, m1 ; i2*wim
    mova     m7, Z2(7) ; i3
    mulps    m4, m4, m1 ; r2*wim
    mulps    m5, m5, m0 ; i2*wre
    addps    m2, m2, m3 ; r2*wre + i2*wim
    mulps    m3, m1, m7 ; i3*wim
    mulps    m1, m1, m6 ; r3*wim
    subps    m5, m5, m4 ; i2*wre - r2*wim
    mulps    m4, m0, m6 ; r3*wre
    mulps    m0, m0, m7 ; i3*wre
    subps    m4, m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
    addps    m0, m0, m1 ; i3*wre + r3*wim
    subps    m1, m4, m2 ; t3
    addps    m4, m4, m2 ; t5
    subps    m3, m3, m4 ; r2
    addps    m4, m4, Z(0) ; r0
    mova     m6, Z(2)
    mova   Z(4), m3
    mova   Z(0), m4
    subps    m3, m5, m0 ; t4
    subps    m4, m6, m3 ; r3
    addps    m3, m3, m6 ; r1
IF%1 mova Z2(6), m4
IF%1 mova  Z(2), m3
    mova     m2, Z(3)
    addps    m5, m5, m0 ; t6
    subps    m2, m2, m1 ; i3
    mova     m7, Z(1)
    addps    m1, m1, Z(3) ; i1
IF%1 mova Z2(7), m2
IF%1 mova  Z(3), m1
    subps    m6, m7, m5 ; i2
    addps    m5, m5, m7 ; i0
IF%1 mova  Z(5), m6
IF%1 mova  Z(1), m5
%if %1==0
    INTERL m1, m3, m7, Z, 2
    INTERL m2, m4, m0, Z2, 6

    mova     m1, Z(0)
    mova     m2, Z(4)

    INTERL m5, m1, m3, Z, 0
    INTERL m6, m2, m7, Z, 4
%endif
%endmacro

; Interleave the dwords of %1 and %2: low halves into %1, high halves into %3.
%macro PUNPCK 3
    mova      %3, %1
    punpckldq %1, %2
    punpckhdq %3, %2
%endmacro

; Addressing for the leaf transforms: r0 points at the FFTComplex block.
%define Z(x) [r0+mmsize*x]
%define Z2(x) [r0+mmsize*x]
%define ZH(x) [r0+mmsize*x+mmsize/2]

INIT_YMM avx

%if HAVE_AVX_EXTERNAL

; Internal leaf transforms. These use a private calling convention:
; r0 = FFTComplex *z (data in/out); they are only reached through the
; dispatch tables / DECL_FFT chains, never called from C directly.
align 16
fft8_avx:
    mova      m0, Z(0)
    mova      m1, Z(1)
    T8_AVX    m0, m1, m2, m3, m4
    mova      Z(0), m0
    mova      Z(1), m1
    ret


align 16
fft16_avx:
    mova       m2, Z(2)
    mova       m3, Z(3)
    T4_SSE     m2, m3, m7

    mova       m0, Z(0)
    mova       m1, Z(1)
    T8_AVX     m0, m1, m4, m5, m7

    mova       m4, [ps_cos16_1]
    mova       m5, [ps_cos16_2]
    vmulps     m6, m2, m4
    vmulps     m7, m3, m5
    vaddps     m7, m7, m6
    vmulps     m2, m2, m5
    vmulps     m3, m3, m4
    vsubps     m3, m3, m2
    vblendps   m2, m7, m3, 0xf0
    vperm2f128 m3, m7, m3, 0x21
    vaddps     m4, m2, m3
    vsubps     m2, m3, m2
    vperm2f128 m2, m2, m2, 0x01
    vsubps     m3, m1, m2
    vaddps     m1, m1, m2
    vsubps     m5, m0, m4
    vaddps     m0, m0, m4
    ; Store split: reals to Z(n), imaginaries to ZH(n).
    vextractf128   Z(0), m0, 0
    vextractf128  ZH(0), m1, 0
    vextractf128   Z(1), m0, 1
    vextractf128  ZH(1), m1, 1
    vextractf128   Z(2), m5, 0
    vextractf128  ZH(2), m3, 0
    vextractf128   Z(3), m5, 1
    vextractf128  ZH(3), m3, 1
    ret

align 16
fft32_avx:
    call fft16_avx

    mova m0, Z(4)
    mova m1, Z(5)

    T4_SSE      m0, m1, m4

    mova m2, Z(6)
    mova m3, Z(7)

    T8_SSE      m0, m1, m2, m3, m4, m6
    ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
    ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}

    vperm2f128  m4, m0, m2, 0x20
    vperm2f128  m5, m1, m3, 0x20
    vperm2f128  m6, m0, m2, 0x31
    vperm2f128  m7, m1, m3, 0x31

    PASS_SMALL 0, [cos_32], [cos_32+32]

    ret

; fft32 followed by deinterleaving the split {re...}{im...} blocks back
; into {re,im} pair order in place.
fft32_interleave_avx:
    call fft32_avx
    mov r2d, 32
.deint_loop:
    mova     m2, Z(0)
    mova     m3, Z(1)
    vunpcklps      m0, m2, m3
    vunpckhps      m1, m2, m3
    vextractf128   Z(0), m0, 0
    vextractf128  ZH(0), m1, 0
    vextractf128   Z(1), m0, 1
    vextractf128  ZH(1), m1, 1
    add r0, mmsize*2
    sub r2d, mmsize/4
    jg .deint_loop
    ret

%endif

INIT_XMM sse

; fft4 is identical for SSE and AVX builds, hence the double label.
align 16
fft4_avx:
fft4_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova   Z(0), m0
    mova   Z(1), m1
    ret

align 16
fft8_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     m2, Z(2)
    mova     m3, Z(3)
    T8_SSE   m0, m1, m2, m3, m4, m5
    mova   Z(0), m0
    mova   Z(1), m1
    mova   Z(2), m2
    mova   Z(3), m3
    ret

align 16
fft16_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     m2, Z(2)
    mova     m3, Z(3)
    T8_SSE   m0, m1, m2, m3, m4, m5
    mova     m4, Z(4)
    mova     m5, Z(5)
    mova   Z(0), m0
    mova   Z(1), m1
    mova   Z(2), m2
    mova   Z(3), m3
    T4_SSE   m4, m5, m6
    mova     m6, Z2(6)
    mova     m7, Z2(7)
    T4_SSE   m6, m7, m0
    PASS_SMALL 0, [cos_16], [cos_16+16]
    ret


; Emits fft4/fft8 leaf transforms for the current 3DNow! flavour (SUFFIX).
%macro FFT48_3DNOW 0
align 16
fft4 %+ SUFFIX:
    T2_3DNOW m0, m1, Z(0), Z(1)
    mova     m2, Z(2)
    mova     m3, Z(3)
    T4_3DNOW m0, m1, m2, m3, m4, m5
    PUNPCK   m0, m1, m4
    PUNPCK   m2, m3, m5
    mova   Z(0), m0
    mova   Z(1), m4
    mova   Z(2), m2
    mova   Z(3), m5
    ret

align 16
fft8 %+ SUFFIX:
    T2_3DNOW m0, m1, Z(0), Z(1)
    mova     m2, Z(2)
    mova     m3, Z(3)
    T4_3DNOW m0, m1, m2, m3, m4, m5
    mova   Z(0), m0
    mova   Z(2), m2
    T2_3DNOW m4, m5,  Z(4),  Z(5)
    T2_3DNOW m6, m7, Z2(6), Z2(7)
    PSWAPD   m0, m5
    PSWAPD   m2, m7
    pxor     m0, [ps_m1p1]
    pxor     m2, [ps_m1p1]
    pfsub    m5, m0
    pfadd    m7, m2
    pfmul    m5, [ps_root2]
    pfmul    m7, [ps_root2]
    T4_3DNOW m1, m3, m5, m7, m0, m2
    mova   Z(5), m5
    mova  Z2(7), m7
    mova     m0, Z(0)
    mova     m2, Z(2)
    T4_3DNOW m0, m2, m4, m6, m5, m7
    PUNPCK   m0, m1, m5
    PUNPCK   m2, m3, m7
    mova   Z(0), m0
    mova   Z(1), m5
    mova   Z(2), m2
    mova   Z(3), m7
    PUNPCK   m4,  Z(5), m5
    PUNPCK   m6, Z2(7), m7
    mova   Z(4), m4
    mova   Z(5), m5
    mova  Z2(6), m6
    mova  Z2(7), m7
    ret
%endmacro

%if ARCH_X86_32
INIT_MMX 3dnowext
FFT48_3DNOW

INIT_MMX 3dnow
FFT48_3DNOW
%endif

; Addressing for the big passes (DECL_PASS): zcq walks the data,
; o1q/o3q are precomputed block strides (see DECL_PASS below).
%define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)]
%define Z2(x) [zcq + o3q + mmsize*(x&1)]
%define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
%define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2]

; Defines a pass function %1 that loops payload %2 over the whole array.
; Args (internal convention): zc = data, w = twiddles, n = element count.
%macro DECL_PASS 2+ ; name, payload
align 16
%1:
DEFINE_ARGS zc, w, n, o1, o3
    lea o3q, [nq*3]
    lea o1q, [nq*8]
    shl o3q, 4
.loop:
    %2
    add zcq, mmsize*2
    add  wq, mmsize
    sub  nd, mmsize/8
    jg .loop
    rep ret
%endmacro

; Tail-calls the fft of size 1<<%2 via the dispatch table %1.
; Entries are section-relative under PIC, hence the $$ fixup.
%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
    lea r2, [dispatch_tab%1]
    mov r2, [r2 + (%2q-2)*gprsize]
%ifdef PIC
    lea r3, [$$]
    add r2, r3
%endif
    call r2
%endmacro ; FFT_DISPATCH

INIT_YMM avx

%if HAVE_AVX_EXTERNAL
; Store %2/%1 interleaved as {re,im} pairs to %4(%5) and %4(%5+1).
%macro INTERL_AVX 5
    vunpckhps      %3, %2, %1
    vunpcklps      %2, %2, %1
    vextractf128   %4(%5), %2, 0
    vextractf128  %4 %+ H(%5), %3, 0
    vextractf128   %4(%5 + 1), %2, 1
    vextractf128  %4 %+ H(%5 + 1), %3, 1
%endmacro

%define INTERL INTERL_AVX

DECL_PASS pass_avx, PASS_BIG 1
DECL_PASS pass_interleave_avx, PASS_BIG 0

; void ff_fft_calc_avx(FFTContext *s, FFTComplex *z)
cglobal fft_calc, 2,5,8
    mov     r3d, [r0 + FFTContext.nbits]
    mov     r0, r1
    mov     r1, r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    REP_RET

%endif

INIT_XMM sse

; SSE variant of INTERL (2-operand forms, direct stores).
%macro INTERL_SSE 5
    mova     %3, %2
    unpcklps %2, %1
    unpckhps %3, %1
    mova  %4(%5), %2
    mova  %4(%5+1), %3
%endmacro

%define INTERL INTERL_SSE

DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0

; void ff_fft_calc_*(FFTContext *s, FFTComplex *z)
; For sizes <= the interleave threshold the dispatch already produced
; interleaved output; otherwise a final deinterleave loop fixes up the
; last blocks (the interleave passes stop short of the full array).
%macro FFT_CALC_FUNC 0
cglobal fft_calc, 2,5,8
    mov     r3d, [r0 + FFTContext.nbits]
    PUSH    r1
    PUSH    r3
    mov     r0, r1
    mov     r1, r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    POP     rcx
    POP     r4
    cmp     rcx, 3+(mmsize/16)
    jg      .end
    mov     r2, -1
    add     rcx, 3
    shl     r2, cl
    sub     r4, r2
.loop:
%if mmsize == 8
    PSWAPD  m0, [r4 + r2 + 4]
    mova [r4 + r2 + 4], m0
%else
    movaps   xmm0, [r4 + r2]
    movaps   xmm1, xmm0
    unpcklps xmm0, [r4 + r2 + 16]
    unpckhps xmm1, [r4 + r2 + 16]
    movaps   [r4 + r2], xmm0
    movaps   [r4 + r2 + 16], xmm1
%endif
    add      r2, mmsize*2
    jl       .loop
.end:
%if cpuflag(3dnow)
    femms       ; restore x87 state after MMX use
    RET
%else
    REP_RET
%endif
%endmacro

%if ARCH_X86_32
INIT_MMX 3dnow
FFT_CALC_FUNC
INIT_MMX 3dnowext
FFT_CALC_FUNC
%endif
INIT_XMM sse
FFT_CALC_FUNC

; void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
; Scatters z through revtab into tmpbuf, then copies tmpbuf back over z.
cglobal fft_permute, 2,7,1
    mov     r4,  [r0 + FFTContext.revtab]
    mov     r5,  [r0 + FFTContext.tmpbuf]
    mov     ecx, [r0 + FFTContext.nbits]
    mov     r2, 1
    shl     r2, cl          ; r2 = 1 << nbits = number of complex elements
    xor     r0, r0
%if ARCH_X86_32
    mov     r1, r1m
%endif
.loop:
    movaps  xmm0, [r1 + 8*r0]
    movzx   r6, word [r4 + 2*r0]
    movzx   r3, word [r4 + 2*r0 + 2]
    movlps  [r5 + 8*r6], xmm0
    movhps  [r5 + 8*r3], xmm0
    add     r0, 2
    cmp     r0, r2
    jl      .loop
    shl     r2, 3
    add     r1, r2
    add     r5, r2
    neg     r2
; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B
.loopcopy:
    movaps  xmm0, [r5 + r2]
    movaps  xmm1, [r5 + r2 + 16]
    movaps  [r1 + r2], xmm0
    movaps  [r1 + r2 + 16], xmm1
    add     r2, 32
    jl      .loopcopy
    REP_RET

; void ff_imdct_calc_*(FFTContext *s, FFTSample *output, const FFTSample *input)
; Calls imdct_half, then mirrors/negates its output to build the full window.
%macro IMDCT_CALC_FUNC 0
cglobal imdct_calc, 3,5,3
    mov     r3d, [r0 + FFTContext.mdctsize]
    mov     r4,  [r0 + FFTContext.imdcthalf]
    add     r1,  r3
    PUSH    r3
    PUSH    r1
%if ARCH_X86_32
    push    r2
    push    r1
    push    r0
%else
    sub     rsp, 8+32*WIN64 ; allocate win64 shadow space
%endif
    call    r4
%if ARCH_X86_32
    add     esp, 12
%else
    add     rsp, 8+32*WIN64
%endif
    POP     r1
    POP     r3
    lea     r0, [r1 + 2*r3]
    mov     r2, r3
    sub     r3, mmsize
    neg     r2
    mova    m2, [ps_m1m1m1m1]
.loop:
%if mmsize == 8
    PSWAPD  m0, [r1 + r3]
    PSWAPD  m1, [r0 + r2]
    pxor    m0, m2
%else
    mova    m0, [r1 + r3]
    mova    m1, [r0 + r2]
    shufps  m0, m0, 0x1b    ; reverse the 4 floats
    shufps  m1, m1, 0x1b
    xorps   m0, m2          ; negate
%endif
    mova    [r0 + r3], m1
    mova    [r1 + r2], m0
    sub     r3, mmsize
    add     r2, mmsize
    jl      .loop
%if cpuflag(3dnow)
    femms
    RET
%else
    REP_RET
%endif
%endmacro

%if ARCH_X86_32
INIT_MMX 3dnow
IMDCT_CALC_FUNC
INIT_MMX 3dnowext
IMDCT_CALC_FUNC
%endif

INIT_XMM sse
IMDCT_CALC_FUNC

%if ARCH_X86_32
INIT_MMX 3dnow
; Map the SSE mnemonics used inside PASS_* onto their 3DNow! equivalents
; so the same pass macros can be instantiated for MMX registers.
%define mulps pfmul
%define addps pfadd
%define subps pfsub
%define unpcklps punpckldq
%define unpckhps punpckhdq
DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q]
DECL_PASS pass_interleave_3dnow, PASS_BIG 0
%define pass_3dnowext pass_3dnow
%define pass_interleave_3dnowext pass_interleave_3dnow
%endif

; Under PIC, dispatch-table entries are stored relative to the section
; start ($$) and fixed up at call time in FFT_DISPATCH.
%ifdef PIC
%define SECTION_REL - $$
%else
%define SECTION_REL
%endif

; Builds fft(2^%1 * 2), ..., fft131072 as recursive radix-2/4 combinations
; of the leaf transforms, plus the dispatch table indexed by nbits.
%macro DECL_FFT 1-2 ; nbits, suffix
%ifidn %0, 1
%xdefine fullsuffix SUFFIX
%else
%xdefine fullsuffix %2 %+ SUFFIX
%endif
%xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL
%if %1>=5
%xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL
%endif
%if %1>=6
%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
%endif

%assign n 1<<%1
%rep 17-%1
%assign n2 n/2
%assign n4 n/4
%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL

align 16
fft %+ n %+ fullsuffix:
    call fft %+ n2 %+ SUFFIX
    add r0, n*4 - (n&(-2<<%1))
    call fft %+ n4 %+ SUFFIX
    add r0, n*2 - (n2&(-2<<%1))
    call fft %+ n4 %+ SUFFIX
    sub r0, n*6 + (n2&(-2<<%1))
    lea r1, [cos_ %+ n]
    mov r2d, n4/2
    jmp pass %+ fullsuffix

%assign n n*2
%endrep
%undef n

align 8
dispatch_tab %+ fullsuffix: pointer list_of_fft
%endmacro ; DECL_FFT

%if HAVE_AVX_EXTERNAL
INIT_YMM avx
DECL_FFT 6
DECL_FFT 6, _interleave
%endif
INIT_XMM sse
DECL_FFT 5
DECL_FFT 5, _interleave
%if ARCH_X86_32
INIT_MMX 3dnow
DECL_FFT 4
DECL_FFT 4, _interleave
INIT_MMX 3dnowext
DECL_FFT 4
DECL_FFT 4, _interleave
%endif

INIT_XMM sse
; Drop the 3DNow! mnemonic remapping for the MDCT code below.
%undef mulps
%undef addps
%undef subps
%undef unpcklps
%undef unpckhps

; Pre-rotation for imdct_half: multiply the (reversed) input pair by the
; tcos/tsin twiddles; results land in xmm0/xmm1 (SSE) or m0/m2 (3DNow!).
%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
%if mmsize == 8 ; j*2+2-n4, n4-2-j*2, input+n4, tcos+n8, tsin+n8
    PSWAPD     m0, [%3+%2*4]
    movq       m2, [%3+%1*4-8]
    movq       m3, m0
    punpckldq  m0, m2
    punpckhdq  m2, m3
    movd       m1, [%4+%1*2-4] ; tcos[j]
    movd       m3, [%4+%2*2]   ; tcos[n4-j-1]
    punpckldq  m1, [%5+%1*2-4] ; tsin[j]
    punpckldq  m3, [%5+%2*2]   ; tsin[n4-j-1]

    mova       m4, m0
    PSWAPD     m5, m1
    pfmul      m0, m1
    pfmul      m4, m5
    mova       m6, m2
    PSWAPD     m5, m3
    pfmul      m2, m3
    pfmul      m6, m5
%if cpuflag(3dnowext)
    pfpnacc    m0, m4
    pfpnacc    m2, m6
%else
    ; base 3DNow! lacks pfpnacc: emulate it with shuffles and a sign flip
    ; (m7 holds the sign mask, loaded once in DECL_IMDCT).
    SBUTTERFLY dq, 0, 4, 1
    SBUTTERFLY dq, 2, 6, 3
    pxor       m4, m7
    pxor       m6, m7
    pfadd      m0, m4
    pfadd      m2, m6
%endif
%else
    movaps     xmm0, [%3+%2*4]
    movaps     xmm1, [%3+%1*4-0x10]
    movaps     xmm2, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm1, xmm2, 0x77
    movlps     xmm4, [%4+%2*2]
    movlps     xmm5, [%5+%2*2+0x0]
    movhps     xmm4, [%4+%1*2-0x8]
    movhps     xmm5, [%5+%1*2-0x8]
    movaps     xmm2, xmm0
    movaps     xmm3, xmm1
    mulps      xmm0, xmm5
    mulps      xmm1, xmm4
    mulps      xmm2, xmm4
    mulps      xmm3, xmm5
    subps      xmm1, xmm0
    addps      xmm2, xmm3
    movaps     xmm0, xmm1
    unpcklps   xmm1, xmm2
    unpckhps   xmm0, xmm2
%endif
%endmacro

; Complex multiply of (%2,%3) by the twiddles at [%5+%1]/[%6+%1];
; clobbers m6/m7.
%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
    mulps      m6, %3, [%5+%1]
    mulps      m7, %2, [%5+%1]
    mulps      %2, %2, [%6+%1]
    mulps      %3, %3, [%6+%1]
    subps      %2, %2, m6
    addps      %3, %3, m7
%endmacro

; Post-rotation + permutation for imdct_half (AVX): rotate two blocks from
; each end of the buffer and store them swapped/reversed in place.
%macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
    vmovaps      ymm1,   [%3+%1*2]
    vmovaps      ymm0,   [%3+%1*2+0x20]
    vmovaps      ymm3,   [%3+%2*2]
    vmovaps      ymm2,   [%3+%2*2+0x20]

    CMUL         %1, ymm0, ymm1, %3, %4, %5
    CMUL         %2, ymm2, ymm3, %3, %4, %5
    vshufps      ymm1, ymm1, ymm1, 0x1b
    vshufps      ymm3, ymm3, ymm3, 0x1b
    vperm2f128   ymm1, ymm1, ymm1, 0x01
    vperm2f128   ymm3, ymm3, ymm3, 0x01
    vunpcklps    ymm6, ymm2, ymm1
    vunpckhps    ymm4, ymm2, ymm1
    vunpcklps    ymm7, ymm0, ymm3
    vunpckhps    ymm5, ymm0, ymm3

    vextractf128 [%3+%1*2],      ymm7, 0
    vextractf128 [%3+%1*2+0x10], ymm5, 0
    vextractf128 [%3+%1*2+0x20], ymm7, 1
    vextractf128 [%3+%1*2+0x30], ymm5, 1

    vextractf128 [%3+%2*2],      ymm6, 0
    vextractf128 [%3+%2*2+0x10], ymm4, 0
    vextractf128 [%3+%2*2+0x20], ymm6, 1
    vextractf128 [%3+%2*2+0x30], ymm4, 1
    sub          %2, 0x20
    add          %1, 0x20
    jl           .post
%endmacro

; SSE variant of the post-rotation loop (one 16-byte block per end).
%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
    movaps   xmm1, [%3+%1*2]
    movaps   xmm0, [%3+%1*2+0x10]
    CMUL     %1, xmm0, xmm1, %3, %4, %5
    movaps   xmm5, [%3+%2*2]
    movaps   xmm4, [%3+%2*2+0x10]
    CMUL     %2, xmm4, xmm5, %3, %4, %5
    shufps   xmm1, xmm1, 0x1b
    shufps   xmm5, xmm5, 0x1b
    movaps   xmm6, xmm4
    unpckhps xmm4, xmm1
    unpcklps xmm6, xmm1
    movaps   xmm2, xmm0
    unpcklps xmm0, xmm5
    unpckhps xmm2, xmm5
    movaps   [%3+%2*2],      xmm6
    movaps   [%3+%2*2+0x10], xmm4
    movaps   [%3+%1*2],      xmm0
    movaps   [%3+%1*2+0x10], xmm2
    sub      %2, 0x10
    add      %1, 0x10
    jl       .post
%endmacro

; 3DNow! complex multiply: loads the pair at [%1+%2*2] and rotates it by
; the twiddles at [%5+%2]/[%6+%2]; results in %3/%4, clobbers m6/m7.
%macro CMUL_3DNOW 6
    mova     m6, [%1+%2*2]
    mova     %3, [%1+%2*2+8]
    mova     %4, m6
    mova     m7, %3
    pfmul    m6, [%5+%2]
    pfmul    %3, [%6+%2]
    pfmul    %4, [%6+%2]
    pfmul    m7, [%5+%2]
    pfsub    %3, m6
    pfadd    %4, m7
%endmacro

; 3DNow! post-rotation: scatter the rotated dwords via movd/psrlq since
; MMX has no movlps/movhps-style half stores.
%macro POSROTATESHUF_3DNOW 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
    CMUL_3DNOW %3, %1, m0, m1, %4, %5
    CMUL_3DNOW %3, %2, m2, m3, %4, %5
    movd  [%3+%1*2+ 0], m0
    movd  [%3+%2*2+12], m1
    movd  [%3+%2*2+ 0], m2
    movd  [%3+%1*2+12], m3
    psrlq      m0, 32
    psrlq      m1, 32
    psrlq      m2, 32
    psrlq      m3, 32
    movd  [%3+%1*2+ 8], m0
    movd  [%3+%2*2+ 4], m1
    movd  [%3+%2*2+ 8], m2
    movd  [%3+%1*2+ 4], m3
    sub        %2, 8
    add        %1, 8
    jl         .post
%endmacro

; Emits imdct_half for the current ISA; %1 is the post-rotation macro.
; Layout: pre-rotate input into z via revtab, run the in-place FFT through
; FFT_DISPATCH, then post-rotate. On x86_32 the tcos/tsin/revtab pointers
; are spilled to the stack ([esp+8]/[esp+4]/[esp]) because only 7 GPRs
; are available.
%macro DECL_IMDCT 1
cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
%if ARCH_X86_64
%define rrevtab r7
%define rtcos   r8
%define rtsin   r9
%else
%define rrevtab r6
%define rtsin   r6
%define rtcos   r5
%endif
    mov   r3d, [r0+FFTContext.mdctsize]
    add   r2, r3
    shr   r3, 1
    mov   rtcos, [r0+FFTContext.tcos]
    mov   rtsin, [r0+FFTContext.tsin]
    add   rtcos, r3
    add   rtsin, r3
%if ARCH_X86_64 == 0
    push  rtcos
    push  rtsin
%endif
    shr   r3, 1
    mov   rrevtab, [r0+FFTContext.revtab]
    add   rrevtab, r3
%if ARCH_X86_64 == 0
    push  rrevtab
%endif

%if mmsize == 8
    sub   r3, 2
%else
    sub   r3, 4
%endif
%if ARCH_X86_64 || mmsize == 8
    xor   r4, r4
    sub   r4, r3
%endif
%if notcpuflag(3dnowext) && mmsize == 8
    movd  m7, [ps_m1m1m1m1] ; sign mask for the pfpnacc emulation in PREROTATER
%endif
.pre:
%if ARCH_X86_64 == 0
;unspill
%if mmsize != 8
    xor   r4, r4
    sub   r4, r3
%endif
    mov   rtcos, [esp+8]
    mov   rtsin, [esp+4]
%endif

    PREROTATER r4, r3, r2, rtcos, rtsin
%if mmsize == 8
    mov    r6, [esp]                ; rrevtab = ptr+n8
    movzx  r5,  word [rrevtab+r4-2] ; rrevtab[j]
    movzx  r6,  word [rrevtab+r3]   ; rrevtab[n4-j-1]
    mova [r1+r5*8], m0
    mova [r1+r6*8], m2
    add    r4, 2
    sub    r3, 2
%else
%if ARCH_X86_64
    movzx  r5,  word [rrevtab+r4-4]
    movzx  r6,  word [rrevtab+r4-2]
    movzx  r10, word [rrevtab+r3]
    movzx  r11, word [rrevtab+r3+2]
    movlps [r1+r5 *8], xmm0
    movhps [r1+r6 *8], xmm0
    movlps [r1+r10*8], xmm1
    movhps [r1+r11*8], xmm1
    add    r4, 4
%else
    mov    r6, [esp]
    movzx  r5, word [r6+r4-4]
    movzx  r4, word [r6+r4-2]
    movlps [r1+r5*8], xmm0
    movhps [r1+r4*8], xmm0
    movzx  r5, word [r6+r3]
    movzx  r4, word [r6+r3+2]
    movlps [r1+r5*8], xmm1
    movhps [r1+r4*8], xmm1
%endif
    sub    r3, 4
%endif
    jns    .pre

    mov  r5, r0
    mov  r6, r1
    mov  r0, r1
    mov  r1d, [r5+FFTContext.nbits]

    FFT_DISPATCH SUFFIX, r1

    mov  r0d, [r5+FFTContext.mdctsize]
    add  r6, r0
    shr  r0, 1
%if ARCH_X86_64 == 0
%define rtcos r2
%define rtsin r3
    mov  rtcos, [esp+8]
    mov  rtsin, [esp+4]
%endif
    neg  r0
    mov  r1, -mmsize
    sub  r1, r0
    %1 r0, r1, r6, rtcos, rtsin
%if ARCH_X86_64 == 0
    add esp, 12     ; drop the three spilled pointers
%endif
%if mmsize == 8
    femms
%endif
    RET
%endmacro

DECL_IMDCT POSROTATESHUF

%if ARCH_X86_32
INIT_MMX 3dnow
DECL_IMDCT POSROTATESHUF_3DNOW

INIT_MMX 3dnowext
DECL_IMDCT POSROTATESHUF_3DNOW
%endif

INIT_YMM avx

%if HAVE_AVX_EXTERNAL
DECL_IMDCT POSROTATESHUF_AVX
%endif