;******************************************************************************
;* FFT transform with SSE/3DNow optimizations
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; These functions are not individually interchangeable with the C versions.
; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
; in blocks as convenient to the vector size.
; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)

%include "x86inc.asm"

SECTION_RODATA

; sqrt(1/2), broadcast and in a -,+,+,- sign pattern for the radix-8 twiddle
%define M_SQRT1_2 0.70710678118654752440
ps_root2:     times 4 dd M_SQRT1_2
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
; sign mask: flips the sign bit of the low float of a pair
ps_m1p1:      dd 1<<31, 0

; declare the C-side cosine twiddle tables ff_cos_16 .. ff_cos_65536
%assign i 16
%rep 13
cextern ff_cos_ %+ i
%assign i i<<1
%endrep

; pointer-sized data directive, used for the dispatch tables below
%ifdef ARCH_X86_64
    %define pointer dq
%else
    %define pointer dd
%endif

; IF0 swallows its arguments, IF1 emits them; used to make stores conditional
; on a macro parameter (see PASS_SMALL / PASS_BIG)
%macro IF0 1+
%endmacro
%macro IF1 1+
    %1
%endmacro

section .text align=16

; 2-point butterfly (3DNow): %1 = %3 + %4, %2 = %3 - %4
%macro T2_3DN 4 ; z0, z1, mem0, mem1
    mova     %1, %3
    mova     %2, %1
    pfadd    %1, %4
    pfsub    %2, %4
%endmacro

; 4-point butterfly (3DNow), two interleaved complexes per register
%macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1
    mova     %5, %3
    pfsub    %3, %4
    pfadd    %5, %4 ; {t6,t5}
    pxor     %3, [ps_m1p1 GLOBAL] ; {t8,t7}
    mova     %6, %1
    pswapd   %3, %3
    pfadd    %1, %5 ; {r0,i0}
    pfsub    %6, %5 ; {r2,i2}
    mova     %4, %2
    pfadd    %2, %3 ; {r1,i1}
    pfsub    %4, %3 ; {r3,i3}
    SWAP     %3, %6
%endmacro

; 4-point transform, also deinterleaves into planar {reals},{imags} layout
; in:  %1={r0,i0,r1,i1} %2={r2,i2,r3,i3}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
%macro T4_SSE 3
    mova     %3, %1
    shufps   %1, %2, 0x64 ; {r0,i0,r3,i2}
    shufps   %3, %2, 0xce ; {r1,i1,r2,i3}
    mova     %2, %1
    addps    %1, %3 ; {t1,t2,t6,t5}
    subps    %2, %3 ; {t3,t4,t8,t7}
    mova     %3, %1
    shufps   %1, %2, 0x44 ; {t1,t2,t3,t4}
    shufps   %3, %2, 0xbe ; {t6,t5,t7,t8}
    mova     %2, %1
    addps    %1, %3 ; {r0,i0,r1,i1}
    subps    %2, %3 ; {r2,i2,r3,i3}
    mova     %3, %1
    shufps   %1, %2, 0x88 ; {r0,r1,r2,r3}
    shufps   %3, %2, 0xdd ; {i0,i1,i2,i3}
    SWAP     %2, %3
%endmacro

; combine two 4-point halves into an 8-point transform (planar output)
%macro T8_SSE 6 ; r0,i0,r1,i1,t0,t1
    mova     %5, %3
    shufps   %3, %4, 0x44 ; {r4,i4,r6,i6}
    shufps   %5, %4, 0xee ; {r5,i5,r7,i7}
    mova     %6, %3
    subps    %3, %5 ; {r5,i5,r7,i7}
    addps    %6, %5 ; {t1,t2,t3,t4}
    mova     %5, %3
    shufps   %5, %5, 0xb1 ; {i5,r5,i7,r7}
    mulps    %3, [ps_root2mppm GLOBAL] ; {-r5,i5,r7,-i7}
    mulps    %5, [ps_root2 GLOBAL]
    addps    %3, %5 ; {t8,t7,ta,t9}
    mova     %5, %6
    shufps   %6, %3, 0x36 ; {t3,t2,t9,t8}
    shufps   %5, %3, 0x9c ; {t1,t4,t7,ta}
    mova     %3, %6
    addps    %6, %5 ; {t1,t2,t9,ta}
    subps    %3, %5 ; {t6,t5,tc,tb}
    mova     %5, %6
    shufps   %6, %3, 0xd8 ; {t1,t9,t5,tb}
    shufps   %5, %3, 0x8d ; {t2,ta,t6,tc}
    mova     %3, %1
    mova     %4, %2
    addps    %1, %6 ; {r0,r1,r2,r3}
    addps    %2, %5 ; {i0,i1,i2,i3}
    subps    %3, %6 ; {r4,r5,r6,r7}
    subps    %4, %5 ; {i4,i5,i6,i7}
%endmacro

; one split-radix pass; scheduled for cpu-bound sizes
; %1: if 1, load m4-m7 from Z(4..7); %2/%3: twiddle real/imag operands
%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
IF%1 mova    m4, Z(4)
IF%1 mova    m5, Z(5)
    mova     m0, %2 ; wre
    mova     m2, m4
    mova     m1, %3 ; wim
    mova     m3, m5
    mulps    m2, m0 ; r2*wre
IF%1 mova    m6, Z(6)
    mulps    m3, m1 ; i2*wim
IF%1 mova    m7, Z(7)
    mulps    m4, m1 ; r2*wim
    mulps    m5, m0 ; i2*wre
    addps    m2, m3 ; r2*wre + i2*wim
    mova     m3, m1
    mulps    m1, m6 ; r3*wim
    subps    m5, m4 ; i2*wre - r2*wim
    mova     m4, m0
    mulps    m3, m7 ; i3*wim
    mulps    m4, m6 ; r3*wre
    mulps    m0, m7 ; i3*wre
    subps    m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
    addps    m0, m1 ; i3*wre + r3*wim
    mova     m1, m4
    addps    m4, m2 ; t5
    subps    m1, m2 ; t3
    subps    m3, m4 ; r2
    addps    m4, Z(0) ; r0
    mova     m6, Z(2)
    mova     Z(4), m3
    mova     Z(0), m4
    mova     m3, m5
    subps    m5, m0 ; t4
    mova     m4, m6
    subps    m6, m5 ; r3
    addps    m5, m4 ; r1
    mova     Z(6), m6
    mova     Z(2), m5
    mova     m2, Z(3)
    addps    m3, m0 ; t6
    subps    m2, m1 ; i3
    mova     m7, Z(1)
    addps    m1, Z(3) ; i1
    mova     Z(7), m2
    mova     Z(3), m1
    mova     m4, m7
    subps    m7, m3 ; i2
    addps    m3, m4 ; i0
    mova     Z(5), m7
    mova     Z(1), m3
%endmacro

; one split-radix pass; scheduled to avoid store->load aliasing
; %1: if 1, store planar results; if 0, interleave real/imag pairs
; (FFTComplex order) with unpcklps/unpckhps while storing
%macro PASS_BIG 1 ; (!interleave)
    mova     m4, Z(4) ; r2
    mova     m5, Z(5) ; i2
    mova     m2, m4
    mova     m0, [wq] ; wre
    mova     m3, m5
    mova     m1, [wq+o1q] ; wim
    mulps    m2, m0 ; r2*wre
    mova     m6, Z(6) ; r3
    mulps    m3, m1 ; i2*wim
    mova     m7, Z(7) ; i3
    mulps    m4, m1 ; r2*wim
    mulps    m5, m0 ; i2*wre
    addps    m2, m3 ; r2*wre + i2*wim
    mova     m3, m1
    mulps    m1, m6 ; r3*wim
    subps    m5, m4 ; i2*wre - r2*wim
    mova     m4, m0
    mulps    m3, m7 ; i3*wim
    mulps    m4, m6 ; r3*wre
    mulps    m0, m7 ; i3*wre
    subps    m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
    addps    m0, m1 ; i3*wre + r3*wim
    mova     m1, m4
    addps    m4, m2 ; t5
    subps    m1, m2 ; t3
    subps    m3, m4 ; r2
    addps    m4, Z(0) ; r0
    mova     m6, Z(2)
    mova     Z(4), m3
    mova     Z(0), m4
    mova     m3, m5
    subps    m5, m0 ; t4
    mova     m4, m6
    subps    m6, m5 ; r3
    addps    m5, m4 ; r1
IF%1 mova    Z(6), m6
IF%1 mova    Z(2), m5
    mova     m2, Z(3)
    addps    m3, m0 ; t6
    subps    m2, m1 ; i3
    mova     m7, Z(1)
    addps    m1, Z(3) ; i1
IF%1 mova    Z(7), m2
IF%1 mova    Z(3), m1
    mova     m4, m7
    subps    m7, m3 ; i2
    addps    m3, m4 ; i0
IF%1 mova    Z(5), m7
IF%1 mova    Z(1), m3
%if %1==0
    mova     m4, m5 ; r1
    mova     m0, m6 ; r3
    unpcklps m5, m1
    unpckhps m4, m1
    unpcklps m6, m2
    unpckhps m0, m2
    mova     m1, Z(0)
    mova     m2, Z(4)
    mova     Z(2), m5
    mova     Z(3), m4
    mova     Z(6), m6
    mova     Z(7), m0
    mova     m5, m1 ; r0
    mova     m4, m2 ; r2
    unpcklps m1, m3
    unpckhps m5, m3
    unpcklps m2, m7
    unpckhps m4, m7
    mova     Z(0), m1
    mova     Z(1), m5
    mova     Z(4), m2
    mova     Z(5), m4
%endif
%endmacro

; interleave dwords of %1 and %2 into %1 (low halves) and %3 (high halves)
%macro PUNPCK 3
    mova     %3, %1
    punpckldq %1, %2
    punpckhdq %3, %2
%endmacro

INIT_XMM

; small-size FFTs address their data directly off r0
%define Z(x) [r0+mmsize*x]

align 16
fft4_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     Z(0), m0
    mova     Z(1), m1
    ret

align 16
fft8_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     m2, Z(2)
    mova     m3, Z(3)
    T8_SSE   m0, m1, m2, m3, m4, m5
    mova     Z(0), m0
    mova     Z(1), m1
    mova     Z(2), m2
    mova     Z(3), m3
    ret

align 16
fft16_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     m2, Z(2)
    mova     m3, Z(3)
    T8_SSE   m0, m1, m2, m3, m4, m5
    mova     m4, Z(4)
    mova     m5, Z(5)
    mova     Z(0), m0
    mova     Z(1), m1
    mova     Z(2), m2
    mova     Z(3), m3
    T4_SSE   m4, m5, m6
    mova     m6, Z(6)
    mova     m7, Z(7)
    T4_SSE   m6, m7, m0
    PASS_SMALL 0, [ff_cos_16 GLOBAL], [ff_cos_16+16 GLOBAL]
    ret


INIT_MMX

; emit fft4 and fft8 for a 3DNow variant; %1 is the function name suffix
%macro FFT48_3DN 1
align 16
fft4%1:
    T2_3DN   m0, m1, Z(0), Z(1)
    mova     m2, Z(2)
    mova     m3, Z(3)
    T4_3DN   m0, m1, m2, m3, m4, m5
    PUNPCK   m0, m1, m4
    PUNPCK   m2, m3, m5
    mova     Z(0), m0
    mova     Z(1), m4
    mova     Z(2), m2
    mova     Z(3), m5
    ret

align 16
fft8%1:
    T2_3DN   m0, m1, Z(0), Z(1)
    mova     m2, Z(2)
    mova     m3, Z(3)
    T4_3DN   m0, m1, m2, m3, m4, m5
    mova     Z(0), m0
    mova     Z(2), m2
    T2_3DN   m4, m5, Z(4), Z(5)
    T2_3DN   m6, m7, Z(6), Z(7)
    pswapd   m0, m5
    pswapd   m2, m7
    pxor     m0, [ps_m1p1 GLOBAL]
    pxor     m2, [ps_m1p1 GLOBAL]
    pfsub    m5, m0
    pfadd    m7, m2
    pfmul    m5, [ps_root2 GLOBAL]
    pfmul    m7, [ps_root2 GLOBAL]
    T4_3DN   m1, m3, m5, m7, m0, m2
    mova     Z(5), m5
    mova     Z(7), m7
    mova     m0, Z(0)
    mova     m2, Z(2)
    T4_3DN   m0, m2, m4, m6, m5, m7
    PUNPCK   m0, m1, m5
    PUNPCK   m2, m3, m7
    mova     Z(0), m0
    mova     Z(1), m5
    mova     Z(2), m2
    mova     Z(3), m7
    PUNPCK   m4, Z(5), m5
    PUNPCK   m6, Z(7), m7
    mova     Z(4), m4
    mova     Z(5), m5
    mova     Z(6), m6
    mova     Z(7), m7
    ret
%endmacro

; 3DNow!-ext build: the real pswapd instruction is available
FFT48_3DN _3dn2

; plain-3DNow build: emulate pswapd (swap the two dwords of a 3DNow reg);
; when src==dst it spills through memory, using the dword at [r0+12] as
; scratch (note: clobbers that location before the punpckhdq reloads it)
%macro pswapd 2
%ifidn %1, %2
    movd     [r0+12], %1
    punpckhdq %1, [r0+8]
%else
    movq     %1, %2
    psrlq    %1, 32
    punpckldq %1, %2
%endif
%endmacro

FFT48_3DN _3dn


; block addressing for the pass loops: x&1 selects the low/high
; register-width half; blocks 0..5 are offset by o1q*(x&6) (the (x/6)^1
; factor zeroes that term for x>=6), blocks 6..7 by o3q instead
%define Z(x) [zq + o1q*(x&6)*((x/6)^1) + o3q*(x/6) + mmsize*(x&1)]

; define a pass function %1 whose loop body is %2
; args (DEFINE_ARGS): z = data, w = twiddles, n = count; o1q = 8*n and
; o3q = 48*n are derived block strides for the Z() macro above
%macro DECL_PASS 2+ ; name, payload
align 16
%1:
DEFINE_ARGS z, w, n, o1, o3
    lea o3q, [nq*3]
    lea o1q, [nq*8]
    shl o3q, 4
.loop:
    %2
    add zq, mmsize*2
    add wq, mmsize
    sub nd, mmsize/8
    jg .loop
    rep ret ; 2-byte return; preferred over a bare ret after a branch
%endmacro

INIT_XMM
DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0

INIT_MMX
; map the SSE mnemonics used inside PASS_* onto their 3DNow/MMX equivalents
%define mulps pfmul
%define addps pfadd
%define subps pfsub
%define unpcklps punpckldq
%define unpckhps punpckhdq
DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q]
DECL_PASS pass_interleave_3dn, PASS_BIG 0
%define pass_3dn2 pass_3dn
%define pass_interleave_3dn2 pass_interleave_3dn


; generate fft_N functions for one cpu variant via split-radix recursion:
; fftN = fftN/2 on the first half + two fftN/4 on the quarters, followed by
; a jump into the combining pass; also emits the dispatch table and the
; fft_dispatch entry point indexed by nbits
%macro DECL_FFT 2-3 ; nbits, cpu, suffix
%xdefine list_of_fft fft4%2, fft8%2
%if %1==5
%xdefine list_of_fft list_of_fft, fft16%2
%endif

%assign n 1<<%1
%rep 17-%1
%assign n2 n/2
%assign n4 n/4
%xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2

align 16
fft %+ n %+ %3%2:
    call fft %+ n2 %+ %2
    add r0, n*4 - (n&(-2<<%1))
    call fft %+ n4 %+ %2
    add r0, n*2 - (n2&(-2<<%1))
    call fft %+ n4 %+ %2
    sub r0, n*6 + (n2&(-2<<%1))
    lea r1, [ff_cos_ %+ n GLOBAL]
    mov r2d, n4/2
    jmp pass%3%2

%assign n n*2
%endrep
%undef n

; the pointer table goes in .rodata on macho64 (keeps .text free of
; data relocations on that format)
%ifidn __OUTPUT_FORMAT__,macho64
section .rodata
%endif

align 8
dispatch_tab%3%2: pointer list_of_fft

section .text

; On x86_32, this function does the register saving and restoring for all of fft.
; The others pass args in registers and don't spill anything.
cglobal fft_dispatch%3%2, 2,5,0, z, nbits
    lea r2, [dispatch_tab%3%2 GLOBAL]
    mov r2, [r2 + (nbitsq-2)*gprsize]
    call r2
    RET
%endmacro ; DECL_FFT

DECL_FFT 5, _sse
DECL_FFT 5, _sse, _interleave
DECL_FFT 4, _3dn
DECL_FFT 4, _3dn, _interleave
DECL_FFT 4, _3dn2
DECL_FFT 4, _3dn2, _interleave