;******************************************************************************
;* 32 point SSE-optimized DCT transform
;* Copyright (c) 2010 Vitor Sessak
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; Syntax: NASM/Yasm (Intel operand order) on top of the x86inc.asm macro layer.
; Register names m0..m15, the named arguments outq/inq/tmpd, and the helpers
; cglobal, SWAP, PERMUTE, TRANSPOSE4x4PS, INIT_XMM/INIT_YMM and RET are not
; defined in this file; they come from the two includes below.

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 32

; Multiplier table, four floats (16 bytes) per row.  Byte offsets used below:
;   +0   / +16 / +32 / +48 : pass 1
;   +64  / +80             : pass 2
;   +96                    : pass 3 (the same four values appear twice so that
;                            a 32-byte AVX load sees them in both lanes)
;   +128                   : pass 4 (duplicated likewise)
;   +160                   : pass 5 (duplicated likewise)
;   +192                   : 0.707107 x4, used by BUTTERFLY3V
align 32
ps_cos_vec: dd   0.500603,  0.505471,  0.515447,  0.531043
            dd   0.553104,  0.582935,  0.622504,  0.674808
            dd -10.190008, -3.407609, -2.057781, -1.484165
            dd  -1.169440, -0.972568, -0.839350, -0.744536
            dd   0.502419,  0.522499,  0.566944,  0.646822
            dd   0.788155,  1.060678,  1.722447,  5.101149
            dd   0.509796,  0.601345,  0.899976,  2.562916
            dd   0.509796,  0.601345,  0.899976,  2.562916
            dd   1.000000,  1.000000,  1.306563,  0.541196
            dd   1.000000,  1.000000,  1.306563,  0.541196
            dd   1.000000,  0.707107,  1.000000, -0.707107
            dd   1.000000,  0.707107,  1.000000, -0.707107
            dd   0.707107,  0.707107,  0.707107,  0.707107

; XOR mask that flips the sign bit of elements 2 and 3 of every 128-bit lane
; (+,+,-,-).  Pass 5 reshuffles it with 0xcc into a (+,-,+,-) mask.
align 32
ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000

;------------------------------------------------------------------------------
; BUTTERFLY_SSE a, b, c, tmp
;   a   <- (a - b) * c
;   b   <-  a + b
;   tmp is clobbered (left holding the original a).
;------------------------------------------------------------------------------
%macro BUTTERFLY_SSE 4
    movaps %4, %1
    subps  %1, %2
    addps  %2, %4
    mulps  %1, %3
%endmacro

;------------------------------------------------------------------------------
; BUTTERFLY_AVX a, b, c, tmp
;   Same results in a and b as BUTTERFLY_SSE, using three-operand AVX forms.
;   tmp is clobbered (left holding a - b).
;------------------------------------------------------------------------------
%macro BUTTERFLY_AVX 4
    vsubps  %4, %1, %2
    vaddps  %2, %2, %1
    vmulps  %1, %4, %3
%endmacro

;------------------------------------------------------------------------------
; BUTTERFLY0_SSE a, signmask, c, tmp, imm8
;   a   <- (shuffle(a, imm8) + (a XOR signmask)) * c
;   tmp is clobbered.  SSE1-only: the shuffle is done in place with shufps.
;------------------------------------------------------------------------------
%macro BUTTERFLY0_SSE 5
    movaps %4, %1
    shufps %1, %1, %5
    xorps  %4, %2
    addps  %1, %4
    mulps  %1, %3
%endmacro

;------------------------------------------------------------------------------
; BUTTERFLY0_SSE2 a, signmask, c, tmp, imm8
;   Same result in a as BUTTERFLY0_SSE; pshufd shuffles straight into tmp and
;   so saves the register copy.  tmp is clobbered.
;------------------------------------------------------------------------------
%macro BUTTERFLY0_SSE2 5
    pshufd %4, %1, %5
    xorps  %1, %2
    addps  %1, %4
    mulps  %1, %3
%endmacro

;------------------------------------------------------------------------------
; BUTTERFLY0_AVX a, signmask, c, tmp, imm8
;   Same result in a as BUTTERFLY0_SSE, using three-operand AVX forms.
;   tmp is clobbered.
;------------------------------------------------------------------------------
%macro BUTTERFLY0_AVX 5
    vshufps %4, %1, %1, %5
    vxorps  %1, %1, %2
    vaddps  %4, %4, %1
    vmulps  %1, %4, %3
%endmacro

; BUTTERFLY2 a, signmask, c, tmp
;   BUTTERFLY0 with shuffle 0x1b (reverses the four elements of each lane).
;   Expands through whichever BUTTERFLY0 is %defined at the point of use.
%macro BUTTERFLY2 4
    BUTTERFLY0 %1, %2, %3, %4, 0x1b
%endmacro

; BUTTERFLY3 a, signmask, c, tmp
;   BUTTERFLY0 with shuffle 0xb1 (swaps the elements of each adjacent pair).
%macro BUTTERFLY3 4
    BUTTERFLY0 %1, %2, %3, %4, 0xb1
%endmacro

;------------------------------------------------------------------------------
; BUTTERFLY3V r1, r2, r3, r4, rtmp      (arguments are register NUMBERS)
;   m<r1> <-  m<r1> + m<r2>
;   m<r2> <- (m<r1> - m<r2>) * 0.707107   (ends up in the former rtmp register;
;                                          SWAP renames r2 <-> rtmp)
;   m<r3> <-  m<r3> + m<r4>
;   m<r4> <- (m<r4> - m<r3>) * 0.707107   (note the reversed operand order
;                                          compared with the first pair)
;   After the macro the name rtmp refers to a scratch register.
;------------------------------------------------------------------------------
%macro BUTTERFLY3V 5
    movaps m%5, m%1
    addps  m%1, m%2
    subps  m%5, m%2
    SWAP %2, %5
    mulps  m%2, [ps_cos_vec+192]
    movaps m%5, m%3
    addps  m%3, m%4
    subps  m%4, m%5
    mulps  m%4, [ps_cos_vec+192]
%endmacro

;------------------------------------------------------------------------------
; PASS6_AND_PERMUTE
;   Scalar (movss/addss) final pass that works in place on the 32 floats at
;   outq, combining neighbouring values and moving them to their final slots.
;   In:    outq = output buffer already holding the pass-5 results;
;          the low element of m0, m1, m4, m5 and m6 is read before being
;          overwritten, so those registers must be live on entry.
;   Clobb: m0-m7, tmpd.
;   The sequence interleaves loads and stores to the same buffer; several
;   slots are read just before they are overwritten, so the instruction
;   order must not be changed.
;------------------------------------------------------------------------------
%macro PASS6_AND_PERMUTE 0
    mov         tmpd, [outq+4]
    movss         m7, [outq+72]
    addss         m7, [outq+76]
    movss         m3, [outq+56]
    addss         m3, [outq+60]
    addss         m4, m3
    movss         m2, [outq+52]
    addss         m2, m3
    movss         m3, [outq+104]
    addss         m3, [outq+108]
    addss         m1, m3
    addss         m5, m4
    movss [outq+ 16], m1
    movss         m1, [outq+100]
    addss         m1, m3
    movss         m3, [outq+40]
    movss [outq+ 48], m1
    addss         m3, [outq+44]
    movss         m1, [outq+100]
    addss         m4, m3
    addss         m3, m2
    addss         m1, [outq+108]
    movss [outq+ 40], m3
    addss         m2, [outq+36]
    movss         m3, [outq+8]
    movss [outq+ 56], m2
    addss         m3, [outq+12]
    movss [outq+ 32], m3
    movss         m3, [outq+80]
    movss [outq+  8], m5
    movss [outq+ 80], m1
    movss         m2, [outq+52]
    movss         m5, [outq+120]
    addss         m5, [outq+124]
    movss         m1, [outq+64]
    addss         m2, [outq+60]
    addss         m0, m5
    addss         m5, [outq+116]
    mov    [outq+64], tmpd
    addss         m6, m0
    addss         m1, m6
    mov         tmpd, [outq+12]
    mov   [outq+ 96], tmpd
    movss [outq+  4], m1
    movss         m1, [outq+24]
    movss [outq+ 24], m4
    movss         m4, [outq+88]
    addss         m4, [outq+92]
    addss         m3, m4
    addss         m4, [outq+84]
    mov         tmpd, [outq+108]
    addss         m1, [outq+28]
    addss         m0, m1
    addss         m1, m5
    addss         m6, m3
    addss         m3, m0
    addss         m0, m7
    addss         m5, [outq+20]
    addss         m7, m1
    movss [outq+ 12], m6
    mov   [outq+112], tmpd
    movss         m6, [outq+28]
    movss [outq+ 28], m0
    movss         m0, [outq+36]
    movss [outq+ 36], m7
    addss         m1, m4
    movss         m7, [outq+116]
    addss         m0, m2
    addss         m7, [outq+124]
    movss [outq+ 72], m0
    movss         m0, [outq+44]
    addss         m2, m0
    movss [outq+ 44], m1
    movss [outq+ 88], m2
    addss         m0, [outq+60]
    mov         tmpd, [outq+60]
    mov   [outq+120], tmpd
    movss [outq+104], m0
    addss         m4, m5
    addss         m5, [outq+68]
    movss  [outq+52], m4
    movss  [outq+60], m5
    movss         m4, [outq+68]
    movss         m5, [outq+20]
    movss [outq+ 20], m3
    addss         m5, m7
    addss         m7, m6
    addss         m4, m5
    movss         m2, [outq+84]
    addss         m2, [outq+92]
    addss         m5, m2
    movss [outq+ 68], m4
    addss         m2, m7
    movss         m4, [outq+76]
    movss [outq+ 84], m2
    movss [outq+ 76], m5
    addss         m7, m4
    addss         m6, [outq+124]
    addss         m4, m6
    addss         m6, [outq+92]
    movss [outq+100], m4
    movss [outq+108], m6
    movss         m6, [outq+92]
    movss  [outq+92], m7
    addss         m6, [outq+124]
    movss [outq+116], m6
%endmacro

; Select the AVX butterfly flavours for the function below.
%define BUTTERFLY  BUTTERFLY_AVX
%define BUTTERFLY0 BUTTERFLY0_AVX

INIT_YMM
SECTION_TEXT
%ifdef HAVE_AVX
;------------------------------------------------------------------------------
; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
; In:    inq  = 32 input floats, outq = 32 output floats.
;        vmovaps is used with 256-bit operands on both buffers, so both must
;        be 32-byte aligned.
; Clobb: m0-m7, tmpd.
; Passes 1-5 run on 256-bit registers; pass 6 is the shared scalar macro and
; runs after vzeroupper with the register aliases switched back to XMM.
;------------------------------------------------------------------------------
cglobal dct32_float_avx, 2,3,8, out, in, tmp
    ; pass 1
    vmovaps     m4, [inq+0]
    ; build in[24..31] with the two 128-bit halves exchanged, then reverse
    ; each half, giving in[31..24]; both halves of m5 are overwritten, so its
    ; previous contents do not matter
    vinsertf128 m5, m5, [inq+96], 1
    vinsertf128 m5, m5, [inq+112], 0
    vshufps     m5, m5, m5, 0x1b
    BUTTERFLY   m4, m5, [ps_cos_vec], m6

    vmovaps     m2, [inq+64]
    vinsertf128 m6, m6, [inq+32], 1
    vinsertf128 m6, m6, [inq+48], 0
    vshufps     m6, m6, m6, 0x1b
    BUTTERFLY   m2, m6, [ps_cos_vec+32], m0

    ; pass 2

    BUTTERFLY   m5, m6, [ps_cos_vec+64], m0
    BUTTERFLY   m4, m2, [ps_cos_vec+64], m7


    ; pass 3
    vperm2f128  m3, m6, m4, 0x31
    vperm2f128  m1, m6, m4, 0x20
    vshufps     m3, m3, m3, 0x1b

    BUTTERFLY   m1, m3, [ps_cos_vec+96], m6


    vperm2f128  m4, m5, m2, 0x20
    vperm2f128  m5, m5, m2, 0x31
    vshufps     m5, m5, m5, 0x1b

    BUTTERFLY   m4, m5, [ps_cos_vec+96], m6

    ; pass 4
    vmovaps     m6, [ps_p1p1m1m1+0]
    vmovaps     m2, [ps_cos_vec+128]

    BUTTERFLY2  m5, m6, m2, m7
    BUTTERFLY2  m4, m6, m2, m7
    BUTTERFLY2  m1, m6, m2, m7
    BUTTERFLY2  m3, m6, m2, m7


    ; pass 5
    vshufps     m6, m6, m6, 0xcc        ; sign mask (+,+,-,-) -> (+,-,+,-)
    vmovaps     m2, [ps_cos_vec+160]

    BUTTERFLY3  m5, m6, m2, m7
    BUTTERFLY3  m4, m6, m2, m7
    BUTTERFLY3  m1, m6, m2, m7
    BUTTERFLY3  m3, m6, m2, m7

    ; store everything to out; the upper lanes of m3 and m1 are also copied
    ; into the low lanes of m6 and m0, which pass 6 reads as xmm6 / xmm0
    vperm2f128  m6, m3, m3, 0x31
    vmovaps     [outq], m3

    vextractf128  [outq+64], m5, 1
    vextractf128  [outq+32], m5, 0

    vextractf128  [outq+80], m4, 1
    vextractf128  [outq+48], m4, 0

    vperm2f128  m0, m1, m1, 0x31
    vmovaps     [outq+96], m1

    vzeroupper

    ;    pass 6, no SIMD...
INIT_XMM
    PASS6_AND_PERMUTE
    RET
%endif

; From here on the legacy SSE butterfly flavours are used.
%define BUTTERFLY  BUTTERFLY_SSE
%define BUTTERFLY0 BUTTERFLY0_SSE

%ifdef ARCH_X86_64
; With 16 XMM registers available, "spilling" is just a register rename.
%define SPILL   SWAP
%define UNSPILL SWAP

;------------------------------------------------------------------------------
; PASS5 (x86-64)
;   Gathers the eight pass-4 vectors into m8-m15, transposes them as two 4x4
;   blocks and applies BUTTERFLY3V plus the following additions to each block.
;   Uses m0 as scratch for the transposes and butterflies.
;------------------------------------------------------------------------------
%macro PASS5 0
    nop ; FIXME code alignment
    SWAP 5, 8
    SWAP 4, 12
    SWAP 6, 14
    SWAP 7, 13
    SWAP 0, 15
    PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13
    TRANSPOSE4x4PS 8, 9, 10, 11, 0
    BUTTERFLY3V    8, 9, 10, 11, 0
    addps   m10, m11
    TRANSPOSE4x4PS 12, 13, 14, 15, 0
    BUTTERFLY3V    12, 13, 14, 15, 0
    addps   m14, m15
    addps   m12, m14
    addps   m14, m13
    addps   m13, m15
%endmacro

;------------------------------------------------------------------------------
; PASS6 (x86-64)
;   Final pass done from registers: writes all 32 output floats to outq.
;   In:    m8-m15 as left by PASS5.
;   Clobb: m0-m15.
;   Uses pshuflw/pshufd (SSE2 instructions) in both the sse and sse2 builds of
;   the function.
;------------------------------------------------------------------------------
%macro PASS6 0
    SWAP 9, 12
    SWAP 11, 14
    ; element 0 of each vector goes to out[0], out[4], ... ;
    ; pshuflw ..., 0xe copies element 1 into element 0 of m0-m7
    movss  [outq+0x00], m8
    pshuflw m0, m8, 0xe
    movss  [outq+0x10], m9
    pshuflw m1, m9, 0xe
    movss  [outq+0x20], m10
    pshuflw m2, m10, 0xe
    movss  [outq+0x30], m11
    pshuflw m3, m11, 0xe
    movss  [outq+0x40], m12
    pshuflw m4, m12, 0xe
    movss  [outq+0x50], m13
    pshuflw m5, m13, 0xe
    movss  [outq+0x60], m14
    pshuflw m6, m14, 0xe
    ; full 16-byte store: this is the only write to out[31] (offset 0x7c);
    ; offsets 0x74 and 0x78 are overwritten further down
    movaps [outq+0x70], m15
    pshuflw m7, m15, 0xe
    addss   m0, m1
    addss   m1, m2
    movss  [outq+0x08], m0
    addss   m2, m3
    movss  [outq+0x18], m1
    addss   m3, m4
    movss  [outq+0x28], m2
    addss   m4, m5
    movss  [outq+0x38], m3
    addss   m5, m6
    movss  [outq+0x48], m4
    addss   m6, m7
    movss  [outq+0x58], m5
    movss  [outq+0x68], m6
    movss  [outq+0x78], m7

    PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7
    ; pull elements 2 and 3 of each remaining vector down to element 0
    ; (movhlps -> element 2, pshufd ..., 3 -> element 3), rotating the
    ; register names after every step
    movhlps m0, m1
    pshufd  m1, m1, 3
    SWAP 0, 2, 4, 6, 8, 10, 12, 14
    SWAP 1, 3, 5, 7, 9, 11, 13, 15
%rep 7
    movhlps m0, m1
    pshufd  m1, m1, 3
    addss   m15, m1
    SWAP 0, 2, 4, 6, 8, 10, 12, 14
    SWAP 1, 3, 5, 7, 9, 11, 13, 15
%endrep
    ; sum neighbouring values and store them at byte offsets 4, 12, ..., 116
%assign i 4
%rep 15
    addss   m0, m1
    movss  [outq+i], m0
    SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    %assign i i+8
%endrep
%endmacro

%else ; ARCH_X86_32
; Only eight XMM registers: virtual registers 8..15 live in the output buffer,
; 16 bytes each, at outq + (mempos - 8) * 16.
%macro SPILL 2 ; xmm#, mempos
    movaps [outq+(%2-8)*16], m%1
%endmacro
%macro UNSPILL 2
    movaps m%1, [outq+(%2-8)*16]
%endmacro

%define PASS6 PASS6_AND_PERMUTE
;------------------------------------------------------------------------------
; PASS5 (x86-32)
;   Applies BUTTERFLY3 to all eight pass-4 vectors, reloading the spilled ones
;   from outq, and leaves every result spilled to slots 8..15 (the whole
;   128-byte output buffer) for PASS6_AND_PERMUTE.
;   In:    m3 = ps_p1p1m1m1 sign mask as loaded in pass 4.
;------------------------------------------------------------------------------
%macro PASS5 0
    movaps m2, [ps_cos_vec+160]
    shufps m3, m3, 0xcc                 ; sign mask (+,+,-,-) -> (+,-,+,-)

    BUTTERFLY3 m5, m3, m2, m1
    SPILL 5, 8

    UNSPILL 1, 9
    BUTTERFLY3 m1, m3, m2, m5
    SPILL 1, 14

    BUTTERFLY3 m4, m3, m2, m5
    SPILL 4, 12

    BUTTERFLY3 m7, m3, m2, m5
    SPILL 7, 13

    UNSPILL 5, 10
    BUTTERFLY3 m5, m3, m2, m7
    SPILL 5, 10

    UNSPILL 4, 11
    BUTTERFLY3 m4, m3, m2, m7
    SPILL 4, 11

    BUTTERFLY3 m6, m3, m2, m7
    SPILL 6, 9

    BUTTERFLY3 m0, m3, m2, m7
    SPILL 0, 15
%endmacro
%endif


INIT_XMM
;------------------------------------------------------------------------------
; DCT32_FUNC suffix
;   Emits dct32_float_<suffix>.  LOAD_INV and BUTTERFLY0 must be %defined to
;   the wanted instruction-set flavour before the macro is invoked; SPILL,
;   UNSPILL, PASS5 and PASS6 come from the architecture block above.
;
; In:    inq  = 32 input floats, outq = 32 output floats.
;        movaps is used on both buffers, so both must be 16-byte aligned.
; Clobb: m0-m7 and tmpd on x86-32; m0-m15 on x86-64.
; NOTE(review): on x86-32, SPILL writes to outq before the second half of the
;   input has been loaded, so out == in presumably is not supported there;
;   confirm against the callers.
;------------------------------------------------------------------------------
%macro DCT32_FUNC 1
; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
cglobal dct32_float_%1, 2,3,16, out, in, tmp
    ; pass 1

    movaps      m0, [inq+0]
    LOAD_INV    m1, [inq+112]
    BUTTERFLY   m0, m1, [ps_cos_vec], m3

    movaps      m7, [inq+64]
    LOAD_INV    m4, [inq+48]
    BUTTERFLY   m7, m4, [ps_cos_vec+32], m3

    ; pass 2
    movaps      m2, [ps_cos_vec+64]
    BUTTERFLY   m1, m4, m2, m3
    SPILL 1, 11
    SPILL 4, 8

    ; pass 1
    movaps      m1, [inq+16]
    LOAD_INV    m6, [inq+96]
    BUTTERFLY   m1, m6, [ps_cos_vec+16], m3

    movaps      m4, [inq+80]
    LOAD_INV    m5, [inq+32]
    BUTTERFLY   m4, m5, [ps_cos_vec+48], m3

    ; pass 2
    BUTTERFLY   m0, m7, m2, m3

    movaps      m2, [ps_cos_vec+80]
    BUTTERFLY   m6, m5, m2, m3

    BUTTERFLY   m1, m4, m2, m3

    ; pass 3
    movaps      m2, [ps_cos_vec+96]
    shufps      m1, m1, 0x1b
    BUTTERFLY   m0, m1, m2, m3
    SPILL 0, 15
    SPILL 1, 14

    UNSPILL 0, 8
    shufps      m5, m5, 0x1b
    BUTTERFLY   m0, m5, m2, m3

    UNSPILL 1, 11
    shufps      m6, m6, 0x1b
    BUTTERFLY   m1, m6, m2, m3
    SPILL 1, 11

    shufps      m4, m4, 0x1b
    BUTTERFLY   m7, m4, m2, m3

    ; pass 4
    movaps      m3, [ps_p1p1m1m1+0]
    movaps      m2, [ps_cos_vec+128]

    BUTTERFLY2  m5, m3, m2, m1

    BUTTERFLY2  m0, m3, m2, m1
    SPILL 0, 9

    BUTTERFLY2  m6, m3, m2, m1
    SPILL 6, 10

    UNSPILL 0, 11
    BUTTERFLY2  m0, m3, m2, m1
    SPILL 0, 11

    BUTTERFLY2  m4, m3, m2, m1

    BUTTERFLY2  m7, m3, m2, m1

    UNSPILL 6, 14
    BUTTERFLY2  m6, m3, m2, m1

    UNSPILL 0, 15
    BUTTERFLY2  m0, m3, m2, m1

    PASS5
    PASS6
    RET
%endmacro

; LOAD_INV_SSE dst, src: load four floats and reverse their order (SSE1 only).
%macro LOAD_INV_SSE 2
    movaps      %1, %2
    shufps      %1, %1, 0x1b
%endmacro

%define LOAD_INV LOAD_INV_SSE
DCT32_FUNC sse

; LOAD_INV_SSE2 dst, src: same result as LOAD_INV_SSE in a single pshufd.
%macro LOAD_INV_SSE2 2
    pshufd      %1, %2, 0x1b
%endmacro

%define LOAD_INV LOAD_INV_SSE2
%define BUTTERFLY0 BUTTERFLY0_SSE2
DCT32_FUNC sse2