;******************************************************************************
;* 32 point SSE-optimized DCT transform
;* Copyright (c) 2010 Vitor Sessak
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

; Table of single-precision multipliers, four per row (16 bytes per row),
; addressed below by byte offset (ps_cos_vec+16, +32, ... +192).
; The row pairs at offsets 96/112, 128/144 and 160/176 are identical, so a
; 32-byte (YMM) load from +96, +128 or +160 sees the same four values in both
; 128-bit halves. The last row (+192) is 0.707107 in every lane.
align 32
ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043
 dd 0.553104, 0.582935, 0.622504, 0.674808
 dd -10.190008, -3.407609, -2.057781, -1.484165
 dd -1.169440, -0.972568, -0.839350, -0.744536
 dd 0.502419, 0.522499, 0.566944, 0.646822
 dd 0.788155, 1.060678, 1.722447, 5.101149
 dd 0.509796, 0.601345, 0.899976, 2.562916
 dd 0.509796, 0.601345, 0.899976, 2.562916
 dd 1.000000, 1.000000, 1.306563, 0.541196
 dd 1.000000, 1.000000, 1.306563, 0.541196
 dd 1.000000, 0.707107, 1.000000, -0.707107
 dd 1.000000, 0.707107, 1.000000, -0.707107
 dd 0.707107, 0.707107, 0.707107, 0.707107

; Sign-bit mask: bit 31 is set in lanes 2,3 (and 6,7). XORing packed floats
; with it negates those lanes and leaves lanes 0,1 (and 4,5) untouched.
align 32
ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000

; BUTTERFLY a, b, coef, tmp
;   tmp = a - b
;   b   = b + a
;   a   = tmp * coef
; On exit %2 holds the sum and %1 the scaled difference; %4 is clobbered.
%macro BUTTERFLY 4
 subps %4, %1, %2
 addps %2, %2, %1
 mulps %1, %4, %3
%endmacro

; BUTTERFLY0 a, signmask, coef, tmp, shuffle_imm
;   tmp = lanes of a permuted by shuffle_imm
;   a   = a XOR signmask
;   a   = (a + tmp) * coef
; Both branches leave the same value in %1; they differ only in instruction
; form (two-operand pshufd path for SSE2-without-AVX, three-operand shufps
; path otherwise) and in what is left behind in the scratch register %4.
%macro BUTTERFLY0 5
%if cpuflag(sse2) && notcpuflag(avx)
 pshufd %4, %1, %5
 xorps %1, %2
 addps %1, %4
 mulps %1, %3
%else
 shufps %4, %1, %1, %5
 xorps %1, %1, %2
 addps %4, %4, %1
 mulps %1, %4, %3
%endif
%endmacro

; BUTTERFLY0 with shuffle 0x1b: lane order reversed (3,2,1,0) within each
; 128-bit lane.
%macro BUTTERFLY2 4
 BUTTERFLY0 %1, %2, %3, %4, 0x1b
%endmacro

; BUTTERFLY0 with shuffle 0xb1: adjacent lanes swapped (1,0,3,2).
%macro BUTTERFLY3 4
 BUTTERFLY0 %1, %2, %3, %4, 0xb1
%endmacro

; BUTTERFLY3V r1, r2, r3, r4, tmp   (arguments are register NUMBERS, not names)
; Whole-register butterflies against the 0.707107 row (ps_cos_vec+192):
;   first pair : m%1 = m%1 + m%2 ; (m%1 - m%2) * 0.707107 ends up named m%2
;                (the difference is computed in tmp and then renamed via SWAP)
;   second pair: m%3 = m%3 + m%4 ; m%4 = (m%4 - old m%3) * 0.707107
; Note the second difference is taken in the opposite order to the first.
%macro BUTTERFLY3V 5
 movaps m%5, m%1
 addps m%1, m%2
 subps m%5, m%2
 SWAP %2, %5
 mulps m%2, [ps_cos_vec+192]
 movaps m%5, m%3
 addps m%3, m%4
 subps m%4, m%5
 mulps m%4, [ps_cos_vec+192]
%endmacro

; PASS6_AND_PERMUTE
; Final pass done entirely with scalar movss/addss plus a 32-bit integer
; temporary (tmpd), working in place on the 32-float buffer at outq.
; On entry the low lanes of m0, m1, m4, m5 and m6 must be live: each is
; accumulated into before it is first loaded. m2, m3 and m7 are loaded before
; use. Many loads read buffer slots that later stores overwrite, so the
; instruction order is significant and must not be changed.
%macro PASS6_AND_PERMUTE 0
 mov tmpd, [outq+4]
 movss m7, [outq+72]
 addss m7, [outq+76]
 movss m3, [outq+56]
 addss m3, [outq+60]
 addss m4, m3
 movss m2, [outq+52]
 addss m2, m3
 movss m3, [outq+104]
 addss m3, [outq+108]
 addss m1, m3
 addss m5, m4
 movss [outq+ 16], m1
 movss m1, [outq+100]
 addss m1, m3
 movss m3, [outq+40]
 movss [outq+ 48], m1
 addss m3, [outq+44]
 movss m1, [outq+100]
 addss m4, m3
 addss m3, m2
 addss m1, [outq+108]
 movss [outq+ 40], m3
 addss m2, [outq+36]
 movss m3, [outq+8]
 movss [outq+ 56], m2
 addss m3, [outq+12]
 movss [outq+ 32], m3
 movss m3, [outq+80]
 movss [outq+ 8], m5
 movss [outq+ 80], m1
 movss m2, [outq+52]
 movss m5, [outq+120]
 addss m5, [outq+124]
 movss m1, [outq+64]
 addss m2, [outq+60]
 addss m0, m5
 addss m5, [outq+116]
 mov [outq+64], tmpd
 addss m6, m0
 addss m1, m6
 mov tmpd, [outq+12]
 mov [outq+ 96], tmpd
 movss [outq+ 4], m1
 movss m1, [outq+24]
 movss [outq+ 24], m4
 movss m4, [outq+88]
 addss m4, [outq+92]
 addss m3, m4
 addss m4, [outq+84]
 mov tmpd, [outq+108]
 addss m1, [outq+28]
 addss m0, m1
 addss m1, m5
 addss m6, m3
 addss m3, m0
 addss m0, m7
 addss m5, [outq+20]
 addss m7, m1
 movss [outq+ 12], m6
 mov [outq+112], tmpd
 movss m6, [outq+28]
 movss [outq+ 28], m0
 movss m0, [outq+36]
 movss [outq+ 36], m7
 addss m1, m4
 movss m7, [outq+116]
 addss m0, m2
 addss m7, [outq+124]
 movss [outq+ 72], m0
 movss m0, [outq+44]
 addss m2, m0
 movss [outq+ 44], m1
 movss [outq+ 88], m2
 addss m0, [outq+60]
 mov tmpd, [outq+60]
 mov [outq+120], tmpd
 movss [outq+104], m0
 addss m4, m5
 addss m5, [outq+68]
 movss [outq+52], m4
 movss [outq+60], m5
 movss m4, [outq+68]
 movss m5, [outq+20]
 movss [outq+ 20], m3
 addss m5, m7
 addss m7, m6
 addss m4, m5
 movss m2, [outq+84]
 addss m2, [outq+92]
 addss m5, m2
 movss [outq+ 68], m4
 addss m2, m7
 movss m4, [outq+76]
 movss [outq+ 84], m2
 movss [outq+ 76], m5
 addss m7, m4
 addss m6, [outq+124]
 addss m4, m6
 addss m6, [outq+92]
 movss [outq+100], m4
 movss [outq+108], m6
 movss m6, [outq+92]
 movss [outq+92], m7
 addss m6, [outq+124]
 movss [outq+116], m6
%endmacro

; ---------------------------------------------------------------------------
; AVX version: passes 1-5 on 256-bit registers, pass 6 scalar.
; Only assembled when HAVE_AVX_EXTERNAL is set.
; ---------------------------------------------------------------------------
INIT_YMM avx
SECTION_TEXT
%if HAVE_AVX_EXTERNAL
; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
cglobal dct32_float, 2,3,8, out, in, tmp
 ; pass 1
 ; m4 = in[0..7]; m5 = in[24..31] in fully reversed order: the two 128-bit
 ; halves are inserted swapped, then 0x1b reverses the lanes inside each half.
 vmovaps m4, [inq+0]
 vinsertf128 m5, m5, [inq+96], 1
 vinsertf128 m5, m5, [inq+112], 0
 vshufps m5, m5, m5, 0x1b
 BUTTERFLY m4, m5, [ps_cos_vec], m6

 ; Same construction for in[16..23] against reversed in[8..15].
 vmovaps m2, [inq+64]
 vinsertf128 m6, m6, [inq+32], 1
 vinsertf128 m6, m6, [inq+48], 0
 vshufps m6, m6, m6, 0x1b
 BUTTERFLY m2, m6, [ps_cos_vec+32], m0

 ; pass 2

 BUTTERFLY m5, m6, [ps_cos_vec+64], m0
 BUTTERFLY m4, m2, [ps_cos_vec+64], m7


 ; pass 3
 ; Regroup 128-bit halves across register pairs (0x31 = both high halves,
 ; 0x20 = both low halves), reversing lane order in one operand of each pair.
 vperm2f128 m3, m6, m4, 0x31
 vperm2f128 m1, m6, m4, 0x20
 vshufps m3, m3, m3, 0x1b

 BUTTERFLY m1, m3, [ps_cos_vec+96], m6


 vperm2f128 m4, m5, m2, 0x20
 vperm2f128 m5, m5, m2, 0x31
 vshufps m5, m5, m5, 0x1b

 BUTTERFLY m4, m5, [ps_cos_vec+96], m6

 ; pass 4
 ; m6 = sign mask (negates lanes 2,3 of each half), m2 = multipliers.
 vmovaps m6, [ps_p1p1m1m1+0]
 vmovaps m2, [ps_cos_vec+128]

 BUTTERFLY2 m5, m6, m2, m7
 BUTTERFLY2 m4, m6, m2, m7
 BUTTERFLY2 m1, m6, m2, m7
 BUTTERFLY2 m3, m6, m2, m7


 ; pass 5
 ; Shuffle 0xcc picks lanes 0,3,0,3 of the mask, turning (+,+,-,-) into
 ; (+,-,+,-) so that odd lanes are the ones negated.
 vshufps m6, m6, m6, 0xcc
 vmovaps m2, [ps_cos_vec+160]

 BUTTERFLY3 m5, m6, m2, m7
 BUTTERFLY3 m4, m6, m2, m7
 BUTTERFLY3 m1, m6, m2, m7
 BUTTERFLY3 m3, m6, m2, m7

 ; Write the pass-5 results to out[] and leave the values that
 ; PASS6_AND_PERMUTE expects in the low lanes of m0, m1, m4, m5, m6:
 ; m6 and m0 receive the high halves of m3 and m1 respectively.
 vperm2f128 m6, m3, m3, 0x31
 vmovaps [outq], m3

 vextractf128 [outq+64], m5, 1
 vextractf128 [outq+32], m5, 0

 vextractf128 [outq+80], m4, 1
 vextractf128 [outq+48], m4, 0

 vperm2f128 m0, m1, m1, 0x31
 vmovaps [outq+96], m1

 ; Clear the upper YMM halves before the scalar code that follows.
 vzeroupper

 ; pass 6, no SIMD...
INIT_XMM
 PASS6_AND_PERMUTE
 RET
%endif

; ---------------------------------------------------------------------------
; SSE/SSE2 helpers. The 64-bit build keeps everything in m0-m15, so
; SPILL/UNSPILL are plain register renames; the 32-bit build spills to memory.
; ---------------------------------------------------------------------------
%if ARCH_X86_64
%define SPILL SWAP
%define UNSPILL SWAP

; PASS5 (x86_64): rename the live registers into m8-m15, transpose each group
; of four, and run the whole-register butterflies.
%macro PASS5 0
 nop ; FIXME code alignment
 SWAP 5, 8
 SWAP 4, 12
 SWAP 6, 14
 SWAP 7, 13
 SWAP 0, 15
 PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13
 TRANSPOSE4x4PS 8, 9, 10, 11, 0
 BUTTERFLY3V 8, 9, 10, 11, 0
 addps m10, m11
 TRANSPOSE4x4PS 12, 13, 14, 15, 0
 BUTTERFLY3V 12, 13, 14, 15, 0
 addps m14, m15
 addps m12, m14
 addps m14, m13
 addps m13, m15
%endmacro

; PASS6 (x86_64): final additions and stores, all from registers.
%macro PASS6 0
 SWAP 9, 12
 SWAP 11, 14
 ; Lane 0 of m8..m15 goes to byte offsets 0x00, 0x10, ... 0x70.
 ; pshuflw 0xe moves lane 1 of each source into lane 0 of m0..m7.
 movss [outq+0x00], m8
 pshuflw m0, m8, 0xe
 movss [outq+0x10], m9
 pshuflw m1, m9, 0xe
 movss [outq+0x20], m10
 pshuflw m2, m10, 0xe
 movss [outq+0x30], m11
 pshuflw m3, m11, 0xe
 movss [outq+0x40], m12
 pshuflw m4, m12, 0xe
 movss [outq+0x50], m13
 pshuflw m5, m13, 0xe
 movss [outq+0x60], m14
 pshuflw m6, m14, 0xe
 ; Full 16-byte store: offsets 0x74 and 0x78 are overwritten further down
 ; (by the last %rep store and by the movss of m7), 0x70 and 0x7c are kept.
 movaps [outq+0x70], m15
 pshuflw m7, m15, 0xe
 ; Sums of neighbouring lane-1 values go to offsets 0x08, 0x18, ... 0x68;
 ; the last one (m7) is stored unsummed at 0x78.
 addss m0, m1
 addss m1, m2
 movss [outq+0x08], m0
 addss m2, m3
 movss [outq+0x18], m1
 addss m3, m4
 movss [outq+0x28], m2
 addss m4, m5
 movss [outq+0x38], m3
 addss m5, m6
 movss [outq+0x48], m4
 addss m6, m7
 movss [outq+0x58], m5
 movss [outq+0x68], m6
 movss [outq+0x78], m7

 ; Move the old m8..m15 onto the odd register names, then for each of them
 ; split lanes 2 and 3 into the low lane of an (even, odd) register pair:
 ; movhlps brings lanes 2,3 down into the even register, pshufd 3 puts
 ; lane 3 into lane 0 of the odd one. The SWAPs rotate the names so the
 ; same two instructions address the next pair on each repetition.
 PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7
 movhlps m0, m1
 pshufd m1, m1, 3
 SWAP 0, 2, 4, 6, 8, 10, 12, 14
 SWAP 1, 3, 5, 7, 9, 11, 13, 15
%rep 7
 movhlps m0, m1
 pshufd m1, m1, 3
 addss m15, m1
 SWAP 0, 2, 4, 6, 8, 10, 12, 14
 SWAP 1, 3, 5, 7, 9, 11, 13, 15
%endrep
 ; Store sums of consecutive registers to byte offsets 4, 12, ..., 116
 ; (i starts at 4 and advances by 8 for 15 repetitions).
%assign i 4
%rep 15
 addss m0, m1
 movss [outq+i], m0
 SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 %assign i i+8
%endrep
%endmacro

%else ; ARCH_X86_32
; Only m0-m7 are used here, so "register" slots 8..15 live in the output
; buffer itself: slot n is the 16 bytes at outq+(n-8)*16.
%macro SPILL 2 ; xmm#, mempos
 movaps [outq+(%2-8)*16], m%1
%endmacro
%macro UNSPILL 2
 movaps m%1, [outq+(%2-8)*16]
%endmacro

; On x86_32 the final pass is the scalar in-memory one.
%define PASS6 PASS6_AND_PERMUTE

; PASS5 (x86_32): eight BUTTERFLY3 steps, each result spilled to its slot in
; the output buffer. m3 holds the sign mask on entry and is reshuffled with
; 0xcc (lanes 0,3,0,3) to the alternating pattern; m2 holds the multipliers.
%macro PASS5 0
 movaps m2, [ps_cos_vec+160]
 shufps m3, m3, 0xcc

 BUTTERFLY3 m5, m3, m2, m1
 SPILL 5, 8

 UNSPILL 1, 9
 BUTTERFLY3 m1, m3, m2, m5
 SPILL 1, 14

 BUTTERFLY3 m4, m3, m2, m5
 SPILL 4, 12

 BUTTERFLY3 m7, m3, m2, m5
 SPILL 7, 13

 UNSPILL 5, 10
 BUTTERFLY3 m5, m3, m2, m7
 SPILL 5, 10

 UNSPILL 4, 11
 BUTTERFLY3 m4, m3, m2, m7
 SPILL 4, 11

 BUTTERFLY3 m6, m3, m2, m7
 SPILL 6, 9

 BUTTERFLY3 m0, m3, m2, m7
 SPILL 0, 15
%endmacro
%endif


; DCT32_FUNC: body of the SSE/SSE2 function, expanded once per instruction
; set at the bottom of the file. Passes 1 and 2 are interleaved to limit
; register pressure; SPILL/UNSPILL park intermediate vectors in slots 8..15
; (registers on x86_64, the output buffer on x86_32).
; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
%macro DCT32_FUNC 0
cglobal dct32_float, 2, 3, 16, out, in, tmp
 ; pass 1

 movaps m0, [inq+0]
 LOAD_INV m1, [inq+112]
 BUTTERFLY m0, m1, [ps_cos_vec], m3

 movaps m7, [inq+64]
 LOAD_INV m4, [inq+48]
 BUTTERFLY m7, m4, [ps_cos_vec+32], m3

 ; pass 2
 movaps m2, [ps_cos_vec+64]
 BUTTERFLY m1, m4, m2, m3
 SPILL 1, 11
 SPILL 4, 8

 ; pass 1
 movaps m1, [inq+16]
 LOAD_INV m6, [inq+96]
 BUTTERFLY m1, m6, [ps_cos_vec+16], m3

 movaps m4, [inq+80]
 LOAD_INV m5, [inq+32]
 BUTTERFLY m4, m5, [ps_cos_vec+48], m3

 ; pass 2
 BUTTERFLY m0, m7, m2, m3

 movaps m2, [ps_cos_vec+80]
 BUTTERFLY m6, m5, m2, m3

 BUTTERFLY m1, m4, m2, m3

 ; pass 3
 ; One operand of each butterfly is lane-reversed (0x1b) first.
 movaps m2, [ps_cos_vec+96]
 shufps m1, m1, 0x1b
 BUTTERFLY m0, m1, m2, m3
 SPILL 0, 15
 SPILL 1, 14

 UNSPILL 0, 8
 shufps m5, m5, 0x1b
 BUTTERFLY m0, m5, m2, m3

 UNSPILL 1, 11
 shufps m6, m6, 0x1b
 BUTTERFLY m1, m6, m2, m3
 SPILL 1, 11

 shufps m4, m4, 0x1b
 BUTTERFLY m7, m4, m2, m3

 ; pass 4
 ; m3 = sign mask, m2 = multipliers, m1 = scratch for every BUTTERFLY2.
 movaps m3, [ps_p1p1m1m1+0]
 movaps m2, [ps_cos_vec+128]

 BUTTERFLY2 m5, m3, m2, m1

 BUTTERFLY2 m0, m3, m2, m1
 SPILL 0, 9

 BUTTERFLY2 m6, m3, m2, m1
 SPILL 6, 10

 UNSPILL 0, 11
 BUTTERFLY2 m0, m3, m2, m1
 SPILL 0, 11

 BUTTERFLY2 m4, m3, m2, m1

 BUTTERFLY2 m7, m3, m2, m1

 UNSPILL 6, 14
 BUTTERFLY2 m6, m3, m2, m1

 UNSPILL 0, 15
 BUTTERFLY2 m0, m3, m2, m1

 ; PASS5/PASS6 resolve to the x86_64 or x86_32 variants defined above.
 PASS5
 PASS6
 RET
%endmacro

; LOAD_INV dst, src: load four floats from src into dst in reversed lane
; order (shuffle 0x1b). SSE2 does it in one pshufd; plain SSE needs a load
; followed by shufps. Defined after DCT32_FUNC but before its first expansion.
%macro LOAD_INV 2
%if cpuflag(sse2)
 pshufd %1, %2, 0x1b
%elif cpuflag(sse)
 movaps %1, %2
 shufps %1, %1, 0x1b
%endif
%endmacro

; The plain-SSE version is only built for 32-bit x86; the SSE2 version is
; built unconditionally.
%if ARCH_X86_32
INIT_XMM sse
DCT32_FUNC
%endif
INIT_XMM sse2
DCT32_FUNC