;******************************************************************************
;* SSE-optimized functions for the DCA decoder
;* Copyright (C) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pf_inv16:  times 4 dd 0x3D800000 ; 1/16

SECTION_TEXT

; void decode_hf(float dst[DCA_SUBBANDS][8], const int32_t vq_num[DCA_SUBBANDS],
;                const int8_t hf_vq[1024][32], intptr_t vq_offset,
;                int32_t scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end)

%macro DECODE_HF 0
cglobal decode_hf, 6,6,5, dst, num, src, offset, scale, start, end
    lea       srcq, [srcq + offsetq]
    shl     startq, 2
    mov    offsetd, endm
%define DICT offsetq
    shl    offsetq, 2
    mov       endm, offsetq
.loop:
%if ARCH_X86_64
    mov    offsetd, [scaleq + 2 * startq]
    cvtsi2ss    m0, offsetd
%else
    cvtsi2ss    m0, [scaleq + 2 * startq]
%endif
    mov    offsetd, [numq + startq]
    mulss       m0, [pf_inv16]
    shl       DICT, 5
    shufps      m0, m0, 0
%if cpuflag(sse2)
%if cpuflag(sse4)
    pmovsxbd    m1, [srcq + DICT + 0]
    pmovsxbd    m2, [srcq + DICT + 4]
%else
    movq        m1, [srcq + DICT]
    punpcklbw   m1, m1
    mova        m2, m1
    punpcklwd   m1, m1
    punpckhwd   m2, m2
    psrad       m1, 24
    psrad       m2, 24
%endif
    cvtdq2ps    m1, m1
    cvtdq2ps    m2, m2
%else
    movd       mm0, [srcq + DICT + 0]
    movd       mm1, [srcq + DICT + 4]
    punpcklbw  mm0, mm0
    punpcklbw  mm1, mm1
    movq       mm2, mm0
    movq       mm3, mm1
    punpcklwd  mm0, mm0
    punpcklwd  mm1, mm1
    punpckhwd  mm2, mm2
    punpckhwd  mm3, mm3
    psrad      mm0, 24
    psrad      mm1, 24
    psrad      mm2, 24
    psrad      mm3, 24
    cvtpi2ps    m1, mm0
    cvtpi2ps    m2, mm1
    cvtpi2ps    m3, mm2
    cvtpi2ps    m4, mm3
    shufps      m0, m0, 0
    shufps      m1, m3, q1010
    shufps      m2, m4, q1010
%endif
    mulps       m1, m0
    mulps       m2, m0
    mova  [dstq + 8 * startq +  0], m1
    mova  [dstq + 8 * startq + 16], m2
    add     startq, 4
    cmp     startq, endm
    jl .loop
.end:
%if notcpuflag(sse2)
    emms
%endif
    REP_RET
%endmacro

%if ARCH_X86_32
INIT_XMM sse
DECODE_HF
%endif

INIT_XMM sse2
DECODE_HF

INIT_XMM sse4
DECODE_HF
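
; For reference, a rough scalar C sketch of what DECODE_HF above computes
; (illustration only, not FFmpeg's C fallback; the name decode_hf_ref and the
; pointer-style prototype are made up for this comment): each selected subband
; reads 8 samples of a 32-entry int8 codebook vector at
; hf_vq + vq_offset + 32 * vq_num[l], scales them by scale[l][0] / 16 and
; writes 8 floats to dst[l].
;
;~ static void decode_hf_ref(float (*dst)[8], const int32_t *vq_num,
;~                           const int8_t (*hf_vq)[32], intptr_t vq_offset,
;~                           int32_t (*scale)[2], intptr_t start, intptr_t end)
;~ {
;~     for (intptr_t l = start; l < end; l++) {
;~         const int8_t *vq = (const int8_t *)hf_vq + vq_offset + 32 * vq_num[l];
;~         float fscale = scale[l][0] * (1.0f / 16);
;~         for (int i = 0; i < 8; i++)
;~             dst[l][i] = vq[i] * fscale;
;~     }
;~ }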

; %1=v0/v1  %2=in1  %3=in2
%macro FIR_LOOP 2-3
.loop%1:
%define va          m1
%define vb          m2
%if %1
%define OFFSET      0
%else
%define OFFSET      NUM_COEF*count
%endif
    ; for v0, incrementing and for v1, decrementing
    mova        va, [cf0q + OFFSET]
    mova        vb, [cf0q + OFFSET + 4*NUM_COEF]
%if %0 == 3
    mova        m4, [cf0q + OFFSET + mmsize]
    mova        m0, [cf0q + OFFSET + 4*NUM_COEF + mmsize]
%endif
    mulps       va, %2
    mulps       vb, %2
%if %0 == 3
%if cpuflag(fma3)
    fmaddps     va, m4, %3, va
    fmaddps     vb, m0, %3, vb
%else
    mulps       m4, %3
    mulps       m0, %3
    addps       va, m4
    addps       vb, m0
%endif
%endif
    ; va = va1 va2 va3 va4
    ; vb = vb1 vb2 vb3 vb4
%if %1
    SWAP        va, vb
%endif
    mova        m4, va
    unpcklps    va, vb ; va3 vb3 va4 vb4
    unpckhps    m4, vb ; va1 vb1 va2 vb2
    addps       m4, va ; va1+3 vb1+3 va2+4 vb2+4
    movhlps     vb, m4 ; va1+3 vb1+3
    addps       vb, m4 ; va0..4 vb0..4
    movlps  [outq + count], vb
%if %1
    sub       cf0q, 8*NUM_COEF
%endif
    add      count, 8
    jl .loop%1
%endmacro

; void dca_lfe_fir(float *out, float *in, float *coefs)
%macro DCA_LFE_FIR 1
cglobal dca_lfe_fir%1, 3,3,6-%1, out, in, cf0
%define IN1       m3
%define IN2       m5
%define count     inq
%define NUM_COEF  4*(2-%1)
%define NUM_OUT   32*(%1+1)

    movu     IN1, [inq + 4 - 1*mmsize]
    shufps   IN1, IN1, q0123
%if %1 == 0
    movu     IN2, [inq + 4 - 2*mmsize]
    shufps   IN2, IN2, q0123
%endif

    mov    count, -4*NUM_OUT
    add     cf0q, 4*NUM_COEF*NUM_OUT
    add     outq, 4*NUM_OUT
    ; compute v0 first
%if %1 == 0
    FIR_LOOP   0, IN1, IN2
%else
    FIR_LOOP   0, IN1
%endif
    shufps   IN1, IN1, q0123
    mov    count, -4*NUM_OUT
    ; cf1 already correctly positioned
    add     outq, 4*NUM_OUT          ; outq now at out2
    sub     cf0q, 8*NUM_COEF
%if %1 == 0
    shufps   IN2, IN2, q0123
    FIR_LOOP   1, IN2, IN1
%else
    FIR_LOOP   1, IN1
%endif
    RET
%endmacro

INIT_XMM sse
DCA_LFE_FIR 0
DCA_LFE_FIR 1
%if HAVE_FMA3_EXTERNAL
INIT_XMM fma3
DCA_LFE_FIR 0
%endif
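
; For reference, a rough scalar C sketch of the LFE FIR interpolation above
; (illustration only, not FFmpeg's C fallback; lfe_fir_ref is a made-up name).
; decifactor is 32 for dca_lfe_fir0 and 64 for dca_lfe_fir1; the asm produces
; the whole v0 half first and then the v1 half with the coefficients read
; backwards, but the sums come out the same.
;
;~ static void lfe_fir_ref(float *out, const float *in, const float *coefs,
;~                         int decifactor)
;~ {
;~     float *out2 = out + decifactor;
;~     const float *cf0 = coefs;
;~     const float *cf1 = coefs + 256;
;~     for (int k = 0; k < decifactor; k++) {
;~         float v0 = 0, v1 = 0;
;~         for (int j = 0; j < 256 / decifactor; j++) {
;~             v0 += in[-j] * *cf0++;
;~             v1 += in[-j] * *--cf1;
;~         }
;~         *out++  = v0;
;~         *out2++ = v1;
;~     }
;~ }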

%macro SETZERO 1
%if cpuflag(sse2) && notcpuflag(avx)
    pxor          %1, %1
%else
    xorps         %1, %1, %1
%endif
%endmacro

%macro SHUF 3
%if cpuflag(avx)
    mova          %3, [%2 - 16]
    vperm2f128    %1, %3, %3, 1
    vshufps       %1, %1, %1, q0123
%elif cpuflag(sse2)
    pshufd        %1, [%2], q0123
%else
    mova          %1, [%2]
    shufps        %1, %1, q0123
%endif
%endmacro

%macro INNER_LOOP 1
    ; reading backwards:  ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i
    ;~ a += window[i + j]      * (-synth_buf[15 - i + j])
    ;~ b += window[i + j + 16] * (synth_buf[i + j])
    SHUF          m5, ptr2 + j + (15 - 3) * 4, m6
    mova          m6, [ptr1 + j]
%if ARCH_X86_64
    SHUF         m11, ptr2 + j + (15 - 3) * 4 - mmsize, m12
    mova         m12, [ptr1 + j + mmsize]
%endif
%if cpuflag(fma3)
    fmaddps       m2, m6,  [win + %1 + j + 16 * 4], m2
    fnmaddps      m1, m5,  [win + %1 + j], m1
%if ARCH_X86_64
    fmaddps       m8, m12, [win + %1 + j + mmsize + 16 * 4], m8
    fnmaddps      m7, m11, [win + %1 + j + mmsize], m7
%endif
%else ; non-FMA
    mulps         m6, m6,  [win + %1 + j + 16 * 4]
    mulps         m5, m5,  [win + %1 + j]
%if ARCH_X86_64
    mulps        m12, m12, [win + %1 + j + mmsize + 16 * 4]
    mulps        m11, m11, [win + %1 + j + mmsize]
%endif
    addps         m2, m2, m6
    subps         m1, m1, m5
%if ARCH_X86_64
    addps         m8, m8, m12
    subps         m7, m7, m11
%endif
%endif ; cpuflag(fma3)
    ;~ c += window[i + j + 32] * (synth_buf[16 + i + j])
    ;~ d += window[i + j + 48] * (synth_buf[31 - i + j])
    SHUF          m6, ptr2 + j + (31 - 3) * 4, m5
    mova          m5, [ptr1 + j + 16 * 4]
%if ARCH_X86_64
    SHUF         m12, ptr2 + j + (31 - 3) * 4 - mmsize, m11
    mova         m11, [ptr1 + j + mmsize + 16 * 4]
%endif
%if cpuflag(fma3)
    fmaddps       m3, m5,  [win + %1 + j + 32 * 4], m3
    fmaddps       m4, m6,  [win + %1 + j + 48 * 4], m4
%if ARCH_X86_64
    fmaddps       m9, m11, [win + %1 + j + mmsize + 32 * 4], m9
    fmaddps      m10, m12, [win + %1 + j + mmsize + 48 * 4], m10
%endif
%else ; non-FMA
    mulps         m5, m5,  [win + %1 + j + 32 * 4]
    mulps         m6, m6,  [win + %1 + j + 48 * 4]
%if ARCH_X86_64
    mulps        m11, m11, [win + %1 + j + mmsize + 32 * 4]
    mulps        m12, m12, [win + %1 + j + mmsize + 48 * 4]
%endif
    addps         m3, m3, m5
    addps         m4, m4, m6
%if ARCH_X86_64
    addps         m9, m9, m11
    addps        m10, m10, m12
%endif
%endif ; cpuflag(fma3)
    sub            j, 64 * 4
%endmacro

; void ff_synth_filter_inner_<opt>(float *synth_buf, float synth_buf2[32],
;                                  const float window[512], float out[32],
;                                  intptr_t offset, float scale)
%macro SYNTH_FILTER 0
cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
                            synth_buf, synth_buf2, window, out, off, scale
%define scale m0
%if ARCH_X86_32 || WIN64
%if cpuflag(sse2) && notcpuflag(avx)
    movd       scale, scalem
    SPLATD        m0
%else
    VBROADCASTSS  m0, scalem
%endif
; Make sure offset is in a register and not on the stack
%define OFFQ  r4q
%else
    SPLATD      xmm0
%if cpuflag(avx)
    vinsertf128   m0, m0, xmm0, 1
%endif
%define OFFQ  offq
%endif
    ; prepare inner counter limit 1
    mov          r5q, 480
    sub          r5q, offmp
    and          r5q, -64
    shl          r5q, 2
%if ARCH_X86_32 || notcpuflag(avx)
    mov         OFFQ, r5q
%define i r5q
    mov            i, 16 * 4 - (ARCH_X86_64 + 1) * mmsize ; main loop counter
%else
%define i 0
%define OFFQ r5q
%endif

%define buf2 synth_buf2q
%if ARCH_X86_32
    mov         buf2, synth_buf2mp
%endif
.mainloop
    ; m1 = a  m2 = b  m3 = c  m4 = d
    SETZERO       m3
    SETZERO       m4
    mova          m1, [buf2 + i]
    mova          m2, [buf2 + i + 16 * 4]
%if ARCH_X86_32
%define ptr1 r0q
%define ptr2 r1q
%define win  r2q
%define j    r3q
    mov          win, windowm
    mov         ptr1, synth_bufm
%if ARCH_X86_32 || notcpuflag(avx)
    add          win, i
    add         ptr1, i
%endif
%else ; ARCH_X86_64
%define ptr1 r6q
%define ptr2 r7q ; must be loaded
%define win  r8q
%define j    r9q
    SETZERO       m9
    SETZERO      m10
    mova          m7, [buf2 + i + mmsize]
    mova          m8, [buf2 + i + mmsize + 16 * 4]
    lea          win, [windowq + i]
    lea         ptr1, [synth_bufq + i]
%endif
    mov         ptr2, synth_bufmp
    ; prepare the inner loop counter
    mov            j, OFFQ
%if ARCH_X86_32 || notcpuflag(avx)
    sub         ptr2, i
%endif
.loop1:
    INNER_LOOP  0
    jge .loop1

    mov            j, 448 * 4
    sub            j, OFFQ
    jz .end
    sub         ptr1, j
    sub         ptr2, j
    add          win, OFFQ ; now at j-64, so define OFFSET
    sub            j, 64 * 4
.loop2:
    INNER_LOOP  64 * 4
    jge .loop2

.end:
%if ARCH_X86_32
    mov         buf2, synth_buf2m ; needed for next iteration anyway
    mov         outq, outmp       ; j, which will be set again during it
%endif
    ;~ out[i]      = a * scale;
    ;~ out[i + 16] = b * scale;
    mulps         m1, m1, scale
    mulps         m2, m2, scale
%if ARCH_X86_64
    mulps         m7, m7, scale
    mulps         m8, m8, scale
%endif
    ;~ synth_buf2[i]      = c;
    ;~ synth_buf2[i + 16] = d;
    mova  [buf2 + i +  0 * 4], m3
    mova  [buf2 + i + 16 * 4], m4
%if ARCH_X86_64
    mova  [buf2 + i +  0 * 4 + mmsize], m9
    mova  [buf2 + i + 16 * 4 + mmsize], m10
%endif
    ;~ out[i]      = a;
    ;~ out[i + 16] = b;
    mova  [outq + i +  0 * 4], m1
    mova  [outq + i + 16 * 4], m2
%if ARCH_X86_64
    mova  [outq + i +  0 * 4 + mmsize], m7
    mova  [outq + i + 16 * 4 + mmsize], m8
%endif
%if ARCH_X86_32 || notcpuflag(avx)
    sub            i, (ARCH_X86_64 + 1) * mmsize
    jge .mainloop
%endif
    RET
%endmacro

%if ARCH_X86_32
INIT_XMM sse
SYNTH_FILTER
%endif
INIT_XMM sse2
SYNTH_FILTER
INIT_YMM avx
SYNTH_FILTER
INIT_YMM fma3
SYNTH_FILTER
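
; For reference, a rough scalar C sketch of synth_filter_inner (illustration
; only, not FFmpeg's C fallback; synth_filter_inner_ref is a made-up name).
; It expands the ;~ comments above: the first inner loop corresponds to
; .loop1, and the second to .loop2, which, as far as the pointer adjustments
; before .loop2 indicate, handles the part of the synth_buf ring buffer that
; wraps around.
;
;~ static void synth_filter_inner_ref(float *synth_buf, float synth_buf2[32],
;~                                    const float window[512], float out[32],
;~                                    intptr_t offset, float scale)
;~ {
;~     for (int i = 0; i < 16; i++) {
;~         float a = synth_buf2[i];
;~         float b = synth_buf2[i + 16];
;~         float c = 0;
;~         float d = 0;
;~         int j;
;~         for (j = 0; j < 512 - offset; j += 64) {
;~             a += window[i + j]      * -synth_buf[15 - i + j];
;~             b += window[i + j + 16] *  synth_buf[     i + j];
;~             c += window[i + j + 32] *  synth_buf[16 + i + j];
;~             d += window[i + j + 48] *  synth_buf[31 - i + j];
;~         }
;~         for (; j < 512; j += 64) {
;~             a += window[i + j]      * -synth_buf[15 - i + j - 512];
;~             b += window[i + j + 16] *  synth_buf[     i + j - 512];
;~             c += window[i + j + 32] *  synth_buf[16 + i + j - 512];
;~             d += window[i + j + 48] *  synth_buf[31 - i + j - 512];
;~         }
;~         out[i]             = a * scale;
;~         out[i + 16]        = b * scale;
;~         synth_buf2[i]      = c;
;~         synth_buf2[i + 16] = d;
;~     }
;~ }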