1;***************************************************************************** 2;* x86-optimized Float DSP functions 3;* 4;* Copyright 2006 Loren Merritt 5;* 6;* This file is part of FFmpeg. 7;* 8;* FFmpeg is free software; you can redistribute it and/or 9;* modify it under the terms of the GNU Lesser General Public 10;* License as published by the Free Software Foundation; either 11;* version 2.1 of the License, or (at your option) any later version. 12;* 13;* FFmpeg is distributed in the hope that it will be useful, 14;* but WITHOUT ANY WARRANTY; without even the implied warranty of 15;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16;* Lesser General Public License for more details. 17;* 18;* You should have received a copy of the GNU Lesser General Public 19;* License along with FFmpeg; if not, write to the Free Software 20;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 21;****************************************************************************** 22 23%include "x86util.asm" 24 25SECTION .text 26 27;----------------------------------------------------------------------------- 28; void vector_fmul(float *dst, const float *src0, const float *src1, int len) 29;----------------------------------------------------------------------------- 30%macro VECTOR_FMUL 0 31cglobal vector_fmul, 4,4,2, dst, src0, src1, len 32 lea lenq, [lend*4 - 64] 33ALIGN 16 34.loop: 35%assign a 0 36%rep 32/mmsize 37 mova m0, [src0q + lenq + (a+0)*mmsize] 38 mova m1, [src0q + lenq + (a+1)*mmsize] 39 mulps m0, m0, [src1q + lenq + (a+0)*mmsize] 40 mulps m1, m1, [src1q + lenq + (a+1)*mmsize] 41 mova [dstq + lenq + (a+0)*mmsize], m0 42 mova [dstq + lenq + (a+1)*mmsize], m1 43%assign a a+2 44%endrep 45 46 sub lenq, 64 47 jge .loop 48 REP_RET 49%endmacro 50 51INIT_XMM sse 52VECTOR_FMUL 53%if HAVE_AVX_EXTERNAL 54INIT_YMM avx 55VECTOR_FMUL 56%endif 57 58;------------------------------------------------------------------------------ 59; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len) 60;------------------------------------------------------------------------------ 61 62%macro VECTOR_FMAC_SCALAR 0 63%if UNIX64 64cglobal vector_fmac_scalar, 3,3,5, dst, src, len 65%else 66cglobal vector_fmac_scalar, 4,4,5, dst, src, mul, len 67%endif 68%if ARCH_X86_32 69 VBROADCASTSS m0, mulm 70%else 71%if WIN64 72 SWAP 0, 2 73%endif 74 shufps xm0, xm0, 0 75%if cpuflag(avx) 76 vinsertf128 m0, m0, xm0, 1 77%endif 78%endif 79 lea lenq, [lend*4-64] 80.loop: 81%if cpuflag(fma3) 82 mova m1, [dstq+lenq] 83 mova m2, [dstq+lenq+1*mmsize] 84 fmaddps m1, m0, [srcq+lenq], m1 85 fmaddps m2, m0, [srcq+lenq+1*mmsize], m2 86%else ; cpuflag 87 mulps m1, m0, [srcq+lenq] 88 mulps m2, m0, [srcq+lenq+1*mmsize] 89%if mmsize < 32 90 mulps m3, m0, [srcq+lenq+2*mmsize] 91 mulps m4, m0, [srcq+lenq+3*mmsize] 92%endif ; mmsize 93 addps m1, m1, [dstq+lenq] 94 addps m2, m2, [dstq+lenq+1*mmsize] 95%if mmsize < 32 96 addps m3, m3, [dstq+lenq+2*mmsize] 97 addps m4, m4, [dstq+lenq+3*mmsize] 98%endif ; mmsize 99%endif ; cpuflag 100 mova [dstq+lenq], m1 101 mova [dstq+lenq+1*mmsize], m2 102%if mmsize < 32 103 mova [dstq+lenq+2*mmsize], m3 104 mova [dstq+lenq+3*mmsize], m4 105%endif ; mmsize 106 sub lenq, 64 107 jge .loop 108 REP_RET 109%endmacro 110 111INIT_XMM sse 112VECTOR_FMAC_SCALAR 113%if HAVE_AVX_EXTERNAL 114INIT_YMM avx 115VECTOR_FMAC_SCALAR 116%endif 117%if HAVE_FMA3_EXTERNAL 118INIT_YMM fma3 119VECTOR_FMAC_SCALAR 120%endif 121 
;------------------------------------------------------------------------------
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------

%macro VECTOR_FMUL_SCALAR 0
%if UNIX64
cglobal vector_fmul_scalar, 3,3,2, dst, src, len
%else
cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    movss    m0, mulm
%elif WIN64
    SWAP 0, 2
%endif
    shufps   m0, m0, 0
    lea    lenq, [lend*4-mmsize]
.loop:
    mova     m1, [srcq+lenq]
    mulps    m1, m0
    mova  [dstq+lenq], m1
    sub    lenq, mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_SCALAR

;------------------------------------------------------------------------------
; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
;                            int len)
;------------------------------------------------------------------------------

%macro VECTOR_DMUL_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr
    mov          lenq, lenaddrm
%elif UNIX64
cglobal vector_dmul_scalar, 3,3,3, dst, src, len
%else
cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSD  m0, mulm
%else
%if WIN64
    SWAP 0, 2
%endif
    movlhps      xm0, xm0
%if cpuflag(avx)
    vinsertf128  ym0, ym0, xm0, 1
%endif
%endif
    lea         lenq, [lend*8-2*mmsize]
.loop:
    mulpd         m1, m0, [srcq+lenq       ]
    mulpd         m2, m0, [srcq+lenq+mmsize]
    mova  [dstq+lenq       ], m1
    mova  [dstq+lenq+mmsize], m2
    sub         lenq, 2*mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse2
VECTOR_DMUL_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL_SCALAR
%endif

;-----------------------------------------------------------------------------
; vector_fmul_window(float *dst, const float *src0,
;                    const float *src1, const float *win, int len);
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_WINDOW 0
cglobal vector_fmul_window, 5, 6, 6, dst, src0, src1, win, len, len1
    shl     lend, 2
    lea    len1q, [lenq - mmsize]
    add    src0q, lenq
    add     dstq, lenq
    add     winq, lenq
    neg     lenq
.loop:
    mova      m0, [winq  + lenq]
    mova      m4, [src0q + lenq]
%if cpuflag(sse)
    mova      m1, [winq  + len1q]
    mova      m5, [src1q + len1q]
    shufps    m1, m1, 0x1b
    shufps    m5, m5, 0x1b
    mova      m2, m0
    mova      m3, m1
    mulps     m2, m4
    mulps     m3, m5
    mulps     m1, m4
    mulps     m0, m5
    addps     m2, m3
    subps     m1, m0
    shufps    m2, m2, 0x1b
%else
    pswapd    m1, [winq  + len1q]
    pswapd    m5, [src1q + len1q]
    mova      m2, m0
    mova      m3, m1
    pfmul     m2, m4
    pfmul     m3, m5
    pfmul     m1, m4
    pfmul     m0, m5
    pfadd     m2, m3
    pfsub     m1, m0
    pswapd    m2, m2
%endif
    mova  [dstq + lenq ], m1
    mova  [dstq + len1q], m2
    sub    len1q, mmsize
    add     lenq, mmsize
    jl .loop
%if mmsize == 8
    femms
%endif
    REP_RET
%endmacro

INIT_MMX 3dnowext
VECTOR_FMUL_WINDOW
INIT_XMM sse
VECTOR_FMUL_WINDOW
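
; For reference, a C sketch of the overlap-window routine above (a
; readability sketch only, not part of the build; the SIMD code reads the
; win/src1 halves reversed via shufps/pswapd to get the same pairing):
;
;     void vector_fmul_window(float *dst, const float *src0,
;                             const float *src1, const float *win, int len)
;     {
;         dst  += len;
;         win  += len;
;         src0 += len;
;         for (int i = -len, j = len - 1; i < 0; i++, j--) {
;             float s0 = src0[i], s1 = src1[j];
;             float wi = win[i],  wj = win[j];
;             dst[i] = s0 * wj - s1 * wi;
;             dst[j] = s0 * wi + s1 * wj;
;         }
;     }
;
; vector_fmul_scalar and vector_dmul_scalar above are simply
; dst[i] = src[i] * mul over float and double arrays, respectively.
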
;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
;                 const float *src2, int len)
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
    mova      m0, [src0q + lenq]
    mova      m1, [src0q + lenq + mmsize]
%if cpuflag(fma3)
    mova      m2, [src2q + lenq]
    mova      m3, [src2q + lenq + mmsize]
    fmaddps   m0, m0, [src1q + lenq], m2
    fmaddps   m1, m1, [src1q + lenq + mmsize], m3
%else
    mulps     m0, m0, [src1q + lenq]
    mulps     m1, m1, [src1q + lenq + mmsize]
    addps     m0, m0, [src2q + lenq]
    addps     m1, m1, [src2q + lenq + mmsize]
%endif
    mova  [dstq + lenq], m0
    mova  [dstq + lenq + mmsize], m1

    sub       lenq, 2*mmsize
    jge       .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_ADD
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_ADD
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMUL_ADD
%endif

;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
;                          int len)
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
%if cpuflag(avx)
    vmovaps     xmm0, [src1q + 16]
    vinsertf128 m0, m0, [src1q], 1
    vshufps     m0, m0, m0, q0123
    vmovaps     xmm1, [src1q + mmsize + 16]
    vinsertf128 m1, m1, [src1q + mmsize], 1
    vshufps     m1, m1, m1, q0123
%else
    mova      m0, [src1q]
    mova      m1, [src1q + mmsize]
    shufps    m0, m0, q0123
    shufps    m1, m1, q0123
%endif
    mulps     m0, m0, [src0q + lenq + mmsize]
    mulps     m1, m1, [src0q + lenq]
    mova  [dstq + lenq + mmsize], m0
    mova  [dstq + lenq], m1
    add      src1q, 2*mmsize
    sub       lenq, 2*mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_REVERSE
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_REVERSE
%endif

; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
INIT_XMM sse
cglobal scalarproduct_float, 3,3,2, v1, v2, offset
    neg   offsetq
    shl   offsetq, 2
    sub       v1q, offsetq
    sub       v2q, offsetq
    xorps    xmm0, xmm0
.loop:
    movaps   xmm1, [v1q+offsetq]
    mulps    xmm1, [v2q+offsetq]
    addps    xmm0, xmm1
    add   offsetq, 16
    js .loop
    movhlps  xmm1, xmm0
    addps    xmm0, xmm1
    movss    xmm1, xmm0
    shufps   xmm0, xmm0, 1
    addss    xmm0, xmm1
%if ARCH_X86_64 == 0
    movss     r0m, xmm0
    fld dword r0m
%endif
    RET

;-----------------------------------------------------------------------------
; void ff_butterflies_float(float *src0, float *src1, int len);
;-----------------------------------------------------------------------------
INIT_XMM sse
cglobal butterflies_float, 3,3,3, src0, src1, len
%if ARCH_X86_64
    movsxd    lenq, lend
%endif
    test      lenq, lenq
    jz .end
    shl       lenq, 2
    add      src0q, lenq
    add      src1q, lenq
    neg       lenq
.loop:
    mova        m0, [src0q + lenq]
    mova        m1, [src1q + lenq]
    subps       m2, m0, m1
    addps       m0, m0, m1
    mova  [src1q + lenq], m2
    mova  [src0q + lenq], m0
    add       lenq, mmsize
    jl .loop
.end:
    REP_RET
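
; For reference, C sketches of the remaining routines (readability only,
; not part of the build):
;
;     void vector_fmul_add(float *dst, const float *src0, const float *src1,
;                          const float *src2, int len)
;     {
;         for (int i = 0; i < len; i++)
;             dst[i] = src0[i] * src1[i] + src2[i];
;     }
;
;     void vector_fmul_reverse(float *dst, const float *src0,
;                              const float *src1, int len)
;     {
;         for (int i = 0; i < len; i++)
;             dst[i] = src0[i] * src1[len - 1 - i];
;     }
;
;     float scalarproduct_float(const float *v1, const float *v2, int len)
;     {
;         float p = 0.0f;
;         for (int i = 0; i < len; i++)
;             p += v1[i] * v2[i];
;         return p;
;     }
;
;     void butterflies_float(float *src0, float *src1, int len)
;     {
;         for (int i = 0; i < len; i++) {
;             float t = src0[i] - src1[i];
;             src0[i] += src1[i];
;             src1[i]  = t;
;         }
;     }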