;******************************************************************************
;* x86 optimized Format Conversion Utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_TEXT

;---------------------------------------------------------------------------------
; void int32_to_float_fmul_scalar(float *dst, const int *src, float mul, int len);
;---------------------------------------------------------------------------------
; Convert 'len' int32 samples to float, scaling each by the scalar 'mul'.
; %1 = instruction-set suffix (sse/sse2), %2 = number of xmm registers used.
; Processes 32 bytes (8 samples) per iteration; the mova stores imply 16-byte
; aligned dst and len a multiple of 8 -- TODO confirm against callers.
%macro INT32_TO_FLOAT_FMUL_SCALAR 2
%ifdef UNIX64
; UNIX64: 'mul' arrives in xmm0, so only 3 GPR arguments are declared.
cglobal int32_to_float_fmul_scalar_%1, 3,3,%2, dst, src, len
%else
cglobal int32_to_float_fmul_scalar_%1, 4,4,%2, dst, src, mul, len
%endif
%ifdef WIN64
    SWAP 0, 2             ; WIN64: 'mul' is the 3rd arg -> xmm2; move to m0
%elifdef ARCH_X86_32
    movss   m0, mulm      ; x86-32: load 'mul' from its stack slot
%endif
    SPLATD  m0            ; broadcast the scalar multiplier to all 4 lanes
    shl     lenq, 2       ; len in bytes (4 bytes per sample)
    add     srcq, lenq    ; point both pointers past the end...
    add     dstq, lenq
    neg     lenq          ; ...and walk a negative offset up toward zero
.loop:
%ifidn %1, sse2
    cvtdq2ps  m1, [srcq+lenq   ]   ; 4 int32 -> 4 float
    cvtdq2ps  m2, [srcq+lenq+16]
%else
    ; SSE1 lacks a packed int->float conversion into xmm; convert two ints at
    ; a time with cvtpi2ps and merge the 64-bit halves with movlhps.
    ; NOTE(review): cvtpi2ps touches MMX state and no emms is issued in this
    ; path -- verify whether callers/ABI make that acceptable.
    cvtpi2ps  m1, [srcq+lenq   ]
    cvtpi2ps  m3, [srcq+lenq+ 8]
    cvtpi2ps  m2, [srcq+lenq+16]
    cvtpi2ps  m4, [srcq+lenq+24]
    movlhps   m1, m3
    movlhps   m2, m4
%endif
    mulps   m1, m0        ; scale by 'mul'
    mulps   m2, m0
    mova    [dstq+lenq   ], m1
    mova    [dstq+lenq+16], m2
    add     lenq, 32      ; 8 samples per iteration
    jl .loop
    REP_RET
%endmacro

INIT_XMM
%define SPLATD SPLATD_SSE
%define movdqa movaps     ; SSE1 build: substitute the SSE1 twin of movdqa
INT32_TO_FLOAT_FMUL_SCALAR sse, 5
%undef movdqa
%define SPLATD SPLATD_SSE2
INT32_TO_FLOAT_FMUL_SCALAR sse2, 3
%undef SPLATD


;------------------------------------------------------------------------------
; void ff_float_to_int16(int16_t *dst, const float *src, long len);
;------------------------------------------------------------------------------
; Convert 'len' floats to int16 with saturation (via packssdw).
; %1 = isa suffix (sse2/sse/3dnow), %2 = number of xmm registers used.
; Processes 8 samples per iteration.
%macro FLOAT_TO_INT16 2
cglobal float_to_int16_%1, 3,3,%2, dst, src, len
    add     lenq, lenq            ; len in output bytes (2 per sample)
    lea     srcq, [srcq+2*lenq]   ; src end (floats are twice as wide)
    add     dstq, lenq            ; dst end
    neg     lenq                  ; negative offset counts up to zero
.loop:
%ifidn %1, sse2
    cvtps2dq  m0, [srcq+2*lenq   ]
    cvtps2dq  m1, [srcq+2*lenq+16]
    packssdw  m0, m1              ; 8 int32 -> 8 int16, saturating
    mova      [dstq+lenq], m0
%else
    ; sse/3dnow path: 64-bit mmx registers, 2 floats converted at a time
    ; (cvtps2pi is remapped to pf2id for the 3dnow instantiation below).
    cvtps2pi  m0, [srcq+2*lenq   ]
    cvtps2pi  m1, [srcq+2*lenq+ 8]
    cvtps2pi  m2, [srcq+2*lenq+16]
    cvtps2pi  m3, [srcq+2*lenq+24]
    packssdw  m0, m1
    packssdw  m2, m3
    mova      [dstq+lenq  ], m0
    mova      [dstq+lenq+8], m2
%endif
    add     lenq, 16
    js .loop
%ifnidn %1, sse2
    emms                          ; clear MMX state for the mmx-based paths
%endif
    REP_RET
%endmacro

INIT_XMM
FLOAT_TO_INT16 sse2, 2
INIT_MMX
FLOAT_TO_INT16 sse, 0
%define cvtps2pi pf2id            ; 3DNow! packed float->int32 (truncating)
FLOAT_TO_INT16 3dnow, 0
%undef cvtps2pi


;-------------------------------------------------------------------------------
; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
;-------------------------------------------------------------------------------
; Convert two planar float channels (src[0], src[1]) into one interleaved
; int16 stream. 'len' is samples per channel.
%macro FLOAT_TO_INT16_INTERLEAVE2 1
cglobal float_to_int16_interleave2_%1, 3,4,2, dst, src0, src1, len
    lea     lenq, [4*r2q]          ; byte length of one float channel
    mov     src1q, [src0q+gprsize] ; src1 = src[1]
    mov     src0q, [src0q]         ; src0 = src[0]
    add     dstq, lenq             ; dst bytes = 2 ch * 2 bytes * len
    add     src0q, lenq
    add     src1q, lenq
    neg     lenq
.loop:
%ifidn %1, sse2
    cvtps2dq  m0, [src0q+lenq]
    cvtps2dq  m1, [src1q+lenq]
    packssdw  m0, m1               ; low 4 words = ch0, high 4 words = ch1
    movhlps   m1, m0
    punpcklwd m0, m1               ; interleave ch0/ch1 word pairs
    mova      [dstq+lenq], m0
%else
    cvtps2pi  m0, [src0q+lenq  ]
    cvtps2pi  m1, [src0q+lenq+8]
    cvtps2pi  m2, [src1q+lenq  ]
    cvtps2pi  m3, [src1q+lenq+8]
    packssdw  m0, m1               ; 4 words of ch0
    packssdw  m2, m3               ; 4 words of ch1
    mova      m1, m0
    punpcklwd m0, m2               ; interleave low pairs
    punpckhwd m1, m2               ; interleave high pairs
    mova      [dstq+lenq  ], m0
    mova      [dstq+lenq+8], m1
%endif
    add     lenq, 16
    js .loop
%ifnidn %1, sse2
    emms                           ; mmx-based paths must clear MMX state
%endif
    REP_RET
%endmacro

INIT_MMX
%define cvtps2pi pf2id
FLOAT_TO_INT16_INTERLEAVE2 3dnow
%undef cvtps2pi
%define movdqa movaps
FLOAT_TO_INT16_INTERLEAVE2 sse
%undef movdqa
INIT_XMM
FLOAT_TO_INT16_INTERLEAVE2 sse2


; Swap the two 32-bit halves of a 64-bit mmx register (pshufw needs the
; SSE integer extensions).
%macro PSWAPD_SSE 2
    pshufw %1, %2, 0x4e
%endmacro
; Same operation for plain 3DNow! CPUs that lack pshufw.
%macro PSWAPD_3DN1 2
    movq      %1, %2
    psrlq     %1, 32
    punpckldq %1, %2
%endmacro

%macro FLOAT_TO_INT16_INTERLEAVE6 1
; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
; Convert 6 planar float channels to interleaved int16, 2 frames (12 samples)
; per iteration. Channel pointers are converted to offsets relative to src[0]
; so only a single pointer needs to be advanced in the loop.
cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
%ifdef ARCH_X86_64
    %define lend r10d
    mov     lend, r2d
%else
    %define lend dword r2m        ; x86-32: keep len in its stack slot
%endif
    mov src1q, [srcq+1*gprsize]
    mov src2q, [srcq+2*gprsize]
    mov src3q, [srcq+3*gprsize]
    mov src4q, [srcq+4*gprsize]
    mov src5q, [srcq+5*gprsize]
    mov srcq,  [srcq]
    sub src1q, srcq               ; pointers -> offsets from channel 0
    sub src2q, srcq
    sub src3q, srcq
    sub src4q, srcq
    sub src5q, srcq
.loop:
    cvtps2pi  mm0, [srcq]         ; 2 samples from each of the 6 channels
    cvtps2pi  mm1, [srcq+src1q]
    cvtps2pi  mm2, [srcq+src2q]
    cvtps2pi  mm3, [srcq+src3q]
    cvtps2pi  mm4, [srcq+src4q]
    cvtps2pi  mm5, [srcq+src5q]
    packssdw  mm0, mm3            ; words: c0 c0 c3 c3
    packssdw  mm1, mm4            ; words: c1 c1 c4 c4
    packssdw  mm2, mm5            ; words: c2 c2 c5 c5
    ; word-level transpose into frame order c0 c1 c2 c3 c4 c5, two frames
    pswapd    mm3, mm0
    punpcklwd mm0, mm1
    punpckhwd mm1, mm2
    punpcklwd mm2, mm3
    pswapd    mm3, mm0
    punpckldq mm0, mm2
    punpckhdq mm2, mm1
    punpckldq mm1, mm3
    movq [dstq   ], mm0
    movq [dstq+16], mm2
    movq [dstq+ 8], mm1
    add srcq, 8                   ; 2 floats consumed per channel
    add dstq, 24                  ; 12 int16 samples written
    sub lend, 2
    jg .loop
    emms
    RET
%endmacro ; FLOAT_TO_INT16_INTERLEAVE6
; Instantiations: sse uses pshufw, 3dnow uses the 3-insn fallback, and 3dn2
; relies on the native pswapd instruction (hence the %undef).
%define pswapd PSWAPD_SSE
FLOAT_TO_INT16_INTERLEAVE6 sse
%define cvtps2pi pf2id
%define pswapd PSWAPD_3DN1
FLOAT_TO_INT16_INTERLEAVE6 3dnow
%undef pswapd
FLOAT_TO_INT16_INTERLEAVE6 3dn2
%undef cvtps2pi

;-----------------------------------------------------------------------------
; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
;-----------------------------------------------------------------------------
; Interleave 6 planar float channels into one packed stream.
; %1 = isa suffix (mmx/sse), %2 = number of xmm registers used.
; Processes mmsize/4 frames per iteration; channel pointers are converted to
; offsets from src[0] so only one pointer is advanced in the loop.
%macro FLOAT_INTERLEAVE6 2
cglobal float_interleave6_%1, 2,7,%2, dst, src, src1, src2, src3, src4, src5
%ifdef ARCH_X86_64
    %define lend r10d
    mov     lend, r2d
%else
    %define lend dword r2m          ; x86-32: keep len in its stack slot
%endif
    mov src1q, [srcq+1*gprsize]
    mov src2q, [srcq+2*gprsize]
    mov src3q, [srcq+3*gprsize]
    mov src4q, [srcq+4*gprsize]
    mov src5q, [srcq+5*gprsize]
    mov srcq,  [srcq]
    sub src1q, srcq                 ; pointers -> offsets from channel 0
    sub src2q, srcq
    sub src3q, srcq
    sub src4q, srcq
    sub src5q, srcq
.loop:
%ifidn %1, sse
    movaps    m0, [srcq]            ; 4 samples from each channel
    movaps    m1, [srcq+src1q]
    movaps    m2, [srcq+src2q]
    movaps    m3, [srcq+src3q]
    movaps    m4, [srcq+src4q]
    movaps    m5, [srcq+src5q]

    SBUTTERFLYPS 0, 1, 6            ; pairwise dword interleave; m6 = scratch
    SBUTTERFLYPS 2, 3, 6
    SBUTTERFLYPS 4, 5, 6

    ; assemble and store the first two output frames
    movaps    m6, m4
    shufps    m4, m0, 0xe4
    movlhps   m0, m2
    movhlps   m6, m2
    movaps [dstq   ], m0
    movaps [dstq+16], m4
    movaps [dstq+32], m6

    ; and the last two frames
    movaps    m6, m5
    shufps    m5, m1, 0xe4
    movlhps   m1, m3
    movhlps   m6, m3
    movaps [dstq+48], m1
    movaps [dstq+64], m5
    movaps [dstq+80], m6
%else ; mmx
    movq m0, [srcq]                 ; 2 samples from each channel
    movq m1, [srcq+src1q]
    movq m2, [srcq+src2q]
    movq m3, [srcq+src3q]
    movq m4, [srcq+src4q]
    movq m5, [srcq+src5q]

    SBUTTERFLY dq, 0, 1, 6          ; dword interleave; m6 = scratch
    SBUTTERFLY dq, 2, 3, 6
    SBUTTERFLY dq, 4, 5, 6
    movq [dstq   ], m0
    movq [dstq+ 8], m2
    movq [dstq+16], m4
    movq [dstq+24], m1
    movq [dstq+32], m3
    movq [dstq+40], m5
%endif
    add srcq, mmsize                ; mmsize/4 samples consumed per channel
    add dstq, mmsize*6              ; 6 channels' worth of bytes written
    sub lend, mmsize/4
    jg .loop
%ifidn %1, mmx
    emms                            ; clear MMX state before returning
%endif
    REP_RET
%endmacro

INIT_MMX
FLOAT_INTERLEAVE6 mmx, 0
INIT_XMM
FLOAT_INTERLEAVE6 sse, 7

;-----------------------------------------------------------------------------
; void ff_float_interleave2(float *dst, const float **src, unsigned int len);
;-----------------------------------------------------------------------------
; Interleave two planar float channels into one packed stream. 'len' is
; samples per channel; 2*mmsize input bytes per channel are handled each
; iteration (mova-style stores imply aligned buffers for the sse version).
; %1 = isa suffix (mmx/sse), %2 = number of xmm registers used.
%macro FLOAT_INTERLEAVE2 2
cglobal float_interleave2_%1, 3,4,%2, dst, src, len, src1
    mov     src1q, [srcq+gprsize]   ; src1 = src[1]
    mov     srcq,  [srcq        ]   ; srcq = src[0]
    sub     src1q, srcq             ; src1 as offset from src[0]
.loop:                              ; fix: label previously lacked its colon
    MOVPS     m0, [srcq             ]
    MOVPS     m1, [srcq+src1q       ]
    MOVPS     m3, [srcq      +mmsize]
    MOVPS     m4, [srcq+src1q+mmsize]

    MOVPS     m2, m0
    PUNPCKLDQ m0, m1                ; interleave low elements
    PUNPCKHDQ m2, m1                ; interleave high elements

    MOVPS     m1, m3
    PUNPCKLDQ m3, m4
    PUNPCKHDQ m1, m4

    MOVPS [dstq         ], m0
    MOVPS [dstq+1*mmsize], m2
    MOVPS [dstq+2*mmsize], m3
    MOVPS [dstq+3*mmsize], m1

    add   srcq, mmsize*2
    add   dstq, mmsize*4
    sub   lend, mmsize/2
    jg .loop
%ifidn %1, mmx
    emms                            ; mmx build must clear MMX state
%endif
    REP_RET
%endmacro

INIT_MMX
%define MOVPS     movq
%define PUNPCKLDQ punpckldq
%define PUNPCKHDQ punpckhdq
FLOAT_INTERLEAVE2 mmx, 0
INIT_XMM
%define MOVPS     movaps
%define PUNPCKLDQ unpcklps
%define PUNPCKHDQ unpckhps
FLOAT_INTERLEAVE2 sse, 5