;******************************************************************************
;* Copyright (c) 2012 Michael Niedermayer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32
; Scale factors used by the int <-> float converters below.
flt2pm31: times 8 dd 4.6566129e-10      ; 2^-31
flt2p31 : times 8 dd 2147483648.0       ; 2^31
flt2p15 : times 8 dd 32768.0            ; 2^15

; pshufb control: gathers the even 16-bit words into the low half and the
; odd 16-bit words into the high half of a register.
word_unpack_shuf : db  0, 1, 4, 5, 8, 9,12,13, 2, 3, 6, 7,10,11,14,15

SECTION .text

;------------------------------------------------------------------------------
; Common macro arguments for PACK_2CH / UNPACK_2CH / CONV / PACK_6CH:
;   1: destination sample format name (used in the symbol name)
;   2: source sample format name      (used in the symbol name)
;   3: a or u - aligned or unaligned loads/stores (selects mova or movu)
;   4: log2 of the output sample size in bytes
;   5: log2 of the input sample size in bytes
;   6: per-iteration conversion macro, called with m0,m1,m2,m3,m4,m5
;   7: one-time init macro for the conversion (loads constants)
;
; All generated functions take (dst, src, len) where dst and src point to
; arrays of plane pointers; the code dereferences them itself.
; The aligned variant checks every pointer against mmsize-1 and jumps into
; the body of the matching unaligned variant when any of them is misaligned.
;------------------------------------------------------------------------------

; PACK_2CH: interleave two source planes (src[0], src[1]) into dst[0],
; optionally converting the sample format on the way.
;to, from, a/u, log2_outsize, log2_insize, conv, init
%macro PACK_2CH 5-7
cglobal pack_2ch_%2_to_%1_%3, 3, 4, 6, dst, src, len, src2
    mov src2q   , [srcq+gprsize]
    mov srcq    , [srcq]
    mov dstq    , [dstq]
%ifidn %3, a
    test dstq, mmsize-1
        jne pack_2ch_%2_to_%1_u_int %+ SUFFIX
    test srcq, mmsize-1
        jne pack_2ch_%2_to_%1_u_int %+ SUFFIX
    test src2q, mmsize-1
        jne pack_2ch_%2_to_%1_u_int %+ SUFFIX
%else
pack_2ch_%2_to_%1_u_int %+ SUFFIX:
%endif
    ; point past the end of each buffer and count len up from -len to 0
    lea     srcq , [srcq  + (1<<%5)*lenq]
    lea     src2q, [src2q + (1<<%5)*lenq]
    lea     dstq , [dstq  + (2<<%4)*lenq]
    neg     lenq
    %7 m0,m1,m2,m3,m4,m5
.next:
%if %4 >= %5
    ; output not narrower than input: interleave first, then convert
    mov%3     m0, [         srcq +(1<<%5)*lenq]
    mova      m1, m0
    mov%3     m2, [         src2q+(1<<%5)*lenq]
%if %5 == 1
    punpcklwd m0, m2
    punpckhwd m1, m2
%else
    punpckldq m0, m2
    punpckhdq m1, m2
%endif
    %6 m0,m1,m2,m3,m4,m5
%else
    ; output narrower than input: convert each plane, then interleave words
    mov%3     m0, [         srcq +(1<<%5)*lenq]
    mov%3     m1, [mmsize + srcq +(1<<%5)*lenq]
    mov%3     m2, [         src2q+(1<<%5)*lenq]
    mov%3     m3, [mmsize + src2q+(1<<%5)*lenq]
    %6 m0,m1,m2,m3,m4,m5
    mova      m2, m0
    punpcklwd m0, m1
    punpckhwd m2, m1
    SWAP 1,2
%endif
    mov%3 [           dstq+(2<<%4)*lenq], m0
    mov%3 [  mmsize + dstq+(2<<%4)*lenq], m1
%if %4 > %5
    mov%3 [2*mmsize + dstq+(2<<%4)*lenq], m2
    mov%3 [3*mmsize + dstq+(2<<%4)*lenq], m3
    add lenq, 4*mmsize/(2<<%4)
%else
    add lenq, 2*mmsize/(2<<%4)
%endif
        jl .next
    REP_RET
%endmacro

; UNPACK_2CH: split the interleaved plane src[0] into two destination planes
; (dst[0], dst[1]), optionally converting the sample format on the way.
%macro UNPACK_2CH 5-7
cglobal unpack_2ch_%2_to_%1_%3, 3, 4, 7, dst, src, len, dst2
    mov dst2q   , [dstq+gprsize]
    mov srcq    , [srcq]
    mov dstq    , [dstq]
%ifidn %3, a
    test dstq, mmsize-1
        jne unpack_2ch_%2_to_%1_u_int %+ SUFFIX
    test srcq, mmsize-1
        jne unpack_2ch_%2_to_%1_u_int %+ SUFFIX
    test dst2q, mmsize-1
        jne unpack_2ch_%2_to_%1_u_int %+ SUFFIX
%else
unpack_2ch_%2_to_%1_u_int %+ SUFFIX:
%endif
    ; point past the end of each buffer and count len up from -len to 0
    lea     srcq , [srcq  + (2<<%5)*lenq]
    lea     dstq , [dstq  + (1<<%4)*lenq]
    lea     dst2q, [dst2q + (1<<%4)*lenq]
    neg     lenq
    %7 m0,m1,m2,m3,m4,m5
    mova      m6, [word_unpack_shuf]
.next:
    mov%3     m0, [           srcq +(2<<%5)*lenq]
    mov%3     m2, [  mmsize + srcq +(2<<%5)*lenq]
%if %5 == 1
%ifidn SUFFIX, _ssse3
    ; ssse3: one byte shuffle per register separates even and odd words
    pshufb    m0, m6
    mova      m1, m0
    pshufb    m2, m6
    punpcklqdq m0,m2
    punpckhqdq m1,m2
%else
    ; three rounds of word interleaving separate even and odd words
    mova      m1, m0
    punpcklwd m0,m2
    punpckhwd m1,m2

    mova      m2, m0
    punpcklwd m0,m1
    punpckhwd m2,m1

    mova      m1, m0
    punpcklwd m0,m2
    punpckhwd m1,m2
%endif
%else
    ; 32-bit samples: shufps picks even dwords into m0, odd dwords into m1
    mova      m1, m0
    shufps    m0, m2, 10001000b
    shufps    m1, m2, 11011101b
%endif
%if %4 < %5
    mov%3     m2, [2*mmsize + srcq +(2<<%5)*lenq]
    mova      m3, m2
    mov%3     m4, [3*mmsize + srcq +(2<<%5)*lenq]
    shufps    m2, m4, 10001000b
    shufps    m3, m4, 11011101b
    SWAP 1,2
%endif
    %6 m0,m1,m2,m3,m4,m5
    mov%3 [           dstq+(1<<%4)*lenq], m0
%if %4 > %5
    mov%3 [          dst2q+(1<<%4)*lenq], m2
    mov%3 [ mmsize +  dstq+(1<<%4)*lenq], m1
    mov%3 [ mmsize + dst2q+(1<<%4)*lenq], m3
    add lenq, 2*mmsize/(1<<%4)
%else
    mov%3 [          dst2q+(1<<%4)*lenq], m1
    add lenq, mmsize/(1<<%4)
%endif
        jl .next
    REP_RET
%endmacro

; CONV: convert a single plane src[0] into dst[0] without any interleaving.
%macro CONV 5-7
cglobal %2_to_%1_%3, 3, 3, 6, dst, src, len
    mov srcq    , [srcq]
    mov dstq    , [dstq]
%ifidn %3, a
    test dstq, mmsize-1
        jne %2_to_%1_u_int %+ SUFFIX
    test srcq, mmsize-1
        jne %2_to_%1_u_int %+ SUFFIX
%else
%2_to_%1_u_int %+ SUFFIX:
%endif
    ; point past the end of each buffer and count len up from -len to 0
    lea     srcq , [srcq  + (1<<%5)*lenq]
    lea     dstq , [dstq  + (1<<%4)*lenq]
    neg     lenq
    %7 m0,m1,m2,m3,m4,m5
.next:
    mov%3     m0, [           srcq +(1<<%5)*lenq]
    mov%3     m1, [  mmsize + srcq +(1<<%5)*lenq]
%if %4 < %5
    mov%3     m2, [2*mmsize + srcq +(1<<%5)*lenq]
    mov%3     m3, [3*mmsize + srcq +(1<<%5)*lenq]
%endif
    %6 m0,m1,m2,m3,m4,m5
    mov%3 [           dstq+(1<<%4)*lenq], m0
    mov%3 [  mmsize + dstq+(1<<%4)*lenq], m1
%if %4 > %5
    mov%3 [2*mmsize + dstq+(1<<%4)*lenq], m2
    mov%3 [3*mmsize + dstq+(1<<%4)*lenq], m3
    add lenq, 4*mmsize/(1<<%4)
%else
    add lenq, 2*mmsize/(1<<%4)
%endif
        jl .next
%if mmsize == 8
    emms
    RET
%else
    REP_RET
%endif
%endmacro

; PACK_6CH: interleave six source planes (src[0] .. src[5]) into dst[0],
; optionally converting between 32-bit sample formats (sse4 and later path).
; The mmx path only moves data and is therefore instantiated for
; float to float only.
;
; Eight vector registers are declared because the conversion variants keep
; their scale constant in m7 for the whole loop; declaring fewer would leave
; xmm7 unsaved on ABIs where it is callee-saved.
%macro PACK_6CH 5-7
cglobal pack_6ch_%2_to_%1_%3, 2,8,8, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
    mov     lend, r2d
%else
    ; not enough GPRs on x86_32: keep len in its stack slot
    %define lend dword r2m
%endif
    mov    src1q, [srcq+1*gprsize]
    mov    src2q, [srcq+2*gprsize]
    mov    src3q, [srcq+3*gprsize]
    mov    src4q, [srcq+4*gprsize]
    mov    src5q, [srcq+5*gprsize]
    mov     srcq, [srcq]
    mov     dstq, [dstq]
%ifidn %3, a
    ; every pointer that is accessed with aligned moves must be checked,
    ; including src1
    test dstq, mmsize-1
        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test srcq, mmsize-1
        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test src1q, mmsize-1
        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test src2q, mmsize-1
        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test src3q, mmsize-1
        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test src4q, mmsize-1
        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test src5q, mmsize-1
        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
%else
pack_6ch_%2_to_%1_u_int %+ SUFFIX:
%endif
    ; turn src1..src5 into offsets relative to src so that only src has to
    ; be advanced inside the loop
    sub    src1q, srcq
    sub    src2q, srcq
    sub    src3q, srcq
    sub    src4q, srcq
    sub    src5q, srcq
    ; loop-invariant: load the conversion constant (if any) into m7 once
    %7 x,x,x,x,m7,x
.loop:
    mov%3     m0, [srcq      ]
    mov%3     m1, [srcq+src1q]
    mov%3     m2, [srcq+src2q]
    mov%3     m3, [srcq+src3q]
    mov%3     m4, [srcq+src4q]
    mov%3     m5, [srcq+src5q]
%if cpuflag(sse4)
    SBUTTERFLYPS 0, 1, 6
    SBUTTERFLYPS 2, 3, 6
    SBUTTERFLYPS 4, 5, 6

    blendps   m6, m4, m0, 1100b
    movlhps   m0, m2
    movhlps   m4, m2
    blendps   m2, m5, m1, 1100b
    movlhps   m1, m3
    movhlps   m5, m3

    ; m3 is free from here on and serves as the conversion scratch register
    %6 m0,m6,x,x,m7,m3
    %6 m4,m1,x,x,m7,m3
    %6 m2,m5,x,x,m7,m3

    mov %+ %3 %+ ps [dstq   ], m0
    mov %+ %3 %+ ps [dstq+16], m6
    mov %+ %3 %+ ps [dstq+32], m4
    mov %+ %3 %+ ps [dstq+48], m1
    mov %+ %3 %+ ps [dstq+64], m2
    mov %+ %3 %+ ps [dstq+80], m5
%else ; mmx
    SBUTTERFLY dq, 0, 1, 6
    SBUTTERFLY dq, 2, 3, 6
    SBUTTERFLY dq, 4, 5, 6

    movq   [dstq   ], m0
    movq   [dstq+ 8], m2
    movq   [dstq+16], m4
    movq   [dstq+24], m1
    movq   [dstq+32], m3
    movq   [dstq+40], m5
%endif
    add      srcq, mmsize
    add      dstq, mmsize*6
    sub      lend, mmsize/4
    jg .loop
%if mmsize == 8
    emms
    RET
%else
    REP_RET
%endif
%endmacro

;------------------------------------------------------------------------------
; Per-sample conversion macros. Each *_N macro takes six register arguments
; and each *_INIT macro loads the constant the matching *_N macro expects.
;------------------------------------------------------------------------------

; int16 in m0,m1 -> int32 in m0..m3. Unpacking against zero places each
; 16-bit sample in the upper half of its dword, i.e. sample << 16.
%macro INT16_TO_INT32_N 6
    pxor      m2, m2
    pxor      m3, m3
    punpcklwd m2, m1
    punpckhwd m3, m1
    SWAP 4,0
    pxor      m0, m0
    pxor      m1, m1
    punpcklwd m0, m4
    punpckhwd m1, m4
%endmacro

; int32 in m0..m3 -> int16 in m0,m1 (arithmetic shift by 16, signed pack).
%macro INT32_TO_INT16_N 6
    psrad     m0, 16
    psrad     m1, 16
    psrad     m2, 16
    psrad     m3, 16
    packssdw  m0, m1
    packssdw  m2, m3
    SWAP 1,2
%endmacro

; Loads the 2^-31 scale into the fifth register argument.
%macro INT32_TO_FLOAT_INIT 6
    mova      %5, [flt2pm31]
%endmacro
; int32 -> float for the first two register arguments.
%macro INT32_TO_FLOAT_N 6
    cvtdq2ps  %1, %1
    cvtdq2ps  %2, %2
    mulps     %1, %1, %5
    mulps     %2, %2, %5
%endmacro

; Loads the 2^31 scale into the fifth register argument.
%macro FLOAT_TO_INT32_INIT 6
    mova      %5, [flt2p31]
%endmacro
; float -> int32 for the first two register arguments, with the sixth as
; scratch. Lanes whose scaled value is not below 2^31 get an all-ones mask
; from cmpnltps, which is added to the cvtps2dq result for that lane.
%macro FLOAT_TO_INT32_N 6
    mulps     %1, %5
    mulps     %2, %5
    cvtps2dq  %6, %1
    cmpnltps  %1, %5
    paddd     %1, %6
    cvtps2dq  %6, %2
    cmpnltps  %2, %5
    paddd     %2, %6
%endmacro

; int16 -> float goes through INT16_TO_INT32_N (sample << 16), hence the
; 2^-31 scale rather than 2^-15.
%macro INT16_TO_FLOAT_INIT 6
    mova      m5, [flt2pm31]
%endmacro
%macro INT16_TO_FLOAT_N 6
    INT16_TO_INT32_N %1,%2,%3,%4,%5,%6
    cvtdq2ps  m0, m0
    cvtdq2ps  m1, m1
    cvtdq2ps  m2, m2
    cvtdq2ps  m3, m3
    mulps     m0, m0, m5
    mulps     m1, m1, m5
    mulps     m2, m2, m5
    mulps     m3, m3, m5
%endmacro

; Loads the 2^15 scale into m5.
%macro FLOAT_TO_INT16_INIT 6
    mova      m5, [flt2p15]
%endmacro
; float in m0..m3 -> int16 in m0,m1 (signed saturating pack).
%macro FLOAT_TO_INT16_N 6
    mulps     m0, m5
    mulps     m1, m5
    mulps     m2, m5
    mulps     m3, m5
    cvtps2dq  m0, m0
    cvtps2dq  m1, m1
    packssdw  m0, m1
    cvtps2dq  m1, m2
    cvtps2dq  m3, m3
    packssdw  m1, m3
%endmacro

; Placeholder for variants that need no conversion and/or no init.
%macro NOP_N 0-6
%endmacro

; For every format pair the unaligned (u) variant is instantiated before the
; aligned (a) one, because the aligned variant jumps to the label that the
; unaligned variant defines.

INIT_MMX mmx
CONV int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
CONV int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
CONV int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
CONV int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N

PACK_6CH float, float, u, 2, 2, NOP_N, NOP_N
PACK_6CH float, float, a, 2, 2, NOP_N, NOP_N

INIT_XMM sse2
CONV int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
CONV int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
CONV int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
CONV int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N

PACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N
PACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N
PACK_2CH int32, int32, u, 2, 2, NOP_N, NOP_N
PACK_2CH int32, int32, a, 2, 2, NOP_N, NOP_N
PACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
PACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
PACK_2CH int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
PACK_2CH int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N

UNPACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N
; SSE2 instantiations (continued). Arguments are:
;   to, from, a/u, log2 output size, log2 input size, conversion, init.
; Each unaligned (u) variant precedes its aligned (a) counterpart because the
; aligned variant jumps to a label emitted by the unaligned one.

; 2-channel deinterleave, integer formats
UNPACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N
UNPACK_2CH int32, int32, u, 2, 2, NOP_N, NOP_N
UNPACK_2CH int32, int32, a, 2, 2, NOP_N, NOP_N
UNPACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
UNPACK_2CH int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N

; single-plane conversions involving float
CONV float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
CONV float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
CONV int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
CONV int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
CONV float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
CONV float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
CONV int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
CONV int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT

; 2-channel interleave involving float
PACK_2CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_2CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_2CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_2CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
PACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
PACK_2CH int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
PACK_2CH int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT

; 2-channel deinterleave involving float
UNPACK_2CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_2CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_2CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_2CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
; Last two SSE2 instantiations: 2-channel deinterleave, float to int16.
UNPACK_2CH int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
UNPACK_2CH int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT


; SSSE3: only the variants with 16-bit input are instantiated again, since
; UNPACK_2CH switches to its pshufb path only for 16-bit input under the
; _ssse3 suffix.
INIT_XMM ssse3
UNPACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N
UNPACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N
UNPACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT

; SSE4: 6-channel interleave; PACK_6CH selects its blendps path via
; cpuflag(sse4), which is also where the conversion macro is applied.
INIT_XMM sse4
PACK_6CH float, float, u, 2, 2, NOP_N, NOP_N
PACK_6CH float, float, a, 2, 2, NOP_N, NOP_N

PACK_6CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_6CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

; AVX variants are assembled only when HAVE_AVX_EXTERNAL is set.
%if HAVE_AVX_EXTERNAL
; 128-bit AVX: same 6-channel interleaves as the SSE4 set
INIT_XMM avx
PACK_6CH float, float, u, 2, 2, NOP_N, NOP_N
PACK_6CH float, float, a, 2, 2, NOP_N, NOP_N

PACK_6CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_6CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

; 256-bit AVX: int32 to float single-plane conversion
INIT_YMM avx
CONV float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
CONV float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
%endif