;******************************************************************************
;* Copyright (c) 2012 Michael Niedermayer
;* Copyright (c) 2014 James Almer <jamrial@gmail.com>
;* Copyright (c) 2014 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

%if ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif

struc ResampleContext
    .av_class:              pointer 1
    .filter_bank:           pointer 1
    .filter_length:         resd 1
    .filter_alloc:          resd 1
    .ideal_dst_incr:        resd 1
    .dst_incr:              resd 1
    .dst_incr_div:          resd 1
    .dst_incr_mod:          resd 1
    .index:                 resd 1
    .frac:                  resd 1
    .src_incr:              resd 1
    .compensation_distance: resd 1
    .phase_shift:           resd 1
    .phase_mask:            resd 1

    ; there are a few more fields here, but we only care about the first few
endstruc

SECTION_RODATA

pf_1:      dd 1.0
pdbl_1:    dq 1.0
pd_0x4000: dd 0x4000

SECTION .text

%macro RESAMPLE_FNS 3-5 ; format [float/double/int16], bps, log2_bps, float op suffix [s or d], 1.0 constant
; int resample_common_$format(ResampleContext *ctx, $format *dst,
;                             const $format *src, int size, int update_ctx)
%if ARCH_X86_64 ; unix64 and win64
cglobal resample_common_%1, 0, 15, 2, ctx, dst, src, phase_shift, index, frac, \
                                      dst_incr_mod, size, min_filter_count_x4, \
                                      min_filter_len_x4, dst_incr_div, src_incr, \
                                      phase_mask, dst_end, filter_bank

    ; use red-zone for variable storage
%define ctx_stackq            [rsp-0x8]
%define src_stackq            [rsp-0x10]
%if WIN64
%define update_context_stackd r4m
%else ; unix64
%define update_context_stackd [rsp-0x14]
%endif

    ; load as many variables in registers as possible; for the rest, store
    ; on stack so that we have 'ctx' available as one extra register
    mov      sized, r3d
    mov      phase_maskd, [ctxq+ResampleContext.phase_mask]
%if UNIX64
    mov      update_context_stackd, r4d
%endif
    mov      indexd, [ctxq+ResampleContext.index]
    mov      fracd, [ctxq+ResampleContext.frac]
    mov      dst_incr_modd, [ctxq+ResampleContext.dst_incr_mod]
    mov      filter_bankq, [ctxq+ResampleContext.filter_bank]
    mov      src_incrd, [ctxq+ResampleContext.src_incr]
    mov      ctx_stackq, ctxq
    mov      min_filter_len_x4d, [ctxq+ResampleContext.filter_length]
    mov      dst_incr_divd, [ctxq+ResampleContext.dst_incr_div]
    shl      min_filter_len_x4d, %3
    lea      dst_endq, [dstq+sizeq*%2]

%if UNIX64
    mov      ecx, [ctxq+ResampleContext.phase_shift]
    mov      edi, [ctxq+ResampleContext.filter_alloc]

    DEFINE_ARGS filter_alloc, dst, src, phase_shift, index, frac, dst_incr_mod, \
                filter, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
                src_incr, phase_mask, dst_end, filter_bank
%elif WIN64
    mov      R9d, [ctxq+ResampleContext.filter_alloc]
    mov      ecx, [ctxq+ResampleContext.phase_shift]

    DEFINE_ARGS phase_shift, dst, src, filter_alloc, index, frac, dst_incr_mod, \
                filter, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
                src_incr, phase_mask, dst_end, filter_bank
%endif

    neg      min_filter_len_x4q
    sub      filter_bankq, min_filter_len_x4q
    sub      srcq, min_filter_len_x4q
    mov      src_stackq, srcq
%else ; x86-32
cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
                                     index, min_filter_length_x4, filter_bank

    ; push temp variables to stack
%define ctx_stackq            r0mp
%define src_stackq            r2mp
%define update_context_stackd r4m

    mov      dstq, r1mp
    mov      r3, r3mp
    lea      r3, [dstq+r3*%2]
    PUSH     dword [ctxq+ResampleContext.dst_incr_div]
    PUSH     dword [ctxq+ResampleContext.dst_incr_mod]
    PUSH     dword [ctxq+ResampleContext.filter_alloc]
    PUSH     r3
    PUSH     dword [ctxq+ResampleContext.phase_mask]
    PUSH     dword [ctxq+ResampleContext.src_incr]
    mov      min_filter_length_x4d, [ctxq+ResampleContext.filter_length]
    mov      indexd, [ctxq+ResampleContext.index]
    shl      min_filter_length_x4d, %3
    mov      fracd, [ctxq+ResampleContext.frac]
    neg      min_filter_length_x4q
    mov      filter_bankq, [ctxq+ResampleContext.filter_bank]
    sub      r2mp, min_filter_length_x4q
    sub      filter_bankq, min_filter_length_x4q
    PUSH     min_filter_length_x4q
    PUSH     filter_bankq
    mov      phase_shiftd, [ctxq+ResampleContext.phase_shift]

    DEFINE_ARGS src, phase_shift, dst, frac, index, min_filter_count_x4, filter

%define filter_bankq          dword [rsp+0x0]
%define min_filter_length_x4q dword [rsp+0x4]
%define src_incrd             dword [rsp+0x8]
%define phase_maskd           dword [rsp+0xc]
%define dst_endq              dword [rsp+0x10]
%define filter_allocd         dword [rsp+0x14]
%define dst_incr_modd         dword [rsp+0x18]
%define dst_incr_divd         dword [rsp+0x1c]

    mov      srcq, r2mp
%endif

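; Rough scalar sketch of what each iteration of the loop below computes (a
; paraphrase of the C reference, not the exact template; names are descriptive):
;     while (dst < dst_end) {
;         filter = filter_bank + index * filter_alloc;   // select filter phase
;         val    = 0;                          // 0x4000 rounding bias for int16
;         for (i = 0; i < filter_length; i++)
;             val += src[i] * filter[i];       // vectorized in .inner_loop
;         *dst++ = val;                        // >> 15 and saturate for int16
;         frac  += dst_incr_mod;
;         index += dst_incr_div;
;         if (frac >= src_incr) { frac -= src_incr; index++; }
;         src   += index >> phase_shift;       // uses index before masking
;         index &= phase_mask;
;     }
; Note that the prologue biases src and filter_bank upwards by filter_length
; elements, so the inner-loop counter can run from -filter_length*bps up to 0
; and double as the loop-termination test (js .inner_loop).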
.loop:
    mov      filterd, filter_allocd
    imul     filterd, indexd
%if ARCH_X86_64
    mov      min_filter_count_x4q, min_filter_len_x4q
    lea      filterq, [filter_bankq+filterq*%2]
%else ; x86-32
    mov      min_filter_count_x4q, filter_bankq
    lea      filterq, [min_filter_count_x4q+filterq*%2]
    mov      min_filter_count_x4q, min_filter_length_x4q
%endif
%ifidn %1, int16
    movd     m0, [pd_0x4000]
%else ; float/double
    xorps    m0, m0, m0
%endif

    align 16
.inner_loop:
    movu     m1, [srcq+min_filter_count_x4q*1]
%ifidn %1, int16
    PMADCSWD m0, m1, [filterq+min_filter_count_x4q*1], m0, m1
%else ; float/double
%if cpuflag(fma4) || cpuflag(fma3)
    fmaddp%4 m0, m1, [filterq+min_filter_count_x4q*1], m0
%else
    mulp%4   m1, m1, [filterq+min_filter_count_x4q*1]
    addp%4   m0, m0, m1
%endif ; cpuflag
%endif
    add      min_filter_count_x4q, mmsize
    js       .inner_loop

%ifidn %1, int16
    HADDD    m0, m1
    psrad    m0, 15
    add      fracd, dst_incr_modd
    packssdw m0, m0
    add      indexd, dst_incr_divd
    movd     [dstq], m0
%else ; float/double
    ; horizontal sum & store
%if mmsize == 32
    vextractf128 xm1, m0, 0x1
    addps    xm0, xm1
%endif
    movhlps  xm1, xm0
%ifidn %1, float
    addps    xm0, xm1
    shufps   xm1, xm0, xm0, q0001
%endif
    add      fracd, dst_incr_modd
    addp%4   xm0, xm1
    add      indexd, dst_incr_divd
    movs%4   [dstq], xm0
%endif
    cmp      fracd, src_incrd
    jl       .skip
    sub      fracd, src_incrd
    inc      indexd

%if UNIX64
    DEFINE_ARGS filter_alloc, dst, src, phase_shift, index, frac, dst_incr_mod, \
                index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
                src_incr, phase_mask, dst_end, filter_bank
%elif WIN64
    DEFINE_ARGS phase_shift, dst, src, filter_alloc, index, frac, dst_incr_mod, \
                index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
                src_incr, phase_mask, dst_end, filter_bank
%else ; x86-32
    DEFINE_ARGS src, phase_shift, dst, frac, index, index_incr
%endif

.skip:
    mov      index_incrd, indexd
    add      dstq, %2
    and      indexd, phase_maskd
    sar      index_incrd, phase_shiftb
    lea      srcq, [srcq+index_incrq*%2]
    cmp      dstq, dst_endq
    jne      .loop

%if ARCH_X86_64
    DEFINE_ARGS ctx, dst, src, phase_shift, index, frac
%else ; x86-32
    DEFINE_ARGS src, ctx, update_context, frac, index
%endif

    cmp      dword update_context_stackd, 0
    jz       .skip_store
    ; strictly speaking, the function should always return the number of
    ; consumed source samples; however, we only use the value if update_context
    ; is true, so let's just leave it uninitialized otherwise
    mov      ctxq, ctx_stackq
    movifnidn rax, srcq
    mov      [ctxq+ResampleContext.frac ], fracd
    sub      rax, src_stackq
    mov      [ctxq+ResampleContext.index], indexd
    shr      rax, %3

.skip_store:
%if ARCH_X86_32
    ADD      rsp, 0x20
%endif
    RET

; int resample_linear_$format(ResampleContext *ctx, $format *dst,
;                             const $format *src, int size, int update_ctx)
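; Same structure as resample_common, but each output sample is built from two
; adjacent filter phases (filter2 = filter1 + filter_alloc entries) and the two
; dot products are linearly interpolated with frac:
;     val = val1 + (val2 - val1) * frac / src_incr
; For float/double, m4 is preloaded with 1.0/src_incr so the loop only needs a
; multiply; for int16, m4 holds the 0x4000 rounding bias and the interpolation
; is done in GPRs with imul/idiv.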
%if ARCH_X86_64 ; unix64 and win64
%if UNIX64
cglobal resample_linear_%1, 0, 15, 5, ctx, dst, phase_mask, phase_shift, index, frac, \
                                      size, dst_incr_mod, min_filter_count_x4, \
                                      min_filter_len_x4, dst_incr_div, src_incr, \
                                      src, dst_end, filter_bank

    mov      srcq, r2mp
%else ; win64
cglobal resample_linear_%1, 0, 15, 5, ctx, phase_mask, src, phase_shift, index, frac, \
                                      size, dst_incr_mod, min_filter_count_x4, \
                                      min_filter_len_x4, dst_incr_div, src_incr, \
                                      dst, dst_end, filter_bank

    mov      dstq, r1mp
%endif

    ; use red-zone for variable storage
%define ctx_stackq            [rsp-0x8]
%define src_stackq            [rsp-0x10]
%define phase_mask_stackd     [rsp-0x14]
%if WIN64
%define update_context_stackd r4m
%else ; unix64
%define update_context_stackd [rsp-0x18]
%endif

    ; load as many variables in registers as possible; for the rest, store
    ; on stack so that we have 'ctx' available as one extra register
    mov      sized, r3d
    mov      phase_maskd, [ctxq+ResampleContext.phase_mask]
%if UNIX64
    mov      update_context_stackd, r4d
%endif
    mov      indexd, [ctxq+ResampleContext.index]
    mov      fracd, [ctxq+ResampleContext.frac]
    mov      dst_incr_modd, [ctxq+ResampleContext.dst_incr_mod]
    mov      filter_bankq, [ctxq+ResampleContext.filter_bank]
    mov      src_incrd, [ctxq+ResampleContext.src_incr]
    mov      ctx_stackq, ctxq
    mov      phase_mask_stackd, phase_maskd
    mov      min_filter_len_x4d, [ctxq+ResampleContext.filter_length]
%ifidn %1, int16
    movd     m4, [pd_0x4000]
%else ; float/double
    cvtsi2s%4 xm0, src_incrd
    movs%4   xm4, [%5]
    divs%4   xm4, xm0
%endif
    mov      dst_incr_divd, [ctxq+ResampleContext.dst_incr_div]
    shl      min_filter_len_x4d, %3
    lea      dst_endq, [dstq+sizeq*%2]

%if UNIX64
    mov      ecx, [ctxq+ResampleContext.phase_shift]
    mov      edi, [ctxq+ResampleContext.filter_alloc]

    DEFINE_ARGS filter_alloc, dst, filter2, phase_shift, index, frac, filter1, \
                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
                dst_incr_div, src_incr, src, dst_end, filter_bank
%elif WIN64
    mov      R9d, [ctxq+ResampleContext.filter_alloc]
    mov      ecx, [ctxq+ResampleContext.phase_shift]

    DEFINE_ARGS phase_shift, filter2, src, filter_alloc, index, frac, filter1, \
                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
                dst_incr_div, src_incr, dst, dst_end, filter_bank
%endif

    neg      min_filter_len_x4q
    sub      filter_bankq, min_filter_len_x4q
    sub      srcq, min_filter_len_x4q
    mov      src_stackq, srcq
%else ; x86-32
cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
                                     frac, index, dst, filter_bank

    ; push temp variables to stack
%define ctx_stackq            r0mp
%define src_stackq            r2mp
%define update_context_stackd r4m

    mov      dstq, r1mp
    mov      r3, r3mp
    lea      r3, [dstq+r3*%2]
    PUSH     dword [ctxq+ResampleContext.dst_incr_div]
    PUSH     r3
    mov      r3, dword [ctxq+ResampleContext.filter_alloc]
    PUSH     dword [ctxq+ResampleContext.dst_incr_mod]
    PUSH     r3
    shl      r3, %3
    PUSH     r3
    mov      r3, dword [ctxq+ResampleContext.src_incr]
    PUSH     dword [ctxq+ResampleContext.phase_mask]
    PUSH     r3d
%ifidn %1, int16
    movd     m4, [pd_0x4000]
%else ; float/double
    cvtsi2s%4 xm0, r3d
    movs%4   xm4, [%5]
    divs%4   xm4, xm0
%endif
    mov      min_filter_length_x4d, [ctxq+ResampleContext.filter_length]
    mov      indexd, [ctxq+ResampleContext.index]
    shl      min_filter_length_x4d, %3
    mov      fracd, [ctxq+ResampleContext.frac]
    neg      min_filter_length_x4q
    mov      filter_bankq, [ctxq+ResampleContext.filter_bank]
    sub      r2mp, min_filter_length_x4q
    sub      filter_bankq, min_filter_length_x4q
    PUSH     min_filter_length_x4q
    PUSH     filter_bankq
    PUSH     dword [ctxq+ResampleContext.phase_shift]

    DEFINE_ARGS filter1, min_filter_count_x4, filter2, frac, index, dst, src

%define phase_shift_stackd    dword [rsp+0x0]
%define filter_bankq          dword [rsp+0x4]
%define min_filter_length_x4q dword [rsp+0x8]
%define src_incrd             dword [rsp+0xc]
%define phase_mask_stackd     dword [rsp+0x10]
%define filter_alloc_x4q      dword [rsp+0x14]
%define filter_allocd         dword [rsp+0x18]
%define dst_incr_modd         dword [rsp+0x1c]
%define dst_endq              dword [rsp+0x20]
%define dst_incr_divd         dword [rsp+0x24]

    mov      srcq, r2mp
%endif

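    ; In the loop below, m0/m2 accumulate the filter1/filter2 dot products,
    ; m1 holds the loaded source samples and m3 is scratch on the paths
    ; without FMA/XOP; m4 was set up in the prologue above.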
.loop:
    mov      filter1d, filter_allocd
    imul     filter1d, indexd
%if ARCH_X86_64
    mov      min_filter_count_x4q, min_filter_len_x4q
    lea      filter1q, [filter_bankq+filter1q*%2]
    lea      filter2q, [filter1q+filter_allocq*%2]
%else ; x86-32
    mov      min_filter_count_x4q, filter_bankq
    lea      filter1q, [min_filter_count_x4q+filter1q*%2]
    mov      min_filter_count_x4q, min_filter_length_x4q
    mov      filter2q, filter1q
    add      filter2q, filter_alloc_x4q
%endif
%ifidn %1, int16
    mova     m0, m4
    mova     m2, m4
%else ; float/double
    xorps    m0, m0, m0
    xorps    m2, m2, m2
%endif

    align 16
.inner_loop:
    movu     m1, [srcq+min_filter_count_x4q*1]
%ifidn %1, int16
%if cpuflag(xop)
    vpmadcswd m2, m1, [filter2q+min_filter_count_x4q*1], m2
    vpmadcswd m0, m1, [filter1q+min_filter_count_x4q*1], m0
%else
    pmaddwd  m3, m1, [filter2q+min_filter_count_x4q*1]
    pmaddwd  m1, [filter1q+min_filter_count_x4q*1]
    paddd    m2, m3
    paddd    m0, m1
%endif ; cpuflag
%else ; float/double
%if cpuflag(fma4) || cpuflag(fma3)
    fmaddp%4 m2, m1, [filter2q+min_filter_count_x4q*1], m2
    fmaddp%4 m0, m1, [filter1q+min_filter_count_x4q*1], m0
%else
    mulp%4   m3, m1, [filter2q+min_filter_count_x4q*1]
    mulp%4   m1, m1, [filter1q+min_filter_count_x4q*1]
    addp%4   m2, m2, m3
    addp%4   m0, m0, m1
%endif ; cpuflag
%endif
    add      min_filter_count_x4q, mmsize
    js       .inner_loop

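    ; Horizontal-sum both accumulators, then interpolate between them:
    ; the int16 path moves the two sums to GPRs and uses imul/idiv (see the
    ; register note below), while float/double broadcast frac * (1.0/src_incr)
    ; from xm4 so that no division is needed inside the loop.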
%ifidn %1, int16
%if mmsize == 16
%if cpuflag(xop)
    vphadddq m2, m2
    vphadddq m0, m0
%endif
    pshufd   m3, m2, q0032
    pshufd   m1, m0, q0032
    paddd    m2, m3
    paddd    m0, m1
%endif
%if notcpuflag(xop)
    PSHUFLW  m3, m2, q0032
    PSHUFLW  m1, m0, q0032
    paddd    m2, m3
    paddd    m0, m1
%endif
    psubd    m2, m0
    ; This is probably a really bad idea on Atom and other machines with a
    ; long transfer latency between GPRs and XMMs. However, it does make the
    ; clipping a lot simpler...
    movd     eax, m2
    add      indexd, dst_incr_divd
    imul     fracd
    idiv     src_incrd
    movd     m1, eax
    add      fracd, dst_incr_modd
    paddd    m0, m1
    psrad    m0, 15
    packssdw m0, m0
    movd     [dstq], m0

    ; note that for imul/idiv, I need to move filter to edx/eax for each:
    ; - 32bit: eax=r0[filter1], edx=r2[filter2]
    ; - win64: eax=r6[filter1], edx=r1[todo]
    ; - unix64: eax=r6[filter1], edx=r2[todo]
%else ; float/double
    ; val += (v2 - val) * (FELEML) frac / c->src_incr;
%if mmsize == 32
    vextractf128 xm1, m0, 0x1
    vextractf128 xm3, m2, 0x1
    addps    xm0, xm1
    addps    xm2, xm3
%endif
    cvtsi2s%4 xm1, fracd
    subp%4   xm2, xm0
    mulp%4   xm1, xm4
    shufp%4  xm1, xm1, q0000
%if cpuflag(fma4) || cpuflag(fma3)
    fmaddp%4 xm0, xm2, xm1, xm0
%else
    mulp%4   xm2, xm1
    addp%4   xm0, xm2
%endif ; cpuflag

    ; horizontal sum & store
    movhlps  xm1, xm0
%ifidn %1, float
    addps    xm0, xm1
    shufps   xm1, xm0, xm0, q0001
%endif
    add      fracd, dst_incr_modd
    addp%4   xm0, xm1
    add      indexd, dst_incr_divd
    movs%4   [dstq], xm0
%endif
    cmp      fracd, src_incrd
    jl       .skip
    sub      fracd, src_incrd
    inc      indexd

%if UNIX64
    DEFINE_ARGS filter_alloc, dst, filter2, phase_shift, index, frac, index_incr, \
                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
                dst_incr_div, src_incr, src, dst_end, filter_bank
%elif WIN64
    DEFINE_ARGS phase_shift, filter2, src, filter_alloc, index, frac, index_incr, \
                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
                dst_incr_div, src_incr, dst, dst_end, filter_bank
%else ; x86-32
    DEFINE_ARGS filter1, phase_shift, index_incr, frac, index, dst, src
%endif

.skip:
%if ARCH_X86_32
    mov      phase_shiftd, phase_shift_stackd
%endif
    mov      index_incrd, indexd
    add      dstq, %2
    and      indexd, phase_mask_stackd
    sar      index_incrd, phase_shiftb
    lea      srcq, [srcq+index_incrq*%2]
    cmp      dstq, dst_endq
    jne      .loop

%if UNIX64
    DEFINE_ARGS ctx, dst, filter2, phase_shift, index, frac, index_incr, \
                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
                dst_incr_div, src_incr, src, dst_end, filter_bank
%elif WIN64
    DEFINE_ARGS ctx, filter2, src, phase_shift, index, frac, index_incr, \
                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
                dst_incr_div, src_incr, dst, dst_end, filter_bank
%else ; x86-32
    DEFINE_ARGS filter1, ctx, update_context, frac, index, dst, src
%endif

    cmp      dword update_context_stackd, 0
    jz       .skip_store
    ; strictly speaking, the function should always return the number of
    ; consumed source samples; however, we only use the value if update_context
    ; is true, so let's just leave it uninitialized otherwise
    mov      ctxq, ctx_stackq
    movifnidn rax, srcq
    mov      [ctxq+ResampleContext.frac ], fracd
    sub      rax, src_stackq
    mov      [ctxq+ResampleContext.index], indexd
    shr      rax, %3

.skip_store:
%if ARCH_X86_32
    ADD      rsp, 0x28
%endif
    RET
%endmacro

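; Instantiations: float (bps 4, log2_bps 2, suffix 's', pf_1) for SSE, AVX,
; FMA3 and FMA4; int16 (bps 2, log2_bps 1) for MMXEXT (x86-32 only), SSE2 and
; XOP; double (bps 8, log2_bps 3, suffix 'd', pdbl_1) for SSE2.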
INIT_XMM sse
RESAMPLE_FNS float, 4, 2, s, pf_1

%if HAVE_AVX_EXTERNAL
INIT_YMM avx
RESAMPLE_FNS float, 4, 2, s, pf_1
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
RESAMPLE_FNS float, 4, 2, s, pf_1
%endif
%if HAVE_FMA4_EXTERNAL
INIT_XMM fma4
RESAMPLE_FNS float, 4, 2, s, pf_1
%endif

%if ARCH_X86_32
INIT_MMX mmxext
RESAMPLE_FNS int16, 2, 1
%endif

INIT_XMM sse2
RESAMPLE_FNS int16, 2, 1
%if HAVE_XOP_EXTERNAL
INIT_XMM xop
RESAMPLE_FNS int16, 2, 1
%endif

INIT_XMM sse2
RESAMPLE_FNS double, 8, 3, d, pdbl_1