;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA
pb_f: times 16 db 15
pb_zzzzzzzz77777777: times 8 db -1
pb_7: times 8 db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_revwords: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
pd_16384: times 4 dd 16384

SECTION_TEXT

%macro SCALARPRODUCT 1
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
    shl     orderq, 1
    add     v1q, orderq
    add     v2q, orderq
    neg     orderq
    movd    m3, shiftm
    pxor    m2, m2
.loop:
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd   m2, m0
    paddd   m2, m1
    add     orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m2
    paddd   m2, m0
    psrad   m2, m3
    pshuflw m0, m2, 0x4e
%else
    psrad   m2, m3
    pshufw  m0, m2, 0x4e
%endif
    paddd   m2, m0
    movd    eax, m2
    RET

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
    shl     orderq, 1
    movd    m7, mulm
%if mmsize == 16
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
%else
    pshufw  m7, m7, 0
%endif
    pxor    m6, m6
    add     v1q, orderq
    add     v2q, orderq
    add     v3q, orderq
    neg     orderq
.loop:
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4
    pmaddwd m1, m5
    pmullw  m2, m7
    pmullw  m3, m7
    paddd   m6, m0
    paddd   m6, m1
    paddw   m2, m4
    paddw   m3, m5
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    add     orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m6
    paddd   m6, m0
    pshuflw m0, m6, 0x4e
%else
    pshufw  m0, m6, 0x4e
%endif
    paddd   m6, m0
    movd    eax, m6
    RET
%endmacro

INIT_MMX
SCALARPRODUCT mmx2
INIT_XMM
SCALARPRODUCT sse2
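; For reference, a rough C sketch of what the two functions above compute
; (not part of the build, and not the verbatim Libav C reference):
;
;   int32_t scalarproduct_and_madd_int16(int16_t *v1, const int16_t *v2,
;                                        const int16_t *v3, int order, int mul)
;   {
;       int32_t res = 0;
;       for (int i = 0; i < order; i++) {
;           res   += v1[i] * v2[i];
;           v1[i] += mul * v3[i];   /* 16-bit wrap-around, as pmullw/paddw do */
;       }
;       return res;
;   }
;
; scalarproduct_int16 is the same dot product without the madd step; its
; "shift" argument is applied as an arithmetic right shift (psrad) to the
; packed partial sums before the final horizontal add.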

%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub     orderq, mmsize*2
%if %1
    mova    m1, m4
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1
    palignr m0, m4, %1
    mova    m3, m5
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1
    palignr m2, m5, %1
%else
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%endif
    %define t0  [v1q + orderq]
    %define t1  [v1q + orderq + mmsize]
%ifdef ARCH_X86_64
    mova    m8, t0
    mova    m9, t1
    %define t0  m8
    %define t1  m9
%endif
    pmaddwd m0, t0
    pmaddwd m1, t1
    pmullw  m2, m7
    pmullw  m3, m7
    paddw   m2, t0
    paddw   m3, t1
    paddd   m6, m0
    paddd   m6, m1
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end
%endif
%endmacro

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul
    shl     orderq, 1
    movd    m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
    pxor    m6, m6
    mov     r4d, v2d
    and     r4d, 15
    and     v2q, ~15
    and     v3q, ~15
    mova    m4, [v2q + orderq]
    mova    m5, [v3q + orderq]
    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    cmp     r4d, 0
    je .loop0
    cmp     r4d, 2
    je .loop2
    cmp     r4d, 4
    je .loop4
    cmp     r4d, 6
    je .loop6
    cmp     r4d, 8
    je .loop8
    cmp     r4d, 10
    je .loop10
    cmp     r4d, 12
    je .loop12
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
    movhlps m0, m6
    paddd   m6, m0
    pshuflw m0, m6, 0x4e
    paddd   m6, m0
    movd    eax, m6
    RET


;-----------------------------------------------------------------------------
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
;                            const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------
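; As a rough C model of what the routines below compute (a sketch only; the
; exact argument convention -- in particular that "window" holds the first
; half of a symmetric window -- is an assumption, not stated in this file):
;
;   void apply_window_int16(int16_t *output, const int16_t *input,
;                           const int16_t *window, unsigned int len)
;   {
;       unsigned int i, len2 = len >> 1;
;       for (i = 0; i < len2; i++) {
;           output[i]       = (input[i]       * window[i] + (1 << 14)) >> 15;
;           output[len-1-i] = (input[len-1-i] * window[i] + (1 << 14)) >> 15;
;       }
;   }
;
; The %2=1 ("bit exact") instantiations below implement exactly this rounding;
; the plain mmxext/sse2 variants skip the +(1<<14) term, and the ssse3 ones
; get the same rounding from pmulhrsw.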

%macro REVERSE_WORDS_MMXEXT 1-2
    pshufw   %1, %1, 0x1B
%endmacro

%macro REVERSE_WORDS_SSE2 1-2
    pshuflw  %1, %1, 0x1B
    pshufhw  %1, %1, 0x1B
    pshufd   %1, %1, 0x4E
%endmacro

%macro REVERSE_WORDS_SSSE3 2
    pshufb   %1, %2
%endmacro

; dst = (dst * src) >> 15
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
; in from the pmullw result.
%macro MUL16FIXED_MMXEXT 3 ; dst, src, temp
    mova    %3, %1
    pmulhw  %1, %2
    pmullw  %3, %2
    psrlw   %3, 15
    psllw   %1, 1
    por     %1, %3
%endmacro

; dst = ((dst * src) + (1<<14)) >> 15
%macro MUL16FIXED_SSSE3 3 ; dst, src, unused
    pmulhrsw %1, %2
%endmacro

%macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3
cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
    lea     offset2q, [offsetq-mmsize]
%if %2
    mova          m5, [pd_16384]
%elifidn %1, ssse3
    mova          m5, [pb_revwords]
    ALIGN 16
%endif
.loop:
%if %2
    ; This version expands 16-bit to 32-bit, multiplies by the window,
    ; adds 16384 for rounding, right shifts 15, then repacks back to words to
    ; save to the output. The window is reversed for the second half.
    mova          m3, [windowq+offset2q]
    mova          m4, [ inputq+offset2q]
    pxor          m0, m0
    punpcklwd     m0, m3
    punpcklwd     m1, m4
    pmaddwd       m0, m1
    paddd         m0, m5
    psrad         m0, 15
    pxor          m2, m2
    punpckhwd     m2, m3
    punpckhwd     m1, m4
    pmaddwd       m2, m1
    paddd         m2, m5
    psrad         m2, 15
    packssdw      m0, m2
    mova  [outputq+offset2q], m0
    REVERSE_WORDS m3
    mova          m4, [ inputq+offsetq]
    pxor          m0, m0
    punpcklwd     m0, m3
    punpcklwd     m1, m4
    pmaddwd       m0, m1
    paddd         m0, m5
    psrad         m0, 15
    pxor          m2, m2
    punpckhwd     m2, m3
    punpckhwd     m1, m4
    pmaddwd       m2, m1
    paddd         m2, m5
    psrad         m2, 15
    packssdw      m0, m2
    mova  [outputq+offsetq], m0
%elif %3
    ; This version does the 16x16->16 multiplication in-place without expanding
    ; to 32-bit. The ssse3 version is bit-identical.
    mova          m0, [windowq+offset2q]
    mova          m1, [ inputq+offset2q]
    pmulhrsw      m1, m0
    REVERSE_WORDS m0, m5
    pmulhrsw      m0, [ inputq+offsetq ]
    mova  [outputq+offset2q], m1
    mova  [outputq+offsetq ], m0
%else
    ; This version does the 16x16->16 multiplication in-place without expanding
    ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
    ; therefore are not bit-identical to the C version.
    mova          m0, [windowq+offset2q]
    mova          m1, [ inputq+offset2q]
    mova          m2, [ inputq+offsetq ]
    MUL16FIXED    m1, m0, m3
    REVERSE_WORDS m0
    MUL16FIXED    m2, m0, m3
    mova  [outputq+offset2q], m1
    mova  [outputq+offsetq ], m2
%endif
    add      offsetd, mmsize
    sub     offset2d, mmsize
    jae .loop
    REP_RET
%endmacro

INIT_MMX
%define REVERSE_WORDS REVERSE_WORDS_MMXEXT
%define MUL16FIXED MUL16FIXED_MMXEXT
APPLY_WINDOW_INT16 mmxext,     0, 0
APPLY_WINDOW_INT16 mmxext_ba,  1, 0
INIT_XMM
%define REVERSE_WORDS REVERSE_WORDS_SSE2
APPLY_WINDOW_INT16 sse2,       0, 0
APPLY_WINDOW_INT16 sse2_ba,    1, 0
APPLY_WINDOW_INT16 ssse3_atom, 0, 1
%define REVERSE_WORDS REVERSE_WORDS_SSSE3
APPLY_WINDOW_INT16 ssse3,      0, 1


; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
    movq    mm0, [topq]
    movq    mm2, mm0
    movd    mm4, [left_topq]
    psllq   mm2, 8
    movq    mm1, mm0
    por     mm4, mm2
    movd    mm3, [leftq]
    psubb   mm0, mm4 ; t-tl
    add     dstq, wq
    add     topq, wq
    add     diffq, wq
    neg     wq
    jmp .skip
.loop:
    movq    mm4, [topq+wq]
    movq    mm0, mm4
    psllq   mm4, 8
    por     mm4, mm1
    movq    mm1, mm0 ; t
    psubb   mm0, mm4 ; t-tl
.skip:
    movq    mm2, [diffq+wq]
%assign i 0
%rep 8
    movq    mm4, mm0
    paddb   mm4, mm3 ; t-tl+l
    movq    mm5, mm3
    pmaxub  mm3, mm1
    pminub  mm5, mm1
    pminub  mm3, mm4
    pmaxub  mm3, mm5 ; median
    paddb   mm3, mm2 ; +residual
%if i==0
    movq    mm7, mm3
    psllq   mm7, 56
%else
    movq    mm6, mm3
    psrlq   mm7, 8
    psllq   mm6, 56
    por     mm7, mm6
%endif
%if i<7
    psrlq   mm0, 8
    psrlq   mm1, 8
    psrlq   mm2, 8
%endif
%assign i i+1
%endrep
    movq    [dstq+wq], mm7
    add     wq, 8
    jl .loop
    movzx   r2d, byte [dstq-1]
    mov     [leftq], r2d
    movzx   r2d, byte [topq-1]
    mov     [left_topq], r2d
    RET

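; For reference, the median prediction above corresponds to this scalar C
; loop (a sketch, not part of the build; mid() is an illustrative helper
; returning the median of its three arguments, cf. the pmaxub/pminub
; sequence in the %rep block):
;
;   void add_hfyu_median_prediction(uint8_t *dst, const uint8_t *top,
;                                   const uint8_t *diff, int w,
;                                   int *left, int *left_top)
;   {
;       uint8_t l = *left, tl = *left_top;
;       for (int i = 0; i < w; i++) {
;           uint8_t t    = top[i];
;           uint8_t pred = mid(l, t, (uint8_t)(l + t - tl));
;           l      = pred + diff[i];
;           tl     = t;
;           dst[i] = l;
;       }
;       *left = l; *left_top = tl;
;   }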

%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
    add     srcq, wq
    add     dstq, wq
    neg     wq
%%.loop:
    mova    m1, [srcq+wq]
    mova    m2, m1
    psllw   m1, 8
    paddb   m1, m2
    mova    m2, m1
    pshufb  m1, m3
    paddb   m1, m2
    pshufb  m0, m5
    mova    m2, m1
    pshufb  m1, m4
    paddb   m1, m2
%if mmsize == 16
    mova    m2, m1
    pshufb  m1, m6
    paddb   m1, m2
%endif
    paddb   m0, m1
%if %1
    mova    [dstq+wq], m0
%else
    movq    [dstq+wq], m0
    movhps  [dstq+wq+8], m0
%endif
    add     wq, mmsize
    jl %%.loop
    mov     eax, mmsize-1
    sub     eax, wd
    movd    m1, eax
    pshufb  m0, m1
    movd    eax, m0
    RET
%endmacro

; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
INIT_MMX
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
.skip_prologue:
    mova    m5, [pb_7]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    movd    m0, leftm
    psllq   m0, 56
    ADD_HFYU_LEFT_LOOP 1

INIT_XMM
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
    mova    m5, [pb_f]
    mova    m6, [pb_zzzzzzzz77777777]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    movd    m0, leftm
    pslldq  m0, 15
    test    srcq, 15
    jnz add_hfyu_left_prediction_ssse3.skip_prologue
    test    dstq, 15
    jnz .unaligned
    ADD_HFYU_LEFT_LOOP 1
.unaligned:
    ADD_HFYU_LEFT_LOOP 0

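; For reference, the left prediction above is a byte-wise prefix sum
; (a C sketch, not part of the build):
;
;   int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
;   {
;       uint8_t l = left;
;       for (int i = 0; i < w; i++)
;           dst[i] = l = l + src[i];   /* modulo-256 accumulation */
;       return l;                      /* the new "left" value */
;   }
;
; The SIMD loop builds the prefix sum within each vector with the
; shift/shuffle + paddb steps and carries the running total between
; iterations in the top byte of m0 (broadcast by pshufb m0, m5).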

; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
    neg     offsetq
    shl     offsetq, 2
    sub     v1q, offsetq
    sub     v2q, offsetq
    xorps   xmm0, xmm0
.loop:
    movaps  xmm1, [v1q+offsetq]
    mulps   xmm1, [v2q+offsetq]
    addps   xmm0, xmm1
    add     offsetq, 16
    js .loop
    movhlps xmm1, xmm0
    addps   xmm0, xmm1
    movss   xmm1, xmm0
    shufps  xmm0, xmm0, 1
    addss   xmm0, xmm1
%ifndef ARCH_X86_64
    movd    r0m,  xmm0
    fld     dword r0m
%endif
    RET

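; For reference (a C sketch, not part of the build; the requirement that len
; be a multiple of 4 and the buffers 16-byte aligned follows from the movaps
; loads and the 16-byte stride):
;
;   float scalarproduct_float(const float *v1, const float *v2, int len)
;   {
;       float p = 0.0f;
;       for (int i = 0; i < len; i++)
;           p += v1[i] * v2[i];
;       return p;
;   }
;
; The result is returned in xmm0 on x86-64 and in st0 (via the trailing fld)
; on x86-32.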

; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize,
;                              x86_reg start_y, x86_reg end_y, x86_reg block_h,
;                              x86_reg start_x, x86_reg end_x, x86_reg block_w);
;
; The actual function itself is below. It basically wraps a very simple
; w = end_x - start_x
; if (w) {
;   if (w > 22) {
;     jump to the slow loop functions
;   } else {
;     jump to the fast loop functions
;   }
; }
;
; ... and then the same for left/right extend also. See below for loop
; function implementations. Fast are fixed-width, slow is variable-width

%macro EMU_EDGE_FUNC 0
%ifdef ARCH_X86_64
%define w_reg r10
cglobal emu_edge_core, 6, 7, 1
    mov        r11, r5          ; save block_h
%else
%define w_reg r6
cglobal emu_edge_core, 2, 7, 0
    mov         r4, r4m         ; end_y
    mov         r5, r5m         ; block_h
%endif

    ; start with vertical extend (top/bottom) and body pixel copy
    mov      w_reg, r7m
    sub      w_reg, r6m         ; w = end_x - start_x
    sub         r5, r4
%ifdef ARCH_X86_64
    sub         r4, r3
%else
    sub         r4, dword r3m
%endif
    cmp      w_reg, 22
    jg .slow_v_extend_loop
%ifdef ARCH_X86_32
    mov         r2, r2m         ; linesize
%endif
    sal      w_reg, 7           ; w * 128
%ifdef PIC
    lea        rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)]
    add      w_reg, rax
%else
    lea      w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg]
%endif
    call     w_reg              ; fast top extend, body copy and bottom extend
.v_extend_end:

    ; horizontal extend (left/right)
    mov      w_reg, r6m         ; start_x
    sub         r0, w_reg
%ifdef ARCH_X86_64
    mov         r3, r0          ; backup of buf+block_h*linesize
    mov         r5, r11
%else
    mov        r0m, r0          ; backup of buf+block_h*linesize
    mov         r5, r5m
%endif
    test     w_reg, w_reg
    jz .right_extend
    cmp      w_reg, 22
    jg .slow_left_extend_loop
    mov         r1, w_reg
    dec      w_reg
    ; FIXME we can do a if size == 1 here if that makes any speed difference, test me
    sar      w_reg, 1
    sal      w_reg, 6
    ; r0=buf+block_h*linesize,r10(64)/r6(32)=start_x offset for funcs
    ; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
%ifdef PIC
    lea        rax, [.emuedge_extend_left_2]
    add      w_reg, rax
%else
    lea      w_reg, [.emuedge_extend_left_2+w_reg]
%endif
    call     w_reg

    ; now r3(64)/r0(32)=buf,r2=linesize,r11/r5=block_h,r6/r3=val, r10/r6=end_x, r1=block_w
.right_extend:
%ifdef ARCH_X86_32
    mov         r0, r0m
    mov         r5, r5m
%endif
    mov      w_reg, r7m         ; end_x
    mov         r1, r8m         ; block_w
    mov         r4, r1
    sub         r1, w_reg
    jz .h_extend_end            ; if (end_x == block_w) goto h_extend_end
    cmp         r1, 22
    jg .slow_right_extend_loop
    dec         r1
    ; FIXME we can do a if size == 1 here if that makes any speed difference, test me
    sar         r1, 1
    sal         r1, 6
%ifdef PIC
    lea        rax, [.emuedge_extend_right_2]
    add         r1, rax
%else
    lea         r1, [.emuedge_extend_right_2+r1]
%endif
    call        r1
.h_extend_end:
    RET

%ifdef ARCH_X86_64
%define vall  al
%define valh  ah
%define valw  ax
%define valw2 r10w
%define valw3 r3w
%ifdef WIN64
%define valw4 r4w
%else ; unix64
%define valw4 r3w
%endif
%define vald eax
%else
%define vall  bl
%define valh  bh
%define valw  bx
%define valw2 r6w
%define valw3 valw2
%define valw4 valw3
%define vald ebx
%define stack_offset 0x14
%endif

%endmacro

; macro to read/write a horizontal number of pixels (%2) to/from registers
; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels
;            - if (%2 & 15 == 8) fills the last 8 bytes into rax
;            - else if (%2 & 8)  fills 8 bytes into mm0
;            - if (%2 & 7 == 4)  fills the last 4 bytes into rax
;            - else if (%2 & 4)  fills 4 bytes into mm0-1
;            - if (%2 & 3 == 3)  fills 2 bytes into r10/r3, and 1 into eax
;              (note that we're using r3 for body/bottom because it's a shorter
;               opcode, and then the loop fits in 128 bytes)
;            - else              fills remaining bytes into rax
; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels
;            - if (%2 & 7 == 4)  fills 4 bytes into ebx
;            - else if (%2 & 4)  fills 4 bytes into mm0-7
;            - if (%2 & 3 == 3)  fills 2 bytes into r6, and 1 into ebx
;            - else              fills remaining bytes into ebx
; writing data out is in the same way
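; As an illustration (a hand expansion, not generated output): with the sse
; instantiation on x86-64, "READ_NUM_BYTES body, 22" expands to roughly
;
;     movups xmm0, [r1]        ; 16 bytes
;     movd   mm0,  [r1+16]     ;  4 bytes
;     mov    valw, [r1+20]     ;  2 bytes (valw = ax on x86-64)
;
; and WRITE_NUM_BYTES emits the matching stores, so each of the 22 fast-loop
; widths gets a fixed, fully unrolled load/store sequence.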
%macro READ_NUM_BYTES 2
%assign %%src_off 0 ; offset in source buffer
%assign %%smidx   0 ; mmx register idx
%assign %%sxidx   0 ; xmm register idx

%if cpuflag(sse)
%rep %2/16
    movups xmm %+ %%sxidx, [r1+%%src_off]
%assign %%src_off %%src_off+16
%assign %%sxidx %%sxidx+1
%endrep ; %2/16
%endif

%ifdef ARCH_X86_64
%if (%2-%%src_off) == 8
    mov           rax, [r1+%%src_off]
%assign %%src_off %%src_off+8
%endif ; (%2-%%src_off) == 8
%endif ; x86-64

%rep (%2-%%src_off)/8
    movq mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+8
%assign %%smidx %%smidx+1
%endrep ; (%2-%%src_off)/8

%if (%2-%%src_off) == 4
    mov          vald, [r1+%%src_off]
%elif (%2-%%src_off) & 4
    movd mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+4
%endif ; (%2-%%src_off) ==/& 4

%if (%2-%%src_off) == 1
    mov          vall, [r1+%%src_off]
%elif (%2-%%src_off) == 2
    mov          valw, [r1+%%src_off]
%elif (%2-%%src_off) == 3
%ifidn %1, top
    mov         valw2, [r1+%%src_off]
%elifidn %1, body
    mov         valw3, [r1+%%src_off]
%elifidn %1, bottom
    mov         valw4, [r1+%%src_off]
%endif ; %1 ==/!= top
    mov          vall, [r1+%%src_off+2]
%endif ; (%2-%%src_off) == 1/2/3
%endmacro ; READ_NUM_BYTES

%macro WRITE_NUM_BYTES 2
%assign %%dst_off 0 ; offset in destination buffer
%assign %%dmidx   0 ; mmx register idx
%assign %%dxidx   0 ; xmm register idx

%if cpuflag(sse)
%rep %2/16
    movups [r0+%%dst_off], xmm %+ %%dxidx
%assign %%dst_off %%dst_off+16
%assign %%dxidx %%dxidx+1
%endrep ; %2/16
%endif

%ifdef ARCH_X86_64
%if (%2-%%dst_off) == 8
    mov    [r0+%%dst_off], rax
%assign %%dst_off %%dst_off+8
%endif ; (%2-%%dst_off) == 8
%endif ; x86-64

%rep (%2-%%dst_off)/8
    movq   [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+8
%assign %%dmidx %%dmidx+1
%endrep ; (%2-%%dst_off)/8

%if (%2-%%dst_off) == 4
    mov    [r0+%%dst_off], vald
%elif (%2-%%dst_off) & 4
    movd   [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+4
%endif ; (%2-%%dst_off) ==/& 4

%if (%2-%%dst_off) == 1
    mov    [r0+%%dst_off], vall
%elif (%2-%%dst_off) == 2
    mov    [r0+%%dst_off], valw
%elif (%2-%%dst_off) == 3
%ifidn %1, top
    mov    [r0+%%dst_off], valw2
%elifidn %1, body
    mov    [r0+%%dst_off], valw3
%elifidn %1, bottom
    mov    [r0+%%dst_off], valw4
%endif ; %1 ==/!= top
    mov    [r0+%%dst_off+2], vall
%endif ; (%2-%%dst_off) == 1/2/3
%endmacro ; WRITE_NUM_BYTES

; vertical top/bottom extend and body copy fast loops
; these are function pointers to set-width line copy functions, i.e.
; they read a fixed number of pixels into set registers, and write
; those out into the destination buffer
; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r6(eax/64)/r3(ebx/32)=val_reg
%macro VERTICAL_EXTEND 0
%assign %%n 1
%rep 22
ALIGN 128
.emuedge_v_extend_ %+ %%n:
    ; extend pixels above body
%ifdef ARCH_X86_64
    test           r3 , r3                   ; if (!start_y)
    jz .emuedge_copy_body_ %+ %%n %+ _loop   ;   goto body
%else ; ARCH_X86_32
    cmp      dword r3m, 0
    je .emuedge_copy_body_ %+ %%n %+ _loop
%endif ; ARCH_X86_64/32
    READ_NUM_BYTES  top, %%n                 ; read bytes
.emuedge_extend_top_ %+ %%n %+ _loop:        ; do {
    WRITE_NUM_BYTES top, %%n                 ;   write bytes
    add            r0 , r2                   ;   dst += linesize
%ifdef ARCH_X86_64
    dec            r3d
%else ; ARCH_X86_32
    dec      dword r3m
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_top_ %+ %%n %+ _loop ; } while (--start_y)

    ; copy body pixels
.emuedge_copy_body_ %+ %%n %+ _loop:         ; do {
    READ_NUM_BYTES  body, %%n                ;   read bytes
    WRITE_NUM_BYTES body, %%n                ;   write bytes
    add            r0 , r2                   ;   dst += linesize
    add            r1 , r2                   ;   src += linesize
    dec            r4d
    jnz .emuedge_copy_body_ %+ %%n %+ _loop  ; } while (--end_y)

    ; copy bottom pixels
    test           r5 , r5                   ; if (!block_h)
    jz .emuedge_v_extend_end_ %+ %%n         ;   goto end
    sub            r1 , r2                   ; src -= linesize
    READ_NUM_BYTES  bottom, %%n              ; read bytes
.emuedge_extend_bottom_ %+ %%n %+ _loop:     ; do {
    WRITE_NUM_BYTES bottom, %%n              ;   write bytes
    add            r0 , r2                   ;   dst += linesize
    dec            r5d
    jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h)

.emuedge_v_extend_end_ %+ %%n:
%ifdef ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+1
%endrep
%endmacro ; VERTICAL_EXTEND

; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just left/right separated because number of pixels to extend is
; obviously not the same on both sides.
; for reading, pixels are placed in eax (x86-64) or ebx (x86-32) in the
; lowest two bytes of the register (so val*0x0101), and are splatted
; into each byte of mm0 as well if n_pixels >= 8

%macro READ_V_PIXEL 2
    mov        vall, %2
    mov        valh, vall
%if %1 >= 8
    movd        mm0, vald
%if cpuflag(mmx2)
    pshufw      mm0, mm0, 0
%else ; mmx
    punpcklwd   mm0, mm0
    punpckldq   mm0, mm0
%endif ; mmx2/mmx
%endif ; %1 >= 8
%endmacro

%macro WRITE_V_PIXEL 2
%assign %%dst_off 0
%rep %1/8
    movq [%2+%%dst_off], mm0
%assign %%dst_off %%dst_off+8
%endrep
%if %1 & 4
%if %1 >= 8
    movd [%2+%%dst_off], mm0
%else ; %1 < 8
    mov  [%2+%%dst_off]  , valw
    mov  [%2+%%dst_off+2], valw
%endif ; %1 >=/< 8
%assign %%dst_off %%dst_off+4
%endif ; %1 & 4
%if %1&2
    mov  [%2+%%dst_off], valw
%endif ; %1 & 2
%endmacro

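; As an illustration (a hand expansion, not generated output): for n_pixels=6,
; READ_V_PIXEL only duplicates the edge pixel into the low word of the val
; register (val*0x0101), and "WRITE_V_PIXEL 6, r0" then stores that word
; three times:
;
;     mov [r0]  , valw
;     mov [r0+2], valw
;     mov [r0+4], valw
;
; For n_pixels >= 8 the byte is additionally splatted into mm0 and written
; out with movq, 8 bytes at a time.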
; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val
%macro LEFT_EXTEND 0
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_left_ %+ %%n:          ; do {
    sub         r0, r2                 ;   dst -= linesize
    READ_V_PIXEL  %%n, [r0+r1]         ;   read pixels
    WRITE_V_PIXEL %%n, r0              ;   write pixels
    dec         r5
    jnz .emuedge_extend_left_ %+ %%n   ; } while (--block_h)
%ifdef ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep
%endmacro ; LEFT_EXTEND

; r3/r0=buf+block_h*linesize, r2=linesize, r11/r5=block_h, r0/r6=end_x, r6/r3=val
%macro RIGHT_EXTEND 0
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_right_ %+ %%n:          ; do {
%ifdef ARCH_X86_64
    sub        r3, r2                   ;   dst -= linesize
    READ_V_PIXEL  %%n, [r3+w_reg-1]     ;   read pixels
    WRITE_V_PIXEL %%n, r3+r4-%%n        ;   write pixels
    dec       r11
%else ; ARCH_X86_32
    sub        r0, r2                   ;   dst -= linesize
    READ_V_PIXEL  %%n, [r0+w_reg-1]     ;   read pixels
    WRITE_V_PIXEL %%n, r0+r4-%%n        ;   write pixels
    dec        r5
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_right_ %+ %%n   ; } while (--block_h)
%ifdef ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep

%ifdef ARCH_X86_32
%define stack_offset 0x10
%endif
%endmacro ; RIGHT_EXTEND

; below follow the "slow" copy/extend functions, these act on a non-fixed
; width specified in a register, and run a loop to copy the full amount
; of bytes. They are optimized for copying of large amounts of pixels per
; line, so they unconditionally splat data into mm registers to copy 8
; bytes per loop iteration. It could be considered to use xmm for x86-64
; also, but I haven't optimized this as much (i.e. FIXME)
%macro V_COPY_NPX 4-5
%if %0 == 4
    test     w_reg, %4
    jz .%1_skip_%4_px
%else ; %0 == 5
.%1_%4_px_loop:
%endif
    %3          %2, [r1+cnt_reg]
    %3 [r0+cnt_reg], %2
    add    cnt_reg, %4
%if %0 == 5
    sub      w_reg, %4
    test     w_reg, %5
    jnz .%1_%4_px_loop
%endif
.%1_skip_%4_px:
%endmacro

%macro V_COPY_ROW 2
%ifidn %1, bottom
    sub         r1, linesize
%endif
.%1_copy_loop:
    xor    cnt_reg, cnt_reg
%if notcpuflag(sse)
%define linesize r2m
    V_COPY_NPX %1,  mm0, movq,    8, 0xFFFFFFF8
%else ; sse
    V_COPY_NPX %1, xmm0, movups, 16, 0xFFFFFFF0
%ifdef ARCH_X86_64
%define linesize r2
    V_COPY_NPX %1, rax , mov,     8
%else ; ARCH_X86_32
%define linesize r2m
    V_COPY_NPX %1,  mm0, movq,    8
%endif ; ARCH_X86_64/32
%endif ; sse
    V_COPY_NPX %1, vald, mov,     4
    V_COPY_NPX %1, valw, mov,     2
    V_COPY_NPX %1, vall, mov,     1
    mov      w_reg, cnt_reg
%ifidn %1, body
    add         r1, linesize
%endif
    add         r0, linesize
    dec         %2
    jnz .%1_copy_loop
%endmacro

%macro SLOW_V_EXTEND 0
.slow_v_extend_loop:
; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r11(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r10(64)/r6(32)=w=end_x-start_x
%ifdef ARCH_X86_64
    push       r11              ; save old value of block_h
    test        r3, r3
%define cnt_reg r11
    jz .do_body_copy            ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, r3
%else
    cmp  dword r3m, 0
%define cnt_reg r2
    je .do_body_copy            ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, dword r3m
%endif

.do_body_copy:
    V_COPY_ROW body, r4

%ifdef ARCH_X86_64
    pop        r11              ; restore old value of block_h
%define cnt_reg r3
%endif
    test        r5, r5
%ifdef ARCH_X86_64
    jz .v_extend_end
%else
    jz .skip_bottom_extend
%endif
    V_COPY_ROW bottom, r5
%ifdef ARCH_X86_32
.skip_bottom_extend:
    mov         r2, r2m
%endif
    jmp .v_extend_end
%endmacro

%macro SLOW_LEFT_EXTEND 0
.slow_left_extend_loop:
; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r10/r6=start_x
    mov         r4, 8
    sub         r0, linesize
    READ_V_PIXEL 8, [r0+w_reg]
.left_extend_8px_loop:
    movq [r0+r4-8], mm0
    add         r4, 8
    cmp         r4, w_reg
    jle .left_extend_8px_loop
    sub         r4, 8
    cmp         r4, w_reg
    jge .left_extend_loop_end
.left_extend_2px_loop:
    mov    [r0+r4], valw
    add         r4, 2
    cmp         r4, w_reg
    jl .left_extend_2px_loop
.left_extend_loop_end:
    dec         r5
    jnz .slow_left_extend_loop
%ifdef ARCH_X86_32
    mov         r2, r2m
%endif
    jmp .right_extend
%endmacro

%macro SLOW_RIGHT_EXTEND 0
.slow_right_extend_loop:
; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r11(64)/r5(32)=block_h,
; r10(64)/r6(32)=end_x,r6/r3=val,r1=cntr
%ifdef ARCH_X86_64
%define buf_reg r3
%define bh_reg r11
%else
%define buf_reg r0
%define bh_reg r5
%endif
    lea         r1, [r4-8]
    sub    buf_reg, linesize
    READ_V_PIXEL  8, [buf_reg+w_reg-1]
.right_extend_8px_loop:
    movq [buf_reg+r1], mm0
    sub         r1, 8
    cmp         r1, w_reg
    jge .right_extend_8px_loop
    add         r1, 8
    cmp         r1, w_reg
    je .right_extend_loop_end
.right_extend_2px_loop:
    sub         r1, 2
    mov [buf_reg+r1], valw
    cmp         r1, w_reg
    jg .right_extend_2px_loop
.right_extend_loop_end:
    dec     bh_reg
    jnz .slow_right_extend_loop
    jmp .h_extend_end
%endmacro

%macro emu_edge 1
INIT_XMM %1
EMU_EDGE_FUNC
VERTICAL_EXTEND
LEFT_EXTEND
RIGHT_EXTEND
SLOW_V_EXTEND
SLOW_LEFT_EXTEND
SLOW_RIGHT_EXTEND
%endmacro

emu_edge sse
%ifdef ARCH_X86_32
emu_edge mmx
%endif

;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                           int32_t max, unsigned int len)
;-----------------------------------------------------------------------------

; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
; %5 = suffix
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
%if %4
    cvtsi2ss  m4, minm
    cvtsi2ss  m5, maxm
%else
    movd      m4, minm
    movd      m5, maxm
%endif
    SPLATD    m4
    SPLATD    m5
.loop:
%assign %%i 1
%rep %2
    mova      m0,  [srcq+mmsize*0*%%i]
    mova      m1,  [srcq+mmsize*1*%%i]
    mova      m2,  [srcq+mmsize*2*%%i]
    mova      m3,  [srcq+mmsize*3*%%i]
%if %3
    mova      m7,  [srcq+mmsize*4*%%i]
    mova      m8,  [srcq+mmsize*5*%%i]
    mova      m9,  [srcq+mmsize*6*%%i]
    mova      m10, [srcq+mmsize*7*%%i]
%endif
    CLIPD     m0,  m4, m5, m6
    CLIPD     m1,  m4, m5, m6
    CLIPD     m2,  m4, m5, m6
    CLIPD     m3,  m4, m5, m6
%if %3
    CLIPD     m7,  m4, m5, m6
    CLIPD     m8,  m4, m5, m6
    CLIPD     m9,  m4, m5, m6
    CLIPD     m10, m4, m5, m6
%endif
    mova  [dstq+mmsize*0*%%i], m0
    mova  [dstq+mmsize*1*%%i], m1
    mova  [dstq+mmsize*2*%%i], m2
    mova  [dstq+mmsize*3*%%i], m3
%if %3
    mova  [dstq+mmsize*4*%%i], m7
    mova  [dstq+mmsize*5*%%i], m8
    mova  [dstq+mmsize*6*%%i], m9
    mova  [dstq+mmsize*7*%%i], m10
%endif
%assign %%i %%i+1
%endrep
    add     srcq, mmsize*4*(%2+%3)
    add     dstq, mmsize*4*(%2+%3)
    sub     lend, mmsize*(%2+%3)
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
%define SPLATD SPLATD_MMX
%define CLIPD CLIPD_MMX
VECTOR_CLIP_INT32 0, 1, 0, 0
INIT_XMM sse2
%define SPLATD SPLATD_SSE2
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
%define CLIPD CLIPD_SSE2
VECTOR_CLIP_INT32 6, 2, 0, 1
INIT_XMM sse4
%define CLIPD CLIPD_SSE41
%ifdef m8
VECTOR_CLIP_INT32 11, 1, 1, 0
%else
VECTOR_CLIP_INT32 6, 1, 0, 0
%endif

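; For reference (a C sketch, not part of the build):
;
;   void vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                          int32_t max, unsigned int len)
;   {
;       unsigned int i;
;       for (i = 0; i < len; i++)
;           dst[i] = src[i] < min ? min : src[i] > max ? max : src[i];
;   }
;
; The asm variants process several vectors per iteration, so len is assumed
; to be a suitable multiple of the unroll width.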
;-----------------------------------------------------------------------------
; void ff_butterflies_float_interleave(float *dst, const float *src0,
;                                      const float *src1, int len);
;-----------------------------------------------------------------------------

%macro BUTTERFLIES_FLOAT_INTERLEAVE 0
cglobal butterflies_float_interleave, 4,4,3, dst, src0, src1, len
%ifdef ARCH_X86_64
    movsxd    lenq, lend
%endif
    test      lenq, lenq
    jz .end
    shl       lenq, 2
    lea      src0q, [src0q +   lenq]
    lea      src1q, [src1q +   lenq]
    lea       dstq, [ dstq + 2*lenq]
    neg       lenq
.loop:
    mova        m0, [src0q + lenq]
    mova        m1, [src1q + lenq]
    subps       m2, m0, m1
    addps       m0, m0, m1
    unpcklps    m1, m0, m2
    unpckhps    m0, m0, m2
%if cpuflag(avx)
    vextractf128 [dstq + 2*lenq     ], m1, 0
    vextractf128 [dstq + 2*lenq + 16], m0, 0
    vextractf128 [dstq + 2*lenq + 32], m1, 1
    vextractf128 [dstq + 2*lenq + 48], m0, 1
%else
    mova  [dstq + 2*lenq         ], m1
    mova  [dstq + 2*lenq + mmsize], m0
%endif
    add       lenq, mmsize
    jl .loop
%if mmsize == 32
    vzeroupper
    RET
%endif
.end:
    REP_RET
%endmacro

INIT_XMM sse
BUTTERFLIES_FLOAT_INTERLEAVE
INIT_YMM avx
BUTTERFLIES_FLOAT_INTERLEAVE
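; For reference, butterflies_float_interleave computes a sum/difference
; butterfly with the results interleaved into dst (a C sketch, not part of
; the build):
;
;   void butterflies_float_interleave(float *dst, const float *src0,
;                                     const float *src1, int len)
;   {
;       for (int i = 0; i < len; i++) {
;           dst[2*i    ] = src0[i] + src1[i];
;           dst[2*i + 1] = src0[i] - src1[i];
;       }
;   }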