1;****************************************************************************** 2;* Core video DSP functions 3;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com> 4;* 5;* This file is part of FFmpeg. 6;* 7;* FFmpeg is free software; you can redistribute it and/or 8;* modify it under the terms of the GNU Lesser General Public 9;* License as published by the Free Software Foundation; either 10;* version 2.1 of the License, or (at your option) any later version. 11;* 12;* FFmpeg is distributed in the hope that it will be useful, 13;* but WITHOUT ANY WARRANTY; without even the implied warranty of 14;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15;* Lesser General Public License for more details. 16;* 17;* You should have received a copy of the GNU Lesser General Public 18;* License along with FFmpeg; if not, write to the Free Software 19;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20;****************************************************************************** 21 22%include "libavutil/x86/x86util.asm" 23 24SECTION .text 25 26; slow vertical extension loop function. Works with variable-width, and 27; does per-line reading/writing of source data 28 29%macro V_COPY_ROW 2 ; type (top/body/bottom), h 30.%1_y_loop: ; do { 31 mov wq, r7mp ; initialize w (r7mp = wmp) 32.%1_x_loop: ; do { 33 movu m0, [srcq+wq] ; m0 = read($mmsize) 34 movu [dstq+wq], m0 ; write(m0, $mmsize) 35 add wq, mmsize ; w -= $mmsize 36 cmp wq, -mmsize ; } while (w > $mmsize); 37 jl .%1_x_loop 38 movu m0, [srcq-mmsize] ; m0 = read($mmsize) 39 movu [dstq-mmsize], m0 ; write(m0, $mmsize) 40%ifidn %1, body ; if ($type == body) { 41 add srcq, src_strideq ; src += src_stride 42%endif ; } 43 add dstq, dst_strideq ; dst += dst_stride 44 dec %2 ; } while (--$h); 45 jnz .%1_y_loop 46%endmacro 47 48%macro vvar_fn 0 49; .----. 
<- zero 50; | | <- top is copied from first line in body of source 51; |----| <- start_y 52; | | <- body is copied verbatim (line-by-line) from source 53; |----| <- end_y 54; | | <- bottom is copied from last line in body of source 55; '----' <- bh 56%if ARCH_X86_64 57cglobal emu_edge_vvar, 7, 8, 1, dst, dst_stride, src, src_stride, \ 58 start_y, end_y, bh, w 59%else ; x86-32 60cglobal emu_edge_vvar, 1, 6, 1, dst, src, start_y, end_y, bh, w 61%define src_strideq r3mp 62%define dst_strideq r1mp 63 mov srcq, r2mp 64 mov start_yq, r4mp 65 mov end_yq, r5mp 66 mov bhq, r6mp 67%endif 68 sub bhq, end_yq ; bh -= end_q 69 sub end_yq, start_yq ; end_q -= start_q 70 add srcq, r7mp ; (r7mp = wmp) 71 add dstq, r7mp ; (r7mp = wmp) 72 neg r7mp ; (r7mp = wmp) 73 test start_yq, start_yq ; if (start_q) { 74 jz .body 75 V_COPY_ROW top, start_yq ; v_copy_row(top, start_yq) 76.body: ; } 77 V_COPY_ROW body, end_yq ; v_copy_row(body, end_yq) 78 test bhq, bhq ; if (bh) { 79 jz .end 80 sub srcq, src_strideq ; src -= src_stride 81 V_COPY_ROW bottom, bhq ; v_copy_row(bottom, bh) 82.end: ; } 83 RET 84%endmacro 85 86%if ARCH_X86_32 87INIT_MMX mmx 88vvar_fn 89%endif 90 91INIT_XMM sse 92vvar_fn 93 94%macro hvar_fn 0 95cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w 96 lea dstq, [dstq+n_wordsq*2] 97 neg n_wordsq 98 lea start_xq, [start_xq+n_wordsq*2] 99.y_loop: ; do { 100 ; FIXME also write a ssse3 version using pshufb 101 movzx wd, byte [dstq+start_xq] ; w = read(1) 102 imul wd, 0x01010101 ; w *= 0x01010101 103 movd m0, wd 104 mov wq, n_wordsq ; initialize w 105%if cpuflag(sse2) 106 pshufd m0, m0, q0000 ; splat 107%else ; mmx 108 punpckldq m0, m0 ; splat 109%endif ; mmx/sse 110.x_loop: ; do { 111 movu [dstq+wq*2], m0 ; write($reg, $mmsize) 112 add wq, mmsize/2 ; w -= $mmsize/2 113 cmp wq, -mmsize/2 ; } while (w > $mmsize/2) 114 jl .x_loop 115 movu [dstq-mmsize], m0 ; write($reg, $mmsize) 116 add dstq, dst_strideq ; dst += dst_stride 117 dec hq ; } while (h--) 118 jnz 
.y_loop 119 RET 120%endmacro 121 122%if ARCH_X86_32 123INIT_MMX mmx 124hvar_fn 125%endif 126 127INIT_XMM sse2 128hvar_fn 129 130; macro to read/write a horizontal number of pixels (%2) to/from registers 131; on sse, - fills xmm0-15 for consecutive sets of 16 pixels 132; - if (%2 & 8) fills 8 bytes into xmm$next 133; - if (%2 & 4) fills 4 bytes into xmm$next 134; - if (%2 & 3) fills 1, 2 or 4 bytes in eax 135; on mmx, - fills mm0-7 for consecutive sets of 8 pixels 136; - if (%2 & 4) fills 4 bytes into mm$next 137; - if (%2 & 3) fills 1, 2 or 4 bytes in eax 138; writing data out is in the same way 139%macro READ_NUM_BYTES 2 140%assign %%off 0 ; offset in source buffer 141%assign %%mmx_idx 0 ; mmx register index 142%assign %%xmm_idx 0 ; xmm register index 143 144%rep %2/mmsize 145%if mmsize == 16 146 movu xmm %+ %%xmm_idx, [srcq+%%off] 147%assign %%xmm_idx %%xmm_idx+1 148%else ; mmx 149 movu mm %+ %%mmx_idx, [srcq+%%off] 150%assign %%mmx_idx %%mmx_idx+1 151%endif 152%assign %%off %%off+mmsize 153%endrep ; %2/mmsize 154 155%if mmsize == 16 156%if (%2-%%off) >= 8 157%if %2 > 16 && (%2-%%off) > 8 158 movu xmm %+ %%xmm_idx, [srcq+%2-16] 159%assign %%xmm_idx %%xmm_idx+1 160%assign %%off %2 161%else 162 movq mm %+ %%mmx_idx, [srcq+%%off] 163%assign %%mmx_idx %%mmx_idx+1 164%assign %%off %%off+8 165%endif 166%endif ; (%2-%%off) >= 8 167%endif 168 169%if (%2-%%off) >= 4 170%if %2 > 8 && (%2-%%off) > 4 171 movq mm %+ %%mmx_idx, [srcq+%2-8] 172%assign %%off %2 173%else 174 movd mm %+ %%mmx_idx, [srcq+%%off] 175%assign %%off %%off+4 176%endif 177%assign %%mmx_idx %%mmx_idx+1 178%endif ; (%2-%%off) >= 4 179 180%if (%2-%%off) >= 1 181%if %2 >= 4 182 movd mm %+ %%mmx_idx, [srcq+%2-4] 183%elif (%2-%%off) == 1 184 mov valb, [srcq+%2-1] 185%elif (%2-%%off) == 2 186 mov valw, [srcq+%2-2] 187%elifidn %1, body 188 mov vald, [srcq+%2-3] 189%else 190 movd mm %+ %%mmx_idx, [srcq+%2-3] 191%endif 192%endif ; (%2-%%off) >= 1 193%endmacro ; READ_NUM_BYTES 194 195%macro WRITE_NUM_BYTES 2 
196%assign %%off 0 ; offset in destination buffer 197%assign %%mmx_idx 0 ; mmx register index 198%assign %%xmm_idx 0 ; xmm register index 199 200%rep %2/mmsize 201%if mmsize == 16 202 movu [dstq+%%off], xmm %+ %%xmm_idx 203%assign %%xmm_idx %%xmm_idx+1 204%else ; mmx 205 movu [dstq+%%off], mm %+ %%mmx_idx 206%assign %%mmx_idx %%mmx_idx+1 207%endif 208%assign %%off %%off+mmsize 209%endrep ; %2/mmsize 210 211%if mmsize == 16 212%if (%2-%%off) >= 8 213%if %2 > 16 && (%2-%%off) > 8 214 movu [dstq+%2-16], xmm %+ %%xmm_idx 215%assign %%xmm_idx %%xmm_idx+1 216%assign %%off %2 217%else 218 movq [dstq+%%off], mm %+ %%mmx_idx 219%assign %%mmx_idx %%mmx_idx+1 220%assign %%off %%off+8 221%endif 222%endif ; (%2-%%off) >= 8 223%endif 224 225%if (%2-%%off) >= 4 226%if %2 > 8 && (%2-%%off) > 4 227 movq [dstq+%2-8], mm %+ %%mmx_idx 228%assign %%off %2 229%else 230 movd [dstq+%%off], mm %+ %%mmx_idx 231%assign %%off %%off+4 232%endif 233%assign %%mmx_idx %%mmx_idx+1 234%endif ; (%2-%%off) >= 4 235 236%if (%2-%%off) >= 1 237%if %2 >= 4 238 movd [dstq+%2-4], mm %+ %%mmx_idx 239%elif (%2-%%off) == 1 240 mov [dstq+%2-1], valb 241%elif (%2-%%off) == 2 242 mov [dstq+%2-2], valw 243%elifidn %1, body 244 mov [dstq+%2-3], valw 245 shr vald, 16 246 mov [dstq+%2-1], valb 247%else 248 movd vald, mm %+ %%mmx_idx 249 mov [dstq+%2-3], valw 250 shr vald, 16 251 mov [dstq+%2-1], valb 252%endif 253%endif ; (%2-%%off) >= 1 254%endmacro ; WRITE_NUM_BYTES 255 256; vertical top/bottom extend and body copy fast loops 257; these are function pointers to set-width line copy functions, i.e. 
; they read a fixed number of pixels into set registers, and write
; those out into the destination buffer

; VERTICAL_EXTEND lo, hi -- instantiate emu_edge_vfix<n> for n = lo..hi.
; Each generated function performs vertical edge extension for a fixed
; line width of n bytes, using READ_NUM_BYTES/WRITE_NUM_BYTES to keep a
; whole line resident in registers.
%macro VERTICAL_EXTEND 2
%assign %%n %1
%rep 1+%2-%1
%if %%n <= 3
; n <= 3: the line fits in eax (val), so no vector register is needed, but
; the val register overlaps an argument register and must be kept free
%if ARCH_X86_64
cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, dst_stride, src, src_stride, \
                                       start_y, end_y, val, bh
    mov             bhq, r6mp                   ; r6mp = bhmp
%else ; x86-32
cglobal emu_edge_vfix %+ %%n, 0, 6, 0, val, dst, src, start_y, end_y, bh
    mov            dstq, r0mp
    mov            srcq, r2mp
    mov        start_yq, r4mp
    mov          end_yq, r5mp
    mov             bhq, r6mp
%define dst_strideq r1mp
%define src_strideq r3mp
%endif ; x86-64/32
%else
; n >= 4: one vector register is used, val is not needed
%if ARCH_X86_64
cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, dst_stride, src, src_stride, \
                                       start_y, end_y, bh
%else ; x86-32
cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh
    mov            srcq, r2mp
    mov        start_yq, r4mp
    mov          end_yq, r5mp
    mov             bhq, r6mp
%define dst_strideq r1mp
%define src_strideq r3mp
%endif ; x86-64/32
%endif
    ; FIXME move this to c wrapper?
    sub             bhq, end_yq                 ; bh -= end_y
    sub          end_yq, start_yq               ; end_y -= start_y

    ; extend pixels above body: read the first source line once, then
    ; replicate it start_y times
    test       start_yq, start_yq               ; if (start_y) {
    jz .body_loop
    READ_NUM_BYTES  top, %%n                    ;   $variable_regs = read($n)
.top_loop:                                      ;   do {
    WRITE_NUM_BYTES top, %%n                    ;     write($variable_regs, $n)
    add            dstq, dst_strideq            ;     dst += linesize
    dec        start_yq                         ;   } while (--start_y)
    jnz .top_loop                               ; }

    ; copy body pixels line by line
.body_loop:                                     ; do {
    READ_NUM_BYTES  body, %%n                   ;   $variable_regs = read($n)
    WRITE_NUM_BYTES body, %%n                   ;   write($variable_regs, $n)
    add            dstq, dst_strideq            ;   dst += dst_stride
    add            srcq, src_strideq            ;   src += src_stride
    dec          end_yq                         ; } while (--end_y)
    jnz .body_loop

    ; extend pixels below body: re-read the last body line (src was already
    ; advanced past it, hence the stride rewind), replicate it bh times
    test            bhq, bhq                    ; if (block_h) {
    jz .end
    sub            srcq, src_strideq            ;   src -= linesize
    READ_NUM_BYTES  bottom, %%n                 ;   $variable_regs = read($n)
.bottom_loop:                                   ;   do {
    WRITE_NUM_BYTES bottom, %%n                 ;     write($variable_regs, $n)
    add            dstq, dst_strideq            ;     dst += linesize
    dec             bhq                         ;   } while (--bh)
    jnz .bottom_loop                            ; }

.end:
    RET
%assign %%n %%n+1
%endrep ; 1+%2-%1
%endmacro ; VERTICAL_EXTEND

; widths 1..15 use mmx; 16..22 use sse (mmx fallback only on x86-32)
INIT_MMX mmx
VERTICAL_EXTEND 1, 15
%if ARCH_X86_32
VERTICAL_EXTEND 16, 22
%endif

INIT_XMM sse
VERTICAL_EXTEND 16, 22

; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just left/right separated because number of pixels to extend is
; obviously not the same on both sides.
345 346%macro READ_V_PIXEL 2 347 movzx vald, byte %2 348 imul vald, 0x01010101 349%if %1 >= 8 350 movd m0, vald 351%if mmsize == 16 352 pshufd m0, m0, q0000 353%else 354 punpckldq m0, m0 355%endif ; mmsize == 16 356%endif ; %1 > 16 357%endmacro ; READ_V_PIXEL 358 359%macro WRITE_V_PIXEL 2 360%assign %%off 0 361 362%if %1 >= 8 363 364%rep %1/mmsize 365 movu [%2+%%off], m0 366%assign %%off %%off+mmsize 367%endrep ; %1/mmsize 368 369%if mmsize == 16 370%if %1-%%off >= 8 371%if %1 > 16 && %1-%%off > 8 372 movu [%2+%1-16], m0 373%assign %%off %1 374%else 375 movq [%2+%%off], m0 376%assign %%off %%off+8 377%endif 378%endif ; %1-%%off >= 8 379%endif ; mmsize == 16 380 381%if %1-%%off >= 4 382%if %1 > 8 && %1-%%off > 4 383 movq [%2+%1-8], m0 384%assign %%off %1 385%else 386 movd [%2+%%off], m0 387%assign %%off %%off+4 388%endif 389%endif ; %1-%%off >= 4 390 391%else ; %1 < 8 392 393%rep %1/4 394 mov [%2+%%off], vald 395%assign %%off %%off+4 396%endrep ; %1/4 397 398%endif ; %1 >=/< 8 399 400%if %1-%%off == 2 401 mov [%2+%%off], valw 402%endif ; (%1-%%off)/2 403%endmacro ; WRITE_V_PIXEL 404 405%macro H_EXTEND 2 406%assign %%n %1 407%rep 1+(%2-%1)/2 408cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val 409.loop_y: ; do { 410 READ_V_PIXEL %%n, [dstq+start_xq] ; $variable_regs = read($n) 411 WRITE_V_PIXEL %%n, dstq ; write($variable_regs, $n) 412 add dstq, dst_strideq ; dst += dst_stride 413 dec bhq ; } while (--bh) 414 jnz .loop_y 415 RET 416%assign %%n %%n+2 417%endrep ; 1+(%2-%1)/2 418%endmacro ; H_EXTEND 419 420INIT_MMX mmx 421H_EXTEND 2, 14 422%if ARCH_X86_32 423H_EXTEND 16, 22 424%endif 425 426INIT_XMM sse2 427H_EXTEND 16, 22 428 429%macro PREFETCH_FN 1 430cglobal prefetch, 3, 3, 0, buf, stride, h 431.loop: 432 %1 [bufq] 433 add bufq, strideq 434 dec hd 435 jg .loop 436 REP_RET 437%endmacro 438 439INIT_MMX mmxext 440PREFETCH_FN prefetcht0 441%if ARCH_X86_32 442INIT_MMX 3dnow 443PREFETCH_FN prefetch 444%endif 445