;******************************************************************************
;* x86-optimized vertical line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*                    Kieran Kunhya <kieran@kunhya.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Fixed-point constants. The *_start values are the rounding/bias terms added
; into the 32-bit accumulators before the multiply-accumulate filter loop; the
; *_upper values clip the 9/10-bit outputs to their legal maximum.
minshort:      times 8 dw 0x8000         ; INT16_MIN bias, used to convert
                                         ; signed->unsigned 16-bit output
yuv2yuvX_16_start:  times 4 dd 0x4000 - 0x40000000
yuv2yuvX_10_start:  times 4 dd 0x10000
yuv2yuvX_9_start:   times 4 dd 0x20000
yuv2yuvX_10_upper:  times 8 dw 0x3ff     ; max 10-bit sample value
yuv2yuvX_9_upper:   times 8 dw 0x1ff     ; max 9-bit sample value
pd_4:          times 4 dd 4
pd_4min0x40000:times 4 dd 4 - (0x40000)
pw_16:         times 8 dw 16
pw_32:         times 8 dw 32
pw_512:        times 8 dw 512
pw_1024:       times 8 dw 1024

SECTION .text

;-----------------------------------------------------------------------------
; vertical line scaling
;
; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW,
;                                     const uint8_t *dither, int offset)
; and
; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize,
;                                     const int16_t **src, uint8_t *dst, int dstW,
;                                     const uint8_t *dither, int offset)
;
; Scale one or $filterSize lines of source data to generate one line of output
; data. The input is 15-bit in int16_t if $output_size is [8,10] and 19-bit in
; int32_t if $output_size is 16. $filter is 12-bits. $filterSize is a multiple
; of 2. $offset is either 0 or 3. $dither holds 8 values.
;-----------------------------------------------------------------------------

; yuv2planeX_fn: emit one yuv2planeX_<bits>_<opt> function.
; %1 = output bit depth (8, 9, 10 or 16)
; %2 = number of xmm registers used (0 for mmx builds)
; %3 = number of named arguments loaded into registers by cglobal
%macro yuv2planeX_fn 3

%if ARCH_X86_32
%define cntr_reg fltsizeq
%define movsx mov
%else
%define cntr_reg r7
%define movsx movsxd
%endif

cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
%if %1 == 8 || %1 == 9 || %1 == 10
    pxor            m6,  m6            ; zero register, for unpack / clipping
%endif ; %1 == 8/9/10

%if %1 == 8
%if ARCH_X86_32
    ; Reserve aligned stack space for four 8-byte dither vectors; the pad
    ; computation keeps the area 16-byte aligned relative to stack_offset.
%assign pad 0x2c - (stack_offset & 15)
    SUB             rsp, pad
%define m_dith m7
%else ; x86-64
%define m_dith m9
%endif ; x86-32

    ; create registers holding dither
    movq        m_dith, [ditherq]      ; dither
    test       offsetd, offsetd
    jz              .no_rot
%if mmsize == 16
    punpcklqdq  m_dith,  m_dith
%endif ; mmsize == 16
    ; offset != 0 (i.e. 3): rotate the 8 dither bytes by 3 positions
    PALIGNR     m_dith,  m_dith,  3,  m0
.no_rot:
%if mmsize == 16
    punpcklbw   m_dith,  m6            ; dither bytes -> words
%if ARCH_X86_64
    punpcklwd       m8,  m_dith, m6    ; low 4 dither words -> dwords
    pslld           m8,  12            ; scale dither into accumulator domain
%else ; x86-32
    punpcklwd       m5,  m_dith, m6
    pslld           m5,  12
%endif ; x86-32/64
    punpckhwd   m_dith,  m6            ; high 4 dither words -> dwords
    pslld       m_dith,  12
%if ARCH_X86_32
    mova      [rsp+ 0],  m5            ; x86-32 has too few regs; spill dither
    mova      [rsp+16],  m_dith
%endif
%else ; mmsize == 8
    ; mmx: 8 dither values need four 2-dword registers; build and spill all
    punpcklbw       m5,  m_dith, m6
    punpckhbw   m_dith,  m6
    punpcklwd       m4,  m5, m6
    punpckhwd       m5,  m6
    punpcklwd       m3,  m_dith, m6
    punpckhwd   m_dith,  m6
    pslld           m4,  12
    pslld           m5,  12
    pslld           m3,  12
    pslld       m_dith,  12
    mova      [rsp+ 0],  m4
    mova      [rsp+ 8],  m5
    mova      [rsp+16],  m3
    mova      [rsp+24],  m_dith
%endif ; mmsize == 8/16
%endif ; %1 == 8

    xor             r5,  r5            ; r5 = output pixel index

.pixelloop:
%assign %%i 0
    ; the rep here is for the 8bit output mmx case, where dither covers
    ; 8 pixels but we can only handle 2 pixels per register, and thus 4
    ; pixels per iteration. In order to not have to keep track of where
    ; we are w.r.t. dithering, we unroll the mmx/8bit loop x2.
%if %1 == 8
%assign %%repcnt 16/mmsize
%else
%assign %%repcnt 1
%endif

%rep %%repcnt

%if %1 == 8
%if ARCH_X86_32
    ; seed accumulators with the (possibly rotated) dither values
    mova            m2, [rsp+mmsize*(0+%%i)]
    mova            m1, [rsp+mmsize*(1+%%i)]
%else ; x86-64
    mova            m2,  m8
    mova            m1,  m_dith
%endif ; x86-32/64
%else ; %1 == 9/10/16
    ; seed accumulators with the per-depth rounding constant
    mova            m1, [yuv2yuvX_%1_start]
    mova            m2,  m1
%endif ; %1 == 8/9/10/16
    movsx     cntr_reg,  fltsizem      ; cntr_reg = filterSize (counts down)
.filterloop_ %+ %%i:
    ; input pixels: process two filter taps per iteration (filterSize is a
    ; multiple of 2), loading the two corresponding source lines
    mov             r6, [srcq+gprsize*cntr_reg-2*gprsize]
%if %1 == 16
    mova            m3, [r6+r5*4]      ; 16-bit output: 32-bit input samples
    mova            m5, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
    mova            m3, [r6+r5*2]      ; 8/9/10-bit output: 16-bit input
%endif ; %1 == 8/9/10/16
    mov             r6, [srcq+gprsize*cntr_reg-gprsize]
%if %1 == 16
    mova            m4, [r6+r5*4]
    mova            m6, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
    mova            m4, [r6+r5*2]
%endif ; %1 == 8/9/10/16

    ; coefficients
    movd            m0, [filterq+2*cntr_reg-4] ; coeff[0], coeff[1]
%if %1 == 16
    pshuflw         m7,  m0,  0        ; coeff[0]
    pshuflw         m0,  m0,  0x55     ; coeff[1]
    pmovsxwd        m7,  m7            ; word -> dword
    pmovsxwd        m0,  m0            ; word -> dword

    ; 32x32->32 multiply-accumulate (needs sse4's pmulld)
    pmulld          m3,  m7
    pmulld          m5,  m7
    pmulld          m4,  m0
    pmulld          m6,  m0

    paddd           m2,  m3
    paddd           m1,  m5
    paddd           m2,  m4
    paddd           m1,  m6
%else ; %1 == 10/9/8
    ; interleave the two source lines so pmaddwd computes
    ; src0*coeff0 + src1*coeff1 per output pixel in one op
    punpcklwd       m5,  m3,  m4
    punpckhwd       m3,  m4
    SPLATD          m0                 ; broadcast coeff pair to all lanes

    pmaddwd         m5,  m0
    pmaddwd         m3,  m0

    paddd           m2,  m5
    paddd           m1,  m3
%endif ; %1 == 8/9/10/16

    sub       cntr_reg,  2
    jg .filterloop_ %+ %%i

    ; shift accumulators down to the output bit depth
    ; (19-bit input * 12-bit coeff vs. 15-bit input * 12-bit coeff)
%if %1 == 16
    psrad           m2,  31 - %1
    psrad           m1,  31 - %1
%else ; %1 == 10/9/8
    psrad           m2,  27 - %1
    psrad           m1,  27 - %1
%endif ; %1 == 8/9/10/16

%if %1 == 8
    ; dword -> word -> unsigned byte with saturation
    packssdw        m2,  m1
    packuswb        m2,  m2
    movh    [dstq+r5*1],  m2
%else ; %1 == 9/10/16
%if %1 == 16
    packssdw        m2,  m1
    paddw           m2, [minshort]     ; signed -> unsigned 16-bit range
%else ; %1 == 9/10
%if cpuflag(sse4)
    packusdw        m2,  m1            ; unsigned saturating pack (sse4+)
%else ; mmxext/sse2
    packssdw        m2,  m1
    pmaxsw          m2,  m6            ; clamp negatives to 0 (m6 == 0)
%endif ; mmxext/sse2/sse4/avx
    pminsw          m2, [yuv2yuvX_%1_upper] ; clip to max 9/10-bit value
%endif ; %1 == 9/10/16
    mova    [dstq+r5*2],  m2
%endif ; %1 == 8/9/10/16

    add             r5,  mmsize/2
    sub             wd,  mmsize/2

%assign %%i %%i+2
%endrep
    jg .pixelloop                      ; flags from the last "sub wd" above

%if %1 == 8
%if ARCH_X86_32
    ADD             rsp, pad
    RET
%else ; x86-64
    REP_RET
%endif ; x86-32/64
%else ; %1 == 9/10/16
    REP_RET
%endif ; %1 == 8/9/10/16
%endmacro

%if ARCH_X86_32
INIT_MMX mmxext
yuv2planeX_fn  8,  0, 7
yuv2planeX_fn  9,  0, 5
yuv2planeX_fn 10,  0, 5
%endif

INIT_XMM sse2
yuv2planeX_fn  8, 10, 7
yuv2planeX_fn  9,  7, 5
yuv2planeX_fn 10,  7, 5

; 16-bit output requires pmulld/pmovsxwd/packusdw, hence sse4 only
INIT_XMM sse4
yuv2planeX_fn  8, 10, 7
yuv2planeX_fn  9,  7, 5
yuv2planeX_fn 10,  7, 5
yuv2planeX_fn 16,  8, 5

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2planeX_fn  8, 10, 7
yuv2planeX_fn  9,  7, 5
yuv2planeX_fn 10,  7, 5
%endif

; Inner loop shared by all yuv2plane1 variants.
; %1 = output bit depth, %2 = alignment suffix (u/a) for the stores.
; On entry wq is a negative byte/sample index counting up toward 0;
; src/dst already point past the end of their buffers (see yuv2plane1_fn).
%macro yuv2plane1_mainloop 2
.loop_%2:
%if %1 == 8
    ; add dither (words in m2/m3), shift 15-bit input down to 8 bits
    paddsw          m0,  m2, [srcq+wq*2+mmsize*0]
    paddsw          m1,  m3, [srcq+wq*2+mmsize*1]
    psraw           m0,  7
    psraw           m1,  7
    packuswb        m0,  m1
    mov%2    [dstq+wq],  m0
%elif %1 == 16
    ; add rounding constant (m4), shift 19-bit input down to 16 bits
    paddd           m0,  m4, [srcq+wq*4+mmsize*0]
    paddd           m1,  m4, [srcq+wq*4+mmsize*1]
    paddd           m2,  m4, [srcq+wq*4+mmsize*2]
    paddd           m3,  m4, [srcq+wq*4+mmsize*3]
    psrad           m0,  3
    psrad           m1,  3
    psrad           m2,  3
    psrad           m3,  3
%if cpuflag(sse4) ; avx/sse4
    packusdw        m0,  m1
    packusdw        m2,  m3
%else ; mmx/sse2
    ; no packusdw pre-sse4: pack signed, then bias by INT16_MIN (m5);
    ; the rounding constant was pre-biased (pd_4min0x40000) to compensate
    packssdw        m0,  m1
    packssdw        m2,  m3
    paddw           m0,  m5
    paddw           m2,  m5
%endif ; mmx/sse2/sse4/avx
    mov%2 [dstq+wq*2+mmsize*0], m0
    mov%2 [dstq+wq*2+mmsize*1], m2
%else ; %1 == 9/10
    ; add rounding (m2), shift to depth, clamp to [0, max] via m4/m3
    paddsw          m0,  m2, [srcq+wq*2+mmsize*0]
    paddsw          m1,  m2, [srcq+wq*2+mmsize*1]
    psraw           m0,  15 - %1
    psraw           m1,  15 - %1
    pmaxsw          m0,  m4
    pmaxsw          m1,  m4
    pminsw          m0,  m3
    pminsw          m1,  m3
    mov%2 [dstq+wq*2+mmsize*0], m0
    mov%2 [dstq+wq*2+mmsize*1], m1
%endif
    add             wq,  mmsize
    jl .loop_%2
%endmacro

; yuv2plane1_fn: emit one yuv2plane1_<bits>_<opt> function.
; %1 = output bit depth, %2 = xmm register count, %3 = cglobal arg count
%macro yuv2plane1_fn 3
cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset
    movsxdifnidn    wq,  wd
    ; round the pixel count up to a whole vector...
    add             wq,  mmsize - 1
    and             wq, ~(mmsize - 1)
    ; ...then point src/dst at the end and iterate with a negative index
%if %1 == 8
    add           dstq,  wq
%else ; %1 != 8
    lea           dstq, [dstq+wq*2]
%endif ; %1 == 8
%if %1 == 16
    lea           srcq, [srcq+wq*4]
%else ; %1 != 16
    lea           srcq, [srcq+wq*2]
%endif ; %1 == 16
    neg             wq

%if %1 == 8
    pxor            m4,  m4             ; zero

    ; create registers holding dither
    movq            m3, [ditherq]       ; dither
    test       offsetd, offsetd
    jz              .no_rot
%if mmsize == 16
    punpcklqdq      m3,  m3
%endif ; mmsize == 16
    ; offset != 0 (i.e. 3): rotate the 8 dither bytes by 3 positions
    PALIGNR         m3,  m3,  3,  m2
.no_rot:
%if mmsize == 8
    mova            m2,  m3
    punpckhbw       m3,  m4             ; byte->word
    punpcklbw       m2,  m4             ; byte->word
%else
    punpcklbw       m3,  m4
    mova            m2,  m3
%endif
%elif %1 == 9
    pxor            m4,  m4
    mova            m3, [pw_512]        ; 9-bit clip maximum + 1 domain
    mova            m2, [pw_32]         ; 9-bit rounding constant
%elif %1 == 10
    pxor            m4,  m4
    mova            m3, [pw_1024]
    mova            m2, [pw_16]
%else ; %1 == 16
%if cpuflag(sse4) ; sse4/avx
    mova            m4, [pd_4]          ; plain rounding; packusdw available
%else ; mmx/sse2
    mova            m4, [pd_4min0x40000] ; rounding pre-biased for packssdw path
    mova            m5, [minshort]
%endif ; mmx/sse2/sse4/avx
%endif ; %1 == ..

    ; actual pixel scaling
%if mmsize == 8
    yuv2plane1_mainloop %1, a
%else ; mmsize == 16
    ; pick aligned vs. unaligned stores depending on dst alignment
    test          dstq, 15
    jnz .unaligned
    yuv2plane1_mainloop %1, a
    REP_RET
.unaligned:
    yuv2plane1_mainloop %1, u
%endif ; mmsize == 8/16
    REP_RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
yuv2plane1_fn  8, 0, 5
yuv2plane1_fn 16, 0, 3

INIT_MMX mmxext
yuv2plane1_fn  9, 0, 3
yuv2plane1_fn 10, 0, 3
%endif

INIT_XMM sse2
yuv2plane1_fn  8, 5, 5
yuv2plane1_fn  9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 6, 3

INIT_XMM sse4
yuv2plane1_fn 16, 5, 3

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2plane1_fn  8, 5, 5
yuv2plane1_fn  9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 5, 3
%endif