;******************************************************************************
;* x86-optimized vertical line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*                    Kieran Kunhya <kieran@kunhya.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

; Constants used by the scalers below.
; minshort: 8 x 0x8000. Adding this to a signed 16-bit word (with wraparound)
; remaps the signed range onto the unsigned one; used on the 16-bit output
; paths that lack packusdw (pre-SSE4) or that pack with packssdw.
minshort:      times 8 dw 0x8000
; yuv2yuvX_<bits>_start: initial value of the dword accumulators in
; yuv2planeX. For 9/10 bit this is the rounding bias, i.e. half of the final
; right shift (psrad 27-bits: 9-bit -> 1<<17, bias 0x20000; 10-bit -> 1<<16,
; bias 0x10000). The 16-bit variant additionally subtracts 0x40000000,
; presumably to pre-compensate the later [minshort] addition -- see the
; %1 == 16 store path in yuv2planeX_fn.
yuv2yuvX_16_start:  times 4 dd 0x4000 - 0x40000000
yuv2yuvX_10_start:  times 4 dd 0x10000
yuv2yuvX_9_start:   times 4 dd 0x20000
; yuv2yuvX_<bits>_upper: clip maximum for the output range (2^bits - 1).
yuv2yuvX_10_upper:  times 8 dw 0x3ff
yuv2yuvX_9_upper:   times 8 dw 0x1ff
; pd_4 / pd_4min0x40000: rounding bias for the 16-bit yuv2plane1 path
; (psrad by 3 -> bias 4). The "min0x40000" variant also folds in the
; -0x40000 offset (== 0x8000 << 3) that the mmx/sse2 path later undoes by
; adding [minshort] after packssdw.
pd_4:          times 4 dd 4
pd_4min0x40000:times 4 dd 4 - (0x40000)
; pw_16/pw_32: rounding bias for the 10-/9-bit yuv2plane1 paths
; (psraw by 15-bits: 10-bit -> shift 5, bias 16; 9-bit -> shift 6, bias 32).
; pw_512/pw_1024: clip maxima for 9-/10-bit output (2^bits - 1, +1 here;
; used as pminsw operands m3 in yuv2plane1_fn).
pw_16:         times 8 dw 16
pw_32:         times 8 dw 32
pw_512:        times 8 dw 512
pw_1024:       times 8 dw 1024

SECTION .text

;-----------------------------------------------------------------------------
; vertical line scaling
;
; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW,
;                                     const uint8_t *dither, int offset)
; and
; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize,
;                                     const int16_t **src, uint8_t *dst, int dstW,
;                                     const uint8_t *dither, int offset)
;
; Scale one or $filterSize lines of source data to generate one line of output
; data. The input is 15-bit in int16_t if $output_size is [8,10] and 19-bit in
; int32_t if $output_size is 16. $filter is 12-bits. $filterSize is a multiple
; of 2. $offset is either 0 or 3. $dither holds 8 values.
;-----------------------------------------------------------------------------

; yuv2planeX_fn: emit one yuv2planeX_<bits> function.
;   %1 = output bit depth (8, 9, 10 or 16)
;   %2 = number of xmm/mmx registers used (cglobal's register count)
;   %3 = number of GPR arguments loaded by cglobal
;
; Argument registers (see the C prototype above):
;   r0  = filter (int16_t *, 12-bit coefficients)
;   r1m = filterSize (reloaded each pixel iteration into cntr_reg)
;   r2  = src (int16_t **, one pointer per filter tap)
;   r3  = dst
;   r4d = dstW (counted down; loop exits when it reaches 0)
;   r5  = dither pointer on entry, then reused as the pixel index
;   r6d = offset on entry, then reused as a scratch source pointer
%macro yuv2planeX_fn 3

%ifdef ARCH_X86_32
%define cntr_reg r1
%define movsx mov
%else
%define cntr_reg r11
%define movsx movsxd
%endif

cglobal yuv2planeX_%1, %3, 7, %2
%if %1 == 8 || %1 == 9 || %1 == 10
    ; m6 = constant zero, used for byte/word unpacking and as clip minimum
    pxor            m6,  m6
%endif ; %1 == 8/9/10

%if %1 == 8
%ifdef ARCH_X86_32
    ; reserve aligned stack space to spill the expanded dither dwords
    ; (x86-32 has too few xmm/mmx registers to keep them live)
%assign pad 0x2c - (stack_offset & 15)
    SUB             rsp, pad
%define m_dith m7
%else ; x86-64
%define m_dith m9
%endif ; x86-32

    ; create registers holding dither
    movq            m_dith, [r5]          ; load the 8 dither bytes
    test            r6d, r6d              ; offset != 0?
    jz              .no_rot
%if mmsize == 16
    punpcklqdq      m_dith, m_dith
%endif ; mmsize == 16
    PALIGNR         m_dith, m_dith, 3, m0 ; rotate dither by $offset (3) bytes
.no_rot:
    ; expand dither bytes to dwords and scale them up by 12 bits so they can
    ; be pre-loaded into the filter accumulators (which hold src*coeff sums
    ; at 15-bit input * 12-bit coefficient precision)
%if mmsize == 16
    punpcklbw       m_dith, m6
%ifdef ARCH_X86_64
    punpcklwd       m8, m_dith, m6
    pslld           m8, 12
%else ; x86-32
    punpcklwd       m5, m_dith, m6
    pslld           m5, 12
%endif ; x86-32/64
    punpckhwd       m_dith, m6
    pslld           m_dith, 12
%ifdef ARCH_X86_32
    mova  [rsp+ 0], m5
    mova  [rsp+16], m_dith
%endif
%else ; mmsize == 8
    punpcklbw       m5, m_dith, m6
    punpckhbw       m_dith, m6
    punpcklwd       m4, m5, m6
    punpckhwd       m5, m6
    punpcklwd       m3, m_dith, m6
    punpckhwd       m_dith, m6
    pslld           m4, 12
    pslld           m5, 12
    pslld           m3, 12
    pslld           m_dith, 12
    mova  [rsp+ 0], m4
    mova  [rsp+ 8], m5
    mova  [rsp+16], m3
    mova  [rsp+24], m_dith
%endif ; mmsize == 8/16
%endif ; %1 == 8

    xor             r5, r5                ; r5 = pixel index, from here on

.pixelloop:
%assign %%i 0
    ; the rep here is for the 8bit output mmx case, where dither covers
    ; 8 pixels but we can only handle 2 pixels per register, and thus 4
    ; pixels per iteration. In order to not have to keep track of where
    ; we are w.r.t. dithering, we unroll the mmx/8bit loop x2.
%if %1 == 8
%rep 16/mmsize
%endif ; %1 == 8

    ; initialize the two dword accumulators: either the pre-scaled dither
    ; (8-bit output) or the depth-specific rounding bias (9/10/16-bit)
%if %1 == 8
%ifdef ARCH_X86_32
    mova            m2, [rsp+mmsize*(0+%%i)]
    mova            m1, [rsp+mmsize*(1+%%i)]
%else ; x86-64
    mova            m2, m8
    mova            m1, m_dith
%endif ; x86-32/64
%else ; %1 == 9/10/16
    mova            m1, [yuv2yuvX_%1_start]
    mova            m2, m1
%endif ; %1 == 8/9/10/16
    movsx           cntr_reg, r1m         ; cntr_reg = filterSize (tap counter)
.filterloop_ %+ %%i:
    ; input pixels: two taps per iteration (filterSize is a multiple of 2)
    mov             r6, [r2+gprsize*cntr_reg-2*gprsize]
%if %1 == 16
    mova            m3, [r6+r5*4]
    mova            m5, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
    mova            m3, [r6+r5*2]
%endif ; %1 == 8/9/10/16
    mov             r6, [r2+gprsize*cntr_reg-gprsize]
%if %1 == 16
    mova            m4, [r6+r5*4]
    mova            m6, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
    mova            m4, [r6+r5*2]
%endif ; %1 == 8/9/10/16

    ; coefficients
    movd            m0, [r0+2*cntr_reg-4] ; coeff[0], coeff[1]
%if %1 == 16
    ; 16-bit output: 32-bit source samples, so multiply in dwords (sse4+)
    pshuflw         m7, m0, 0             ; coeff[0]
    pshuflw         m0, m0, 0x55          ; coeff[1]
    pmovsxwd        m7, m7                ; word -> dword
    pmovsxwd        m0, m0                ; word -> dword

    pmulld          m3, m7
    pmulld          m5, m7
    pmulld          m4, m0
    pmulld          m6, m0

    paddd           m2, m3
    paddd           m1, m5
    paddd           m2, m4
    paddd           m1, m6
%else ; %1 == 10/9/8
    ; 8/9/10-bit output: interleave the two taps' words and use pmaddwd to
    ; multiply-accumulate both taps in one instruction per register
    punpcklwd       m5, m3, m4
    punpckhwd       m3, m4
    SPLATD          m0, m0                ; broadcast the coeff[0]/coeff[1] pair

    pmaddwd         m5, m0
    pmaddwd         m3, m0

    paddd           m2, m5
    paddd           m1, m3
%endif ; %1 == 8/9/10/16

    sub             cntr_reg, 2           ; two taps consumed
    jg              .filterloop_ %+ %%i

    ; scale the accumulators back down to the output depth
%if %1 == 16
    psrad           m2, 31 - %1
    psrad           m1, 31 - %1
%else ; %1 == 10/9/8
    psrad           m2, 27 - %1
    psrad           m1, 27 - %1
%endif ; %1 == 8/9/10/16

    ; pack, clip and store
%if %1 == 8
    packssdw        m2, m1
    packuswb        m2, m2                ; clip to [0,255]
    movh    [r3+r5*1], m2
%else ; %1 == 9/10/16
%if %1 == 16
    packssdw        m2, m1
    paddw           m2, [minshort]        ; signed -> unsigned words
%else ; %1 == 9/10
%if cpuflag(sse4)
    packusdw        m2, m1                ; unsigned saturation handles min clip
%else ; mmx2/sse2
    packssdw        m2, m1
    pmaxsw          m2, m6                ; clip negative values to 0
%endif ; mmx2/sse2/sse4/avx
    pminsw          m2, [yuv2yuvX_%1_upper] ; clip to 2^bits - 1
%endif ; %1 == 9/10/16
    mova    [r3+r5*2], m2
%endif ; %1 == 8/9/10/16

    add             r5, mmsize/2          ; advance pixel index
    sub             r4d, mmsize/2         ; sets flags consumed by jg below
%if %1 == 8
%assign %%i %%i+2
%endrep
%endif ; %1 == 8
    jg              .pixelloop

%if %1 == 8
%ifdef ARCH_X86_32
    ADD             rsp, pad              ; release the dither spill space
    RET
%else ; x86-64
    REP_RET
%endif ; x86-32/64
%else ; %1 == 9/10/16
    REP_RET
%endif ; %1 == 8/9/10/16
%endmacro

; Instantiations. The mmx2 versions are only useful on x86-32 (on x86-64,
; sse2 is always available). The 16-bit variant needs sse4 for
; pmovsxwd/pmulld/packusdw and therefore only exists from sse4 up.
%define PALIGNR PALIGNR_MMX
%ifdef ARCH_X86_32
INIT_MMX mmx2
yuv2planeX_fn  8,  0, 7
yuv2planeX_fn  9,  0, 5
yuv2planeX_fn 10,  0, 5
%endif

INIT_XMM sse2
yuv2planeX_fn  8, 10, 7
yuv2planeX_fn  9,  7, 5
yuv2planeX_fn 10,  7, 5

%define PALIGNR PALIGNR_SSSE3
INIT_XMM sse4
yuv2planeX_fn  8, 10, 7
yuv2planeX_fn  9,  7, 5
yuv2planeX_fn 10,  7, 5
yuv2planeX_fn 16,  8, 5

INIT_XMM avx
yuv2planeX_fn  8, 10, 7
yuv2planeX_fn  9,  7, 5
yuv2planeX_fn 10,  7, 5

; yuv2plane1 inner loop: scale one batch of pixels per iteration.
; %1 = output bpc, %2 = store alignment suffix (u/a, i.e. movu/mova).
; Expects the constants set up by yuv2plane1_fn (see below) and r2 = negative
; byte/pixel counter running up to 0.
%macro yuv2plane1_mainloop 2
.loop_%2:
%if %1 == 8
    ; m2/m3 = dither words; >>7 from 15-bit intermediates to 8-bit output
    paddsw          m0, m2, [r0+r2*2+mmsize*0]
    paddsw          m1, m3, [r0+r2*2+mmsize*1]
    psraw           m0, 7
    psraw           m1, 7
    packuswb        m0, m1
    mov%2   [r1+r2], m0
%elif %1 == 16
    ; m4 = rounding bias; >>3 from 19-bit intermediates to 16-bit output
    paddd           m0, m4, [r0+r2*4+mmsize*0]
    paddd           m1, m4, [r0+r2*4+mmsize*1]
    paddd           m2, m4, [r0+r2*4+mmsize*2]
    paddd           m3, m4, [r0+r2*4+mmsize*3]
    psrad           m0, 3
    psrad           m1, 3
    psrad           m2, 3
    psrad           m3, 3
%if cpuflag(sse4) ; avx/sse4
    packusdw        m0, m1
    packusdw        m2, m3
%else ; mmx/sse2
    ; no packusdw: pack with signed saturation, then add 0x8000 to undo the
    ; 0x40000 offset folded into the bias (pd_4min0x40000)
    packssdw        m0, m1
    packssdw        m2, m3
    paddw           m0, m5
    paddw           m2, m5
%endif ; mmx/sse2/sse4/avx
    mov%2 [r1+r2*2], m0
    mov%2 [r1+r2*2+mmsize], m2
%else
    ; 9/10-bit: m2 = rounding bias, m4 = 0 (min clip), m3 = max clip
    paddsw          m0, m2, [r0+r2*2+mmsize*0]
    paddsw          m1, m2, [r0+r2*2+mmsize*1]
    psraw           m0, 15 - %1
    psraw           m1, 15 - %1
    pmaxsw          m0, m4
    pmaxsw          m1, m4
    pminsw          m0, m3
    pminsw          m1, m3
    mov%2 [r1+r2*2], m0
    mov%2 [r1+r2*2+mmsize], m1
%endif
    add             r2, mmsize
    jl .loop_%2                           ; r2 counts up from negative to 0
%endmacro
; yuv2plane1_fn: emit one yuv2plane1_<bits> function (single source line,
; no filtering -- just round/dither, shift, clip and store).
;   %1 = output bit depth (8, 9, 10 or 16)
;   %2 = number of xmm/mmx registers used
;   %3 = number of GPR arguments loaded by cglobal
;
; Argument registers (see the C prototype at the top of the file):
;   r0 = src, r1 = dst, r2 = dstW, r3 = dither, r4d = offset
; r0/r1 are advanced to the end of the line and r2 is negated so the main
; loop can count a single register up toward zero.
%macro yuv2plane1_fn 3
cglobal yuv2plane1_%1, %3, %3, %2
    ; round dstW up to a multiple of mmsize pixels; the buffers are assumed
    ; padded so the overread/overwrite is safe -- TODO confirm with callers
    add             r2, mmsize - 1
    and             r2, ~(mmsize - 1)
%if %1 == 8
    add             r1, r2                ; dst is 1 byte/pixel
%else ; %1 != 8
    lea             r1, [r1+r2*2]         ; dst is 2 bytes/pixel
%endif ; %1 == 8
%if %1 == 16
    lea             r0, [r0+r2*4]         ; src is int32_t for 16-bit output
%else ; %1 != 16
    lea             r0, [r0+r2*2]         ; src is int16_t otherwise
%endif ; %1 == 16
    neg             r2                    ; r2 = -pixels, counts up to 0

    ; per-depth constant setup for yuv2plane1_mainloop
%if %1 == 8
    pxor            m4, m4                ; zero

    ; create registers holding dither
    movq            m3, [r3]              ; dither
    test            r4d, r4d              ; offset != 0?
    jz              .no_rot
%if mmsize == 16
    punpcklqdq      m3, m3
%endif ; mmsize == 16
    PALIGNR_MMX     m3, m3, 3, m2         ; rotate dither by 3 bytes
.no_rot:
%if mmsize == 8
    mova            m2, m3
    punpckhbw       m3, m4                ; byte->word
    punpcklbw       m2, m4                ; byte->word
%else
    punpcklbw       m3, m4                ; byte->word; m2 == m3 covers the
    mova            m2, m3                ; full 8-pixel dither period
%endif
%elif %1 == 9
    pxor            m4, m4                ; min clip (0)
    mova            m3, [pw_512]          ; max clip
    mova            m2, [pw_32]           ; rounding bias (half of 1 << 6)
%elif %1 == 10
    pxor            m4, m4                ; min clip (0)
    mova            m3, [pw_1024]         ; max clip
    mova            m2, [pw_16]           ; rounding bias (half of 1 << 5)
%else ; %1 == 16
%if cpuflag(sse4) ; sse4/avx
    mova            m4, [pd_4]            ; rounding bias for >>3
%else ; mmx/sse2
    mova            m4, [pd_4min0x40000]  ; bias with signed-pack offset folded in
    mova            m5, [minshort]        ; undoes that offset after packssdw
%endif ; mmx/sse2/sse4/avx
%endif ; %1 == ..

    ; actual pixel scaling
%if mmsize == 8
    yuv2plane1_mainloop %1, a
%else ; mmsize == 16
    ; pick aligned or unaligned stores depending on dst alignment
    test            r1, 15
    jnz .unaligned
    yuv2plane1_mainloop %1, a
    REP_RET
.unaligned:
    yuv2plane1_mainloop %1, u
%endif ; mmsize == 8/16
    REP_RET
%endmacro

; Instantiations; mmx/mmx2 variants only make sense on x86-32.
%ifdef ARCH_X86_32
INIT_MMX mmx
yuv2plane1_fn  8, 0, 5
yuv2plane1_fn 16, 0, 3

INIT_MMX mmx2
yuv2plane1_fn  9, 0, 3
yuv2plane1_fn 10, 0, 3
%endif

INIT_XMM sse2
yuv2plane1_fn  8, 5, 5
yuv2plane1_fn  9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 6, 3

INIT_XMM sse4
yuv2plane1_fn 16, 5, 3

INIT_XMM avx
yuv2plane1_fn  8, 5, 5
yuv2plane1_fn  9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 5, 3