1;****************************************************************************** 2;* x86-optimized horizontal line scaling functions 3;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com> 4;* 5;* This file is part of Libav. 6;* 7;* Libav is free software; you can redistribute it and/or 8;* modify it under the terms of the GNU Lesser General Public 9;* License as published by the Free Software Foundation; either 10;* version 2.1 of the License, or (at your option) any later version. 11;* 12;* Libav is distributed in the hope that it will be useful, 13;* but WITHOUT ANY WARRANTY; without even the implied warranty of 14;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15;* Lesser General Public License for more details. 16;* 17;* You should have received a copy of the GNU Lesser General Public 18;* License along with Libav; if not, write to the Free Software 19;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20;****************************************************************************** 21 22%include "x86inc.asm" 23%include "x86util.asm" 24 25SECTION_RODATA 26 27max_19bit_int: times 4 dd 0x7ffff 28max_19bit_flt: times 4 dd 524287.0 29minshort: times 8 dw 0x8000 30unicoeff: times 4 dd 0x20000000 31 32SECTION .text 33 34;----------------------------------------------------------------------------- 35; horizontal line scaling 36; 37; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt> 38; (SwsContext *c, int{16,32}_t *dst, 39; int dstW, const uint{8,16}_t *src, 40; const int16_t *filter, 41; const int32_t *filterPos, int filterSize); 42; 43; Scale one horizontal line. Input is either 8-bits width or 16-bits width 44; ($source_width can be either 8, 9, 10 or 16, difference is whether we have to 45; downscale before multiplying). Filter is 14-bits. Output is either 15bits 46; (in int16_t) or 19bits (in int32_t), as given in $intermediate_nbits. Each 47; output pixel is generated from $filterSize input pixels, the position of 48; the first pixel is given in filterPos[nOutputPixel]. 49;----------------------------------------------------------------------------- 50 51; SCALE_FUNC source_width, intermediate_nbits, filtersize, filtersuffix, opt, n_args, n_xmm 52%macro SCALE_FUNC 7 53cglobal hscale%1to%2_%4_%5, %6, 7, %7 54%ifdef ARCH_X86_64 55 movsxd r2, r2d 56%define mov32 movsxd 57%else ; x86-32 58%define mov32 mov 59%endif ; x86-64 60%if %2 == 19 61%if mmsize == 8 ; mmx 62 mova m2, [max_19bit_int] 63%elifidn %5, sse4 64 mova m2, [max_19bit_int] 65%else ; ssse3/sse2 66 mova m2, [max_19bit_flt] 67%endif ; mmx/sse2/ssse3/sse4 68%endif ; %2 == 19 69%if %1 == 16 70 mova m6, [minshort] 71 mova m7, [unicoeff] 72%elif %1 == 8 73 pxor m3, m3 74%endif ; %1 == 8/16 75 76%if %1 == 8 77%define movlh movd 78%define movbh movh 79%define srcmul 1 80%else ; %1 == 9-16 81%define movlh movq 82%define movbh movu 83%define srcmul 2 84%endif ; %1 == 8/9-16 85 86%ifnidn %3, X 87 88 ; setup loop 89%if %3 == 8 90 shl r2, 1 ; this allows *16 (i.e. now *8) in lea instructions for the 8-tap filter 91%define r2shr 1 92%else ; %3 == 4 93%define r2shr 0 94%endif ; %3 == 8 95 lea r4, [r4+r2*8] 96%if %2 == 15 97 lea r1, [r1+r2*(2>>r2shr)] 98%else ; %2 == 19 99 lea r1, [r1+r2*(4>>r2shr)] 100%endif ; %2 == 15/19 101 lea r5, [r5+r2*(4>>r2shr)] 102 neg r2 103 104.loop: 105%if %3 == 4 ; filterSize == 4 scaling 106 ; load 2x4 or 4x4 source pixels into m0/m1 107 mov32 r0, dword [r5+r2*4+0] ; filterPos[0] 108 mov32 r6, dword [r5+r2*4+4] ; filterPos[1] 109 movlh m0, [r3+r0*srcmul] ; src[filterPos[0] + {0,1,2,3}] 110%if mmsize == 8 111 movlh m1, [r3+r6*srcmul] ; src[filterPos[1] + {0,1,2,3}] 112%else ; mmsize == 16 113%if %1 > 8 114 movhps m0, [r3+r6*srcmul] ; src[filterPos[1] + {0,1,2,3}] 115%else ; %1 == 8 116 movd m4, [r3+r6*srcmul] ; src[filterPos[1] + {0,1,2,3}] 117%endif 118 mov32 r0, dword [r5+r2*4+8] ; filterPos[2] 119 mov32 r6, dword [r5+r2*4+12] ; filterPos[3] 120 movlh m1, [r3+r0*srcmul] ; src[filterPos[2] + {0,1,2,3}] 121%if %1 > 8 122 movhps m1, [r3+r6*srcmul] ; src[filterPos[3] + {0,1,2,3}] 123%else ; %1 == 8 124 movd m5, [r3+r6*srcmul] ; src[filterPos[3] + {0,1,2,3}] 125 punpckldq m0, m4 126 punpckldq m1, m5 127%endif ; %1 == 8 && %5 <= ssse 128%endif ; mmsize == 8/16 129%if %1 == 8 130 punpcklbw m0, m3 ; byte -> word 131 punpcklbw m1, m3 ; byte -> word 132%endif ; %1 == 8 133 134 ; multiply with filter coefficients 135%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll 136 ; add back 0x8000 * sum(coeffs) after the horizontal add 137 psubw m0, m6 138 psubw m1, m6 139%endif ; %1 == 16 140 pmaddwd m0, [r4+r2*8+mmsize*0] ; *= filter[{0,1,..,6,7}] 141 pmaddwd m1, [r4+r2*8+mmsize*1] ; *= filter[{8,9,..,14,15}] 142 143 ; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix) 144%if mmsize == 8 ; mmx 145 movq m4, m0 146 punpckldq m0, m1 147 punpckhdq m4, m1 148 paddd m0, m4 149%elifidn %5, sse2 150 mova m4, m0 151 shufps m0, m1, 10001000b 152 shufps m4, m1, 11011101b 153 paddd m0, m4 154%else ; ssse3/sse4 155 phaddd m0, m1 ; filter[{ 0, 1, 2, 3}]*src[filterPos[0]+{0,1,2,3}], 156 ; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}], 157 ; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}], 158 ; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}] 159%endif ; mmx/sse2/ssse3/sse4 160%else ; %3 == 8, i.e. filterSize == 8 scaling 161 ; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5 162 mov32 r0, dword [r5+r2*2+0] ; filterPos[0] 163 mov32 r6, dword [r5+r2*2+4] ; filterPos[1] 164 movbh m0, [r3+ r0 *srcmul] ; src[filterPos[0] + {0,1,2,3,4,5,6,7}] 165%if mmsize == 8 166 movbh m1, [r3+(r0+4)*srcmul] ; src[filterPos[0] + {4,5,6,7}] 167 movbh m4, [r3+ r6 *srcmul] ; src[filterPos[1] + {0,1,2,3}] 168 movbh m5, [r3+(r6+4)*srcmul] ; src[filterPos[1] + {4,5,6,7}] 169%else ; mmsize == 16 170 movbh m1, [r3+ r6 *srcmul] ; src[filterPos[1] + {0,1,2,3,4,5,6,7}] 171 mov32 r0, dword [r5+r2*2+8] ; filterPos[2] 172 mov32 r6, dword [r5+r2*2+12] ; filterPos[3] 173 movbh m4, [r3+ r0 *srcmul] ; src[filterPos[2] + {0,1,2,3,4,5,6,7}] 174 movbh m5, [r3+ r6 *srcmul] ; src[filterPos[3] + {0,1,2,3,4,5,6,7}] 175%endif ; mmsize == 8/16 176%if %1 == 8 177 punpcklbw m0, m3 ; byte -> word 178 punpcklbw m1, m3 ; byte -> word 179 punpcklbw m4, m3 ; byte -> word 180 punpcklbw m5, m3 ; byte -> word 181%endif ; %1 == 8 182 183 ; multiply 184%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll 185 ; add back 0x8000 * sum(coeffs) after the horizontal add 186 psubw m0, m6 187 psubw m1, m6 188 psubw m4, m6 189 psubw m5, m6 190%endif ; %1 == 16 191 pmaddwd m0, [r4+r2*8+mmsize*0] ; *= filter[{0,1,..,6,7}] 192 pmaddwd m1, [r4+r2*8+mmsize*1] ; *= filter[{8,9,..,14,15}] 193 pmaddwd m4, [r4+r2*8+mmsize*2] ; *= filter[{16,17,..,22,23}] 194 pmaddwd m5, [r4+r2*8+mmsize*3] ; *= filter[{24,25,..,30,31}] 195 196 ; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix) 197%if mmsize == 8 198 paddd m0, m1 199 paddd m4, m5 200 movq m1, m0 201 punpckldq m0, m4 202 punpckhdq m1, m4 203 paddd m0, m1 204%elifidn %5, sse2 205%if %1 == 8 206%define mex m6 207%else 208%define mex m3 209%endif 210 ; emulate horizontal add as transpose + vertical add 211 mova mex, m0 212 punpckldq m0, m1 213 punpckhdq mex, m1 214 paddd m0, mex 215 mova m1, m4 216 punpckldq m4, m5 217 punpckhdq m1, m5 218 paddd m4, m1 219 mova m1, m0 220 punpcklqdq m0, m4 221 punpckhqdq m1, m4 222 paddd m0, m1 223%else ; ssse3/sse4 224 ; FIXME if we rearrange the filter in pairs of 4, we can 225 ; load pixels likewise and use 2 x paddd + phaddd instead 226 ; of 3 x phaddd here, faster on older cpus 227 phaddd m0, m1 228 phaddd m4, m5 229 phaddd m0, m4 ; filter[{ 0, 1,..., 6, 7}]*src[filterPos[0]+{0,1,...,6,7}], 230 ; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}], 231 ; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}], 232 ; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}] 233%endif ; mmx/sse2/ssse3/sse4 234%endif ; %3 == 4/8 235 236%else ; %3 == X, i.e. any filterSize scaling 237 238%ifidn %4, X4 239%define r6sub 4 240%else ; %4 == X || %4 == X8 241%define r6sub 0 242%endif ; %4 ==/!= X4 243%ifdef ARCH_X86_64 244 push r12 245 movsxd r6, r6d ; filterSize 246 lea r12, [r3+(r6-r6sub)*srcmul] ; &src[filterSize&~4] 247%define src_reg r11 248%define r1x r10 249%define filter2 r12 250%else ; x86-32 251 lea r0, [r3+(r6-r6sub)*srcmul] ; &src[filterSize&~4] 252 mov r6m, r0 253%define src_reg r3 254%define r1x r1 255%define filter2 r6m 256%endif ; x86-32/64 257 lea r5, [r5+r2*4] 258%if %2 == 15 259 lea r1, [r1+r2*2] 260%else ; %2 == 19 261 lea r1, [r1+r2*4] 262%endif ; %2 == 15/19 263 movifnidn r1mp, r1 264 neg r2 265 266.loop: 267 mov32 r0, dword [r5+r2*4+0] ; filterPos[0] 268 mov32 r1x, dword [r5+r2*4+4] ; filterPos[1] 269 ; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)? 270 pxor m4, m4 271 pxor m5, m5 272 mov src_reg, r3mp 273 274.innerloop: 275 ; load 2x4 (mmx) or 2x8 (sse) source pixels into m0/m1 -> m4/m5 276 movbh m0, [src_reg+r0 *srcmul] ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}] 277 movbh m1, [src_reg+(r1x+r6sub)*srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}] 278%if %1 == 8 279 punpcklbw m0, m3 280 punpcklbw m1, m3 281%endif ; %1 == 8 282 283 ; multiply 284%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll 285 ; add back 0x8000 * sum(coeffs) after the horizontal add 286 psubw m0, m6 287 psubw m1, m6 288%endif ; %1 == 16 289 pmaddwd m0, [r4 ] ; filter[{0,1,2,3(,4,5,6,7)}] 290 pmaddwd m1, [r4+(r6+r6sub)*2] ; filter[filtersize+{0,1,2,3(,4,5,6,7)}] 291 paddd m4, m0 292 paddd m5, m1 293 add r4, mmsize 294 add src_reg, srcmul*mmsize/2 295 cmp src_reg, filter2 ; while (src += 4) < &src[filterSize] 296 jl .innerloop 297 298%ifidn %4, X4 299 mov32 r1x, dword [r5+r2*4+4] ; filterPos[1] 300 movlh m0, [src_reg+r0 *srcmul] ; split last 4 srcpx of dstpx[0] 301 sub r1x, r6 ; and first 4 srcpx of dstpx[1] 302%if %1 > 8 303 movhps m0, [src_reg+(r1x+r6sub)*srcmul] 304%else ; %1 == 8 305 movd m1, [src_reg+(r1x+r6sub)*srcmul] 306 punpckldq m0, m1 307%endif ; %1 == 8 && %5 <= ssse 308%if %1 == 8 309 punpcklbw m0, m3 310%endif ; %1 == 8 311%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll 312 ; add back 0x8000 * sum(coeffs) after the horizontal add 313 psubw m0, m6 314%endif ; %1 == 16 315 pmaddwd m0, [r4] 316%endif ; %4 == X4 317 318 lea r4, [r4+(r6+r6sub)*2] 319 320%if mmsize == 8 ; mmx 321 movq m0, m4 322 punpckldq m4, m5 323 punpckhdq m0, m5 324 paddd m0, m4 325%else ; mmsize == 16 326%ifidn %5, sse2 327 mova m1, m4 328 punpcklqdq m4, m5 329 punpckhqdq m1, m5 330 paddd m4, m1 331%else ; ssse3/sse4 332 phaddd m4, m5 333%endif ; sse2/ssse3/sse4 334%ifidn %4, X4 335 paddd m4, m0 336%endif ; %3 == X4 337%ifidn %5, sse2 338 pshufd m4, m4, 11011000b 339 movhlps m0, m4 340 paddd m0, m4 341%else ; ssse3/sse4 342 phaddd m4, m4 343 SWAP 0, 4 344%endif ; sse2/ssse3/sse4 345%endif ; mmsize == 8/16 346%endif ; %3 ==/!= X 347 348%if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned 349 paddd m0, m7 350%endif ; %1 == 16 351 352 ; clip, store 353 psrad m0, 14 + %1 - %2 354%ifidn %3, X 355 movifnidn r1, r1mp 356%endif ; %3 == X 357%if %2 == 15 358 packssdw m0, m0 359%ifnidn %3, X 360 movh [r1+r2*(2>>r2shr)], m0 361%else ; %3 == X 362 movd [r1+r2*2], m0 363%endif ; %3 ==/!= X 364%else ; %2 == 19 365%if mmsize == 8 366 PMINSD_MMX m0, m2, m4 367%elifidn %5, sse4 368 pminsd m0, m2 369%else ; sse2/ssse3 370 cvtdq2ps m0, m0 371 minps m0, m2 372 cvtps2dq m0, m0 373%endif ; mmx/sse2/ssse3/sse4 374%ifnidn %3, X 375 mova [r1+r2*(4>>r2shr)], m0 376%else ; %3 == X 377 movq [r1+r2*4], m0 378%endif ; %3 ==/!= X 379%endif ; %2 == 15/19 380%ifnidn %3, X 381 add r2, (mmsize<<r2shr)/4 ; both 8tap and 4tap really only do 4 pixels (or for mmx: 2 pixels) 382 ; per iteration. see "shl r2,1" above as for why we do this 383%else ; %3 == X 384 add r2, 2 385%endif ; %3 ==/!= X 386 jl .loop 387%ifnidn %3, X 388 REP_RET 389%else ; %3 == X 390%ifdef ARCH_X86_64 391 pop r12 392 RET 393%else ; x86-32 394 REP_RET 395%endif ; x86-32/64 396%endif ; %3 ==/!= X 397%endmacro 398 399; SCALE_FUNCS source_width, intermediate_nbits, opt, n_xmm 400%macro SCALE_FUNCS 4 401SCALE_FUNC %1, %2, 4, 4, %3, 6, %4 402SCALE_FUNC %1, %2, 8, 8, %3, 6, %4 403%if mmsize == 8 404SCALE_FUNC %1, %2, X, X, %3, 7, %4 405%else 406SCALE_FUNC %1, %2, X, X4, %3, 7, %4 407SCALE_FUNC %1, %2, X, X8, %3, 7, %4 408%endif 409%endmacro 410 411; SCALE_FUNCS2 opt, 8_xmm_args, 9to10_xmm_args, 16_xmm_args 412%macro SCALE_FUNCS2 4 413%ifnidn %1, sse4 414SCALE_FUNCS 8, 15, %1, %2 415SCALE_FUNCS 9, 15, %1, %3 416SCALE_FUNCS 10, 15, %1, %3 417SCALE_FUNCS 16, 15, %1, %4 418%endif ; !sse4 419SCALE_FUNCS 8, 19, %1, %2 420SCALE_FUNCS 9, 19, %1, %3 421SCALE_FUNCS 10, 19, %1, %3 422SCALE_FUNCS 16, 19, %1, %4 423%endmacro 424 425%ifdef ARCH_X86_32 426INIT_MMX 427SCALE_FUNCS2 mmx, 0, 0, 0 428%endif 429INIT_XMM 430SCALE_FUNCS2 sse2, 6, 7, 8 431SCALE_FUNCS2 ssse3, 6, 6, 8 432SCALE_FUNCS2 sse4, 6, 6, 8 433