;******************************************************************************
;* MMX/SSE2-optimized functions for the RV40 decoder
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

align 16
; Multiplier for pmulhrsw: (x * 1024 + 0x4000) >> 15 == (x + 16) >> 5,
; i.e. the same bias-and-shift the non-SSSE3 weight path does with pw_16.
pw_1024:   times 8 dw 1 << (16 - 6)

; Six-tap coefficients as signed byte pairs, consumed by pmaddubsw in the
; SSSE3 filters. Three coefficient sets of 32 bytes each; within a set the
; first 16 bytes are the (outer, next-to-outer) tap pair and the second
; 16 bytes are the two centre taps.
sixtap_filter_hb_m:    times 8 db   1, -5
                       times 8 db  52, 20
                       ; multiplied by 2 to have the same shift
                       times 8 db   2, -10
                       times 8 db  40,  40
                       ; back to normal
                       times 8 db   1, -5
                       times 8 db  20, 52

; Same coefficients as words, consumed by pmullw in the MMX/SSE2 filters.
; Three coefficient sets of 64 bytes each, read at offsets +0, +16, +32, +48.
sixtap_filter_v_m:     times 8 dw   1
                       times 8 dw  -5
                       times 8 dw  52
                       times 8 dw  20
                       ; multiplied by 2 to have the same shift
                       times 8 dw   2
                       times 8 dw -10
                       times 8 dw  40
                       times 8 dw  40
                       ; back to normal
                       times 8 dw   1
                       times 8 dw  -5
                       times 8 dw  20
                       times 8 dw  52

; Under PIC the table address is held in picregq (loaded with lea at function
; entry), otherwise the table symbol is used directly.
; NOTE(review): sixtap_filter_hw is not referenced in this file, and
; sixtap_filter_hw_m is not defined in this file - confirm before using it.
%ifdef PIC
%define sixtap_filter_hw   picregq
%define sixtap_filter_hb   picregq
%define sixtap_filter_v    picregq
%define npicregs 1
%else
%define sixtap_filter_hw   sixtap_filter_hw_m
%define sixtap_filter_hb   sixtap_filter_hb_m
%define sixtap_filter_v    sixtap_filter_v_m
%define npicregs 0
%endif

; pshufb masks for the SSSE3 horizontal filter. The row is loaded from src-2,
; so byte index i corresponds to pixel x-2+i.
;   shuf1: pairs (x-2, x-1)   shuf2: pairs (x, x+1)   shuf3: pairs (x+3, x+2)
filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5,  5, 6,  6,  7,  7,  8
filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7,  7, 8,  8,  9,  9, 10
filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11

cextern  pw_32
cextern  pw_16
cextern  pw_512

SECTION .text

;-----------------------------------------------------------------------------
; subpel MC functions:
;
; void ff_[put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int deststride,
;                                         uint8_t *src, int srcstride,
;                                         int len, int m);
;
; len is the number of rows to produce. m is added to the address of the
; coefficient table before the coefficients are read, i.e. it is a byte
; offset selecting one coefficient set.
;-----------------------------------------------------------------------------

; LOAD reg, table
; Turns the offset held in <reg> into the address of a coefficient set.
;   %1 = name of the register holding the offset (sign-extended from 32 bits
;        on WIN64 only)
;   %2 = table symbol; ignored under PIC, where picregq is added instead
%macro LOAD  2
%if WIN64
   movsxd   %1q, %1d
%endif
%ifdef PIC
   add      %1q, picregq
%else
   add      %1q, %2
%endif
%endmacro

; STORE result, scratch, put|avg
; Packs the words of <result> to bytes with unsigned saturation and writes
; them to [dstq] with movh. For avg, the current contents of [dstq] are first
; loaded into <scratch> and averaged with the packed result using PAVGB.
;   %1 = register holding 16-bit results (overwritten)
;   %2 = scratch register (only written for avg)
;   %3 = put or avg
%macro STORE 3
%ifidn %3, avg
    movh      %2, [dstq]
%endif
    packuswb  %1, %1
%ifidn %3, avg
    PAVGB     %1, %2
%endif
    movh  [dstq], %1
%endmacro

; FILTER_V put|avg
; Emits <put|avg>_rv40_qpel_v: vertical six-tap filter using word multiplies.
; Per output row:
;   out = (C05*(r[-2]+r[3]) + C14*(r[-1]+r[2]) + C2*r[0] + C3*r[1] + 32) >> 6
; where r[k] is the source row k lines below the output row and the C values
; are read from the selected coefficient set at offsets +0, +16, +32, +48.
%macro FILTER_V 1
cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
%ifdef PIC
    lea  picregq, [sixtap_filter_v_m]
%endif
    pxor      m7, m7                       ; m7 = 0, used to widen bytes to words
    LOAD      my, sixtap_filter_v          ; myq = address of coefficient set

    ; read 5 lines: rows -2..+2 into m0..m4, leaving srcq on row +1
    sub     srcq, srcstrideq
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    add     srcq, srcstrideq
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

    ; keep the coefficients in registers when m8..m11 exist, otherwise use
    ; them as memory operands
%ifdef m8
    mova      m8, [myq+ 0]
    mova      m9, [myq+16]
    mova     m10, [myq+32]
    mova     m11, [myq+48]
%define COEFF05  m8
%define COEFF14  m9
%define COEFF2   m10
%define COEFF3   m11
%else
%define COEFF05  [myq+ 0]
%define COEFF14  [myq+16]
%define COEFF2   [myq+32]
%define COEFF3   [myq+48]
%endif
.nextrow:
    mova      m6, m1
    movh      m5, [srcq+2*srcstrideq]      ; read new row (row +3)
    paddw     m6, m4                       ; r[-1] + r[2]
    punpcklbw m5, m7
    pmullw    m6, COEFF14
    paddw     m0, m5                       ; r[-2] + r[3]
    pmullw    m0, COEFF05
    paddw     m6, m0
    mova      m0, m1                       ; slide the row window down by one
    paddw     m6, [pw_32]                  ; rounding bias for the >> 6
    mova      m1, m2
    pmullw    m2, COEFF2
    paddw     m6, m2
    mova      m2, m3
    pmullw    m3, COEFF3
    paddw     m6, m3

    ; round/clip/store
    mova      m3, m4
    psraw     m6, 6
    mova      m4, m5
    STORE     m6, m5, %1                   ; m5 is free as scratch after the copy

    ; go to next line
    add       dstq, dststrideq
    add       srcq, srcstrideq
    dec       heightd                      ; next row
    jg        .nextrow
    REP_RET
%endmacro

; FILTER_H put|avg
; Emits <put|avg>_rv40_qpel_h: horizontal six-tap filter using word
; multiplies. Uses the word coefficient table (sixtap_filter_v_m) and the
; same formula as FILTER_V, with pixels x-2..x+3 of the current row in place
; of rows -2..+3.
%macro FILTER_H  1
cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
    lea  picregq, [sixtap_filter_v_m]
%endif
    pxor      m7, m7                       ; m7 = 0, used to widen bytes to words
    LOAD      mx, sixtap_filter_v          ; mxq = address of coefficient set
    mova      m6, [pw_32]                  ; rounding bias for the >> 6
%ifdef m8
    mova      m8, [mxq+ 0]
    mova      m9, [mxq+16]
    mova     m10, [mxq+32]
    mova     m11, [mxq+48]
%define COEFF05  m8
%define COEFF14  m9
%define COEFF2   m10
%define COEFF3   m11
%else
%define COEFF05  [mxq+ 0]
%define COEFF14  [mxq+16]
%define COEFF2   [mxq+32]
%define COEFF3   [mxq+48]
%endif
.nextrow:
    movq      m0, [srcq-2]
    movq      m5, [srcq+3]
    movq      m1, [srcq-1]
    movq      m4, [srcq+2]
    punpcklbw m0, m7
    punpcklbw m5, m7
    punpcklbw m1, m7
    punpcklbw m4, m7
    movq      m2, [srcq-0]
    movq      m3, [srcq+1]
    paddw     m0, m5                       ; p[-2] + p[3]
    paddw     m1, m4                       ; p[-1] + p[2]
    punpcklbw m2, m7
    punpcklbw m3, m7
    pmullw    m0, COEFF05
    pmullw    m1, COEFF14
    pmullw    m2, COEFF2
    pmullw    m3, COEFF3
    paddw     m0, m6
    paddw     m1, m2
    paddw     m0, m3
    paddw     m0, m1
    psraw     m0, 6
    STORE     m0, m1, %1

    ; go to next line
    add       dstq, dststrideq
    add       srcq, srcstrideq
    dec       heightd                      ; next row
    jg        .nextrow
    REP_RET
%endmacro

; MMX-register variants are only built for 32-bit x86.
%if ARCH_X86_32
INIT_MMX  mmx
FILTER_V  put
FILTER_H  put

INIT_MMX  mmxext
FILTER_V  avg
FILTER_H  avg

INIT_MMX  3dnow
FILTER_V  avg
FILTER_H  avg
%endif

INIT_XMM  sse2
FILTER_H  put
FILTER_H  avg
FILTER_V  put
FILTER_V  avg

; FILTER_SSSE3 put|avg
; Emits <put|avg>_rv40_qpel_v and <put|avg>_rv40_qpel_h using the byte-pair
; coefficient table with pmaddubsw. Rounding and the shift by 6 are done in
; one step by pmulhrsw with pw_512: (x * 512 + 0x4000) >> 15 == (x + 32) >> 6.
%macro FILTER_SSSE3 1
cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
%ifdef PIC
    lea  picregq, [sixtap_filter_hb_m]
%endif

    ; read 5 lines: rows -2..+2 into m0..m4 (as bytes), leaving srcq on row +3
    sub     srcq, srcstrideq
    LOAD      my, sixtap_filter_hb         ; myq = address of coefficient set
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    add     srcq, srcstrideq
    mova      m5, [myq]                    ; outer tap pair
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]
    lea     srcq, [srcq+2*srcstrideq]

.nextrow:
    mova      m6, m2
    punpcklbw m0, m1                       ; interleave rows -2 and -1
    punpcklbw m6, m3                       ; interleave rows 0 and +1
    pmaddubsw m0, m5
    pmaddubsw m6, [myq+16]                 ; centre tap pair
    movh      m7, [srcq]                   ; read new row (row +3)
    paddw     m6, m0
    mova      m0, m1                       ; slide the row window down by one
    mova      m1, m2
    mova      m2, m3
    mova      m3, m4
    mova      m4, m7
    punpcklbw m7, m3                       ; interleave rows +3 and +2
    pmaddubsw m7, m5
    paddw     m6, m7
    pmulhrsw  m6, [pw_512]
    STORE     m6, m7, %1

    ; go to next line
    add       dstq, dststrideq
    add       srcq, srcstrideq
    dec       heightd                      ; next row
    jg        .nextrow
    REP_RET

cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
    lea  picregq, [sixtap_filter_hb_m]
%endif
    mova      m3, [filter_h6_shuf2]
    mova      m4, [filter_h6_shuf3]
    LOAD      mx, sixtap_filter_hb         ; mxq = address of coefficient set
    mova      m5, [mxq]                    ; set up 6tap filter in bytes
    mova      m6, [mxq+16]
    mova      m7, [filter_h6_shuf1]

.nextrow:
    movu      m0, [srcq-2]                 ; unaligned 16-byte load from x-2
    mova      m1, m0
    mova      m2, m0
    pshufb    m0, m7                       ; (x-2, x-1) pairs
    pshufb    m1, m3                       ; (x,   x+1) pairs
    pshufb    m2, m4                       ; (x+3, x+2) pairs
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m5
    paddw     m0, m1
    paddw     m0, m2
    pmulhrsw  m0, [pw_512]
    STORE     m0, m1, %1

    ; go to next line
    add       dstq, dststrideq
    add       srcq, srcstrideq
    dec       heightd                      ; next row
    jg        .nextrow
    REP_RET
%endmacro

INIT_XMM ssse3
FILTER_SSSE3  put
FILTER_SSSE3  avg

; RV40_WCORE mode, dst, src1, src2 [, stride]
; Weights and stores one step of the weighted prediction.
;   %1 = 0 selects the 14-bit weight path, anything else the 5-bit path
;   %2 = dst  %3 = src1  %4 = src2
;   %5 = optional; only its presence is tested (the value is never expanded,
;        r5 is used directly). When given, the second half of the result
;        comes from the next row (r6 + r5) instead of r6 + mmsize/2, and
;        r6 is left advanced by one stride.
; Expects: m0 = 0, m1 = rounding constant, m2/m3 = weights as prepared by
;          RV40_WEIGHT, r5 = stride, r6 = current (negative) offset.
; src1 is multiplied by the weight in m3 and src2 by the weight in m2
; (both are packed into m3 on the SSSE3 5-bit path).
; Clobbers m4-m7.
%macro RV40_WCORE  4-5
    movh       m4, [%3 + r6 + 0]
    movh       m5, [%4 + r6 + 0]
%if %0 == 4
%define OFFSET r6 + mmsize / 2
%else
    ; 8x8 block and sse2, stride was provided
%define OFFSET r6
    add        r6, r5
%endif
    movh       m6, [%3 + OFFSET]
    movh       m7, [%4 + OFFSET]

%if %1 == 0
    ; 14bits weights
    punpcklbw  m4, m0
    punpcklbw  m5, m0
    punpcklbw  m6, m0
    punpcklbw  m7, m0

    ; (pixel << 7) * weight >> 16 == pixel * weight >> 9
    psllw      m4, 7
    psllw      m5, 7
    psllw      m6, 7
    psllw      m7, 7
    pmulhw     m4, m3
    pmulhw     m5, m2
    pmulhw     m6, m3
    pmulhw     m7, m2

    paddw      m4, m5
    paddw      m6, m7
%else
    ; 5bits weights
%if cpuflag(ssse3)
    ; interleave src1/src2 bytes and multiply-add against the packed weights
    punpcklbw  m4, m5
    punpcklbw  m6, m7

    pmaddubsw  m4, m3
    pmaddubsw  m6, m3
%else
    punpcklbw  m4, m0
    punpcklbw  m5, m0
    punpcklbw  m6, m0
    punpcklbw  m7, m0

    pmullw     m4, m3
    pmullw     m5, m2
    pmullw     m6, m3
    pmullw     m7, m2
    paddw      m4, m5
    paddw      m6, m7
%endif

%endif

    ; bias and shift down: (x + 16) >> 5 on both paths
%if cpuflag(ssse3)
    pmulhrsw   m4, m1
    pmulhrsw   m6, m1
%else
    paddw      m4, m1
    paddw      m6, m1
    psrlw      m4, 5
    psrlw      m6, 5
%endif

    packuswb   m4, m6
%if %0 == 5
    ; Only called for 8x8 blocks and sse2
    sub        r6, r5
    movh       [%2 + r6], m4
    add        r6, r5
    movhps     [%2 + r6], m4
%else
    mova       [%2 + r6], m4
%endif
%endmacro


; MAIN_LOOP size, mode
; One iteration of the weighting loop.
;   %1 = block size (8 or 16)
;   %2 = mode flag passed through as the first argument of RV40_WCORE
; The last instruction is always "add r6, r5"; the caller branches on the
; flags it sets.
%macro MAIN_LOOP   2
%if mmsize == 8
    RV40_WCORE %2, r0, r1, r2
%if %1 == 16
    RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8
%endif

    ; Prepare for next loop
    add        r6, r5
%else
%ifidn %1, 8
    RV40_WCORE %2, r0, r1, r2, r5
    ; Prepare 2 next lines
    add        r6, r5
%else
    RV40_WCORE %2, r0, r1, r2
    ; Prepare single next line
    add        r6, r5
%endif
%endif

%endmacro

; void ff_rv40_weight_func_[rnd|nornd]_[8|16]_<opt>(uint8_t *dst, uint8_t *src1,
;                                                   uint8_t *src2, int w1, int w2,
;                                                   int stride)
; %1=rnd|nornd  %2=block size (8|16)  %3=shift count applied to the stride
;                                        (3 for size 8, 4 for size 16)
; The weights are FP0.14 notation of fractions depending on pts.
; For timebases without rounding error (i.e. PAL), the fractions
; can be simplified, and several operations can be avoided.
; No check for that is performed in this file: the rnd variant always takes
; the 14-bit weight path of RV40_WCORE and the nornd variant always takes the
; 5-bit weight path, so selecting the appropriate entry point is left to the
; caller.
%macro RV40_WEIGHT  3
cglobal rv40_weight_func_%1_%2, 6, 7, 8
%if cpuflag(ssse3)
    mova       m1, [pw_1024]
%else
    mova       m1, [pw_16]
%endif
    pxor       m0, m0
    ; Set loop counter and increments: advance the three pointers by
    ; stride << shift and index them with a negative offset that counts up
    ; to zero.
    mov        r6, r5
    shl        r6, %3
    add        r0, r6
    add        r1, r6
    add        r2, r6
    neg        r6

    movd       m2, r3d                     ; w1
    movd       m3, r4d                     ; w2
%ifidn %1,rnd
%define  RND   0
    SPLATW     m2, m2
%else
%define  RND   1
%if cpuflag(ssse3)
    punpcklbw  m3, m2                      ; pack w2 (low byte) and w1 (high byte) per word
%else
    SPLATW     m2, m2
%endif
%endif
    SPLATW     m3, m3

.loop:
    MAIN_LOOP  %2, RND
    jnz        .loop                       ; flags come from the final add r6, r5
    REP_RET
%endmacro

INIT_MMX mmxext
RV40_WEIGHT   rnd,    8, 3
RV40_WEIGHT   rnd,   16, 4
RV40_WEIGHT   nornd,  8, 3
RV40_WEIGHT   nornd, 16, 4

INIT_XMM sse2
RV40_WEIGHT   rnd,    8, 3
RV40_WEIGHT   rnd,   16, 4
RV40_WEIGHT   nornd,  8, 3
RV40_WEIGHT   nornd, 16, 4

INIT_XMM ssse3
RV40_WEIGHT   rnd,    8, 3
RV40_WEIGHT   rnd,   16, 4
RV40_WEIGHT   nornd,  8, 3
RV40_WEIGHT   nornd, 16, 4