;******************************************************************************
;* VP9 MC SIMD optimizations
;*
;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; FIXME share with vp8dsp.asm
pw_256: times 8 dw 256

%macro F8_TAPS 8
times 8 db %1, %2
times 8 db %3, %4
times 8 db %5, %6
times 8 db %7, %8
%endmacro
; int8_t ff_filters_ssse3[3][15][4][16]
const filters_ssse3 ; smooth
                    F8_TAPS -3, -1,  32,  64,  38,   1, -3,  0
                    F8_TAPS -2, -2,  29,  63,  41,   2, -3,  0
                    F8_TAPS -2, -2,  26,  63,  43,   4, -4,  0
                    F8_TAPS -2, -3,  24,  62,  46,   5, -4,  0
                    F8_TAPS -2, -3,  21,  60,  49,   7, -4,  0
                    F8_TAPS -1, -4,  18,  59,  51,   9, -4,  0
                    F8_TAPS -1, -4,  16,  57,  53,  12, -4, -1
                    F8_TAPS -1, -4,  14,  55,  55,  14, -4, -1
                    F8_TAPS -1, -4,  12,  53,  57,  16, -4, -1
                    F8_TAPS  0, -4,   9,  51,  59,  18, -4, -1
                    F8_TAPS  0, -4,   7,  49,  60,  21, -3, -2
                    F8_TAPS  0, -4,   5,  46,  62,  24, -3, -2
                    F8_TAPS  0, -4,   4,  43,  63,  26, -2, -2
                    F8_TAPS  0, -3,   2,  41,  63,  29, -2, -2
                    F8_TAPS  0, -3,   1,  38,  64,  32, -1, -3
                    ; regular
                    F8_TAPS  0,  1,  -5, 126,   8,  -3,  1,  0
                    F8_TAPS -1,  3, -10, 122,  18,  -6,  2,  0
                    F8_TAPS -1,  4, -13, 118,  27,  -9,  3, -1
                    F8_TAPS -1,  4, -16, 112,  37, -11,  4, -1
                    F8_TAPS -1,  5, -18, 105,  48, -14,  4, -1
                    F8_TAPS -1,  5, -19,  97,  58, -16,  5, -1
                    F8_TAPS -1,  6, -19,  88,  68, -18,  5, -1
                    F8_TAPS -1,  6, -19,  78,  78, -19,  6, -1
                    F8_TAPS -1,  5, -18,  68,  88, -19,  6, -1
                    F8_TAPS -1,  5, -16,  58,  97, -19,  5, -1
                    F8_TAPS -1,  4, -14,  48, 105, -18,  5, -1
                    F8_TAPS -1,  4, -11,  37, 112, -16,  4, -1
                    F8_TAPS -1,  3,  -9,  27, 118, -13,  4, -1
                    F8_TAPS  0,  2,  -6,  18, 122, -10,  3, -1
                    F8_TAPS  0,  1,  -3,   8, 126,  -5,  1,  0
                    ; sharp
                    F8_TAPS -1,  3,  -7, 127,   8,  -3,  1,  0
                    F8_TAPS -2,  5, -13, 125,  17,  -6,  3, -1
                    F8_TAPS -3,  7, -17, 121,  27, -10,  5, -2
                    F8_TAPS -4,  9, -20, 115,  37, -13,  6, -2
                    F8_TAPS -4, 10, -23, 108,  48, -16,  8, -3
                    F8_TAPS -4, 10, -24, 100,  59, -19,  9, -3
                    F8_TAPS -4, 11, -24,  90,  70, -21, 10, -4
                    F8_TAPS -4, 11, -23,  80,  80, -23, 11, -4
                    F8_TAPS -4, 10, -21,  70,  90, -24, 11, -4
                    F8_TAPS -3,  9, -19,  59, 100, -24, 10, -4
                    F8_TAPS -3,  8, -16,  48, 108, -23, 10, -4
                    F8_TAPS -2,  6, -13,  37, 115, -20,  9, -4
                    F8_TAPS -2,  5, -10,  27, 121, -17,  7, -3
                    F8_TAPS -1,  3,  -6,  17, 125, -13,  5, -2
                    F8_TAPS  0,  1,  -3,   8, 127,  -7,  3, -1

SECTION .text
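
; Each F8_TAPS row stores the 8 taps as 4 interleaved pairs, each pair
; repeated across all byte lanes, so one pmaddubsw multiplies two adjacent
; (unsigned) source pixels by their two (signed) taps and accumulates into
; words. All kernels sum to 128; the final pmulhrsw against pw_256 computes
; (x * 256 + 0x4000) >> 15 == (x + 64) >> 7, i.e. VP9's FILTER_BITS == 7
; rounding. Per output pixel this amounts to (illustrative C sketch, not
; part of this file):
;   dst[x] = av_clip_uint8((sum of filter[i] * src[x - 3 + i], i = 0..7,
;                           plus 64) >> 7);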
%macro filter_h_fn 1
%assign %%px mmsize/2
cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 11, dst, dstride, src, sstride, h, filtery
    mova        m6, [pw_256]
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+16]
    mova        m9, [filteryq+32]
    mova       m10, [filteryq+48]
%endif
.loop:
    movh        m0, [srcq-3]
    movh        m1, [srcq-2]
    movh        m2, [srcq-1]
    movh        m3, [srcq+0]
    movh        m4, [srcq+1]
    movh        m5, [srcq+2]
    punpcklbw   m0, m1
    punpcklbw   m2, m3
    movh        m1, [srcq+3]
    movh        m3, [srcq+4]
    add       srcq, sstrideq
    punpcklbw   m4, m5
    punpcklbw   m1, m3
    pmaddubsw   m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddubsw   m2, m8
    pmaddubsw   m4, m9
    pmaddubsw   m1, m10
%else
    pmaddubsw   m2, [filteryq+16]
    pmaddubsw   m4, [filteryq+32]
    pmaddubsw   m1, [filteryq+48]
%endif
    paddw       m0, m2
    paddw       m4, m1
    paddsw      m0, m4
    pmulhrsw    m0, m6
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    packuswb    m0, m0
%ifidn %1, avg
    pavgb       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

INIT_MMX ssse3
filter_h_fn put
filter_h_fn avg

INIT_XMM ssse3
filter_h_fn put
filter_h_fn avg

%if ARCH_X86_64
%macro filter_hx2_fn 1
%assign %%px mmsize
cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 14, dst, dstride, src, sstride, h, filtery
    mova       m13, [pw_256]
    mova        m8, [filteryq+ 0]
    mova        m9, [filteryq+16]
    mova       m10, [filteryq+32]
    mova       m11, [filteryq+48]
.loop:
    movu        m0, [srcq-3]
    movu        m1, [srcq-2]
    movu        m2, [srcq-1]
    movu        m3, [srcq+0]
    movu        m4, [srcq+1]
    movu        m5, [srcq+2]
    movu        m6, [srcq+3]
    movu        m7, [srcq+4]
    add       srcq, sstrideq
    SBUTTERFLY bw, 0, 1, 12
    SBUTTERFLY bw, 2, 3, 12
    SBUTTERFLY bw, 4, 5, 12
    SBUTTERFLY bw, 6, 7, 12
    pmaddubsw   m0, m8
    pmaddubsw   m1, m8
    pmaddubsw   m2, m9
    pmaddubsw   m3, m9
    pmaddubsw   m4, m10
    pmaddubsw   m5, m10
    pmaddubsw   m6, m11
    pmaddubsw   m7, m11
    paddw       m0, m2
    paddw       m1, m3
    paddw       m4, m6
    paddw       m5, m7
    paddsw      m0, m4
    paddsw      m1, m5
    pmulhrsw    m0, m13
    pmulhrsw    m1, m13
    packuswb    m0, m1
%ifidn %1, avg
    pavgb       m0, [dstq]
%endif
    mova    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

INIT_XMM ssse3
filter_hx2_fn put
filter_hx2_fn avg

%endif ; ARCH_X86_64
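
; The vertical filters below read eight source rows per output row. Two row
; cursors keep every tap within single-register addressing: after the
; prologue, srcq covers rows -3..0 (via sstrideq, sstrideq*2 and sstride3q)
; and src4q covers rows +1..+4; both advance one row per iteration.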
%macro filter_v_fn 1
%assign %%px mmsize/2
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%define hd r4mp
%endif
    mova        m6, [pw_256]
    lea  sstride3q, [sstrideq*3]
    lea      src4q, [srcq+sstrideq]
    sub       srcq, sstride3q
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+16]
    mova        m9, [filteryq+32]
    mova       m10, [filteryq+48]
%endif
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
    movh        m0, [srcq]
    movh        m1, [srcq+sstrideq]
    movh        m2, [srcq+sstrideq*2]
    movh        m3, [srcq+sstride3q]
    movh        m4, [src4q]
    movh        m5, [src4q+sstrideq]
    punpcklbw   m0, m1
    punpcklbw   m2, m3
    movh        m1, [src4q+sstrideq*2]
    movh        m3, [src4q+sstride3q]
    add       srcq, sstrideq
    add      src4q, sstrideq
    punpcklbw   m4, m5
    punpcklbw   m1, m3
    pmaddubsw   m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddubsw   m2, m8
    pmaddubsw   m4, m9
    pmaddubsw   m1, m10
%else
    pmaddubsw   m2, [filteryq+16]
    pmaddubsw   m4, [filteryq+32]
    pmaddubsw   m1, [filteryq+48]
%endif
    paddw       m0, m2
    paddw       m4, m1
    paddsw      m0, m4
    pmulhrsw    m0, m6
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    packuswb    m0, m0
%ifidn %1, avg
    pavgb       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

INIT_MMX ssse3
filter_v_fn put
filter_v_fn avg

INIT_XMM ssse3
filter_v_fn put
filter_v_fn avg

%if ARCH_X86_64

%macro filter_vx2_fn 1
%assign %%px mmsize
cglobal vp9_%1_8tap_1d_v_ %+ %%px, 6, 8, 14, dst, dstride, src, sstride, h, filtery, src4, sstride3
    mova       m13, [pw_256]
    lea  sstride3q, [sstrideq*3]
    lea      src4q, [srcq+sstrideq]
    sub       srcq, sstride3q
    mova        m8, [filteryq+ 0]
    mova        m9, [filteryq+16]
    mova       m10, [filteryq+32]
    mova       m11, [filteryq+48]
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
    movu        m0, [srcq]
    movu        m1, [srcq+sstrideq]
    movu        m2, [srcq+sstrideq*2]
    movu        m3, [srcq+sstride3q]
    movu        m4, [src4q]
    movu        m5, [src4q+sstrideq]
    movu        m6, [src4q+sstrideq*2]
    movu        m7, [src4q+sstride3q]
    add       srcq, sstrideq
    add      src4q, sstrideq
    SBUTTERFLY bw, 0, 1, 12
    SBUTTERFLY bw, 2, 3, 12
    SBUTTERFLY bw, 4, 5, 12
    SBUTTERFLY bw, 6, 7, 12
    pmaddubsw   m0, m8
    pmaddubsw   m1, m8
    pmaddubsw   m2, m9
    pmaddubsw   m3, m9
    pmaddubsw   m4, m10
    pmaddubsw   m5, m10
    pmaddubsw   m6, m11
    pmaddubsw   m7, m11
    paddw       m0, m2
    paddw       m1, m3
    paddw       m4, m6
    paddw       m5, m7
    paddsw      m0, m4
    paddsw      m1, m5
    pmulhrsw    m0, m13
    pmulhrsw    m1, m13
    packuswb    m0, m1
%ifidn %1, avg
    pavgb       m0, [dstq]
%endif
    mova    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

INIT_XMM ssse3
filter_vx2_fn put
filter_vx2_fn avg

%endif ; ARCH_X86_64
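
; Fullpel copy/average. %2 is the block width: the loop moves four registers
; per iteration, so %3-%5 select the three extra load offsets (row strides
; for widths up to 16; for 32/64-wide blocks, 16-byte column offsets within
; a row, possibly combined with a row stride) and %6 is how many rows each
; iteration consumes. The s/d prefixes paste onto those arguments to pick the
; source or destination register (sstrideq/dstrideq, sstride3q/dstride3q);
; the s16/d16 defines below make the literal 16-byte offsets resolve the same
; way (mmsize expands to 16 under INIT_XMM at the invocation line).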
%macro fpel_fn 6
%if %2 == 4
%define %%srcfn movh
%define %%dstfn movh
%else
%define %%srcfn movu
%define %%dstfn mova
%endif

%if %2 <= 16
cglobal vp9_%1%2, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
    lea  sstride3q, [sstrideq*3]
    lea  dstride3q, [dstrideq*3]
%else
cglobal vp9_%1%2, 5, 5, 4, dst, dstride, src, sstride, h
%endif
.loop:
    %%srcfn     m0, [srcq]
    %%srcfn     m1, [srcq+s%3]
    %%srcfn     m2, [srcq+s%4]
    %%srcfn     m3, [srcq+s%5]
    lea       srcq, [srcq+sstrideq*%6]
%ifidn %1, avg
    pavgb       m0, [dstq]
    pavgb       m1, [dstq+d%3]
    pavgb       m2, [dstq+d%4]
    pavgb       m3, [dstq+d%5]
%endif
    %%dstfn [dstq], m0
    %%dstfn [dstq+d%3], m1
    %%dstfn [dstq+d%4], m2
    %%dstfn [dstq+d%5], m3
    lea       dstq, [dstq+dstrideq*%6]
    sub         hd, %6
    jnz .loop
    RET
%endmacro

%define d16 16
%define s16 16
INIT_MMX mmx
fpel_fn put, 4,  strideq, strideq*2, stride3q, 4
fpel_fn put, 8,  strideq, strideq*2, stride3q, 4
INIT_MMX mmxext
fpel_fn avg, 4,  strideq, strideq*2, stride3q, 4
fpel_fn avg, 8,  strideq, strideq*2, stride3q, 4
INIT_XMM sse
fpel_fn put, 16, strideq, strideq*2, stride3q, 4
fpel_fn put, 32, mmsize,  strideq,   strideq+mmsize, 2
fpel_fn put, 64, mmsize,  mmsize*2,  mmsize*3, 1
INIT_XMM sse2
fpel_fn avg, 16, strideq, strideq*2, stride3q, 4
fpel_fn avg, 32, mmsize,  strideq,   strideq+mmsize, 2
fpel_fn avg, 64, mmsize,  mmsize*2,  mmsize*3, 1
%undef s16
%undef d16