;*****************************************************************************
;* x86-optimized functions for yadif filter
;*
;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
;* Copyright (c) 2011-2013 James Darnley <james.darnley@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Broadcast constants.
;   pw_1             - low-bit mask used by CHECK to turn pavgw into a floor average
;   pw_8000, pd_8000 - bias pair used by the pre-SSE4 path of PACK
;   pd_1             - used by CHECK2 and for the "- 1" on the initial score in FILTER
pw_1:    times 8 dw 1
pw_8000: times 8 dw 0x8000
pd_1:    times 4 dd 1
pd_8000: times 4 dd 0x8000

SECTION .text

;------------------------------------------------------------------------------
; PABS value, scratch
;   value = |value| on packed signed dwords.
;   Before SSSE3 this is the mask form (x ^ m) - m with m = (0 > x); scratch is
;   overwritten with that mask. With SSSE3 scratch is left untouched.
;------------------------------------------------------------------------------
%macro PABS 2
%if cpuflag(ssse3)
    pabsd     %1, %1
%else
    pxor      %2, %2
    pcmpgtd   %2, %1
    pxor      %1, %2
    psubd     %1, %2
%endif
%endmacro

;------------------------------------------------------------------------------
; PACK value
;   Narrows the dwords of value to words (value is used as both pack inputs).
;   SSE4 uses the unsigned-saturating pack directly. Older sets subtract
;   0x8000, apply the signed-saturating pack and add 0x8000 back to the words.
;------------------------------------------------------------------------------
%macro PACK 1
%if cpuflag(sse4)
    packusdw  %1, %1
%else
    psubd     %1, [pd_8000]
    packssdw  %1, %1
    paddw     %1, [pw_8000]
%endif
%endmacro

;------------------------------------------------------------------------------
; PMINSD dst, src, scratch
;   dst = min(dst, src) on packed signed dwords.
;   The pre-SSE4 path builds the mask (src > dst) in scratch and blends with
;   pand/pandn/por, so scratch is clobbered there.
;------------------------------------------------------------------------------
%macro PMINSD 3
%if cpuflag(sse4)
    pminsd    %1, %2
%else
    mova      %3, %2
    pcmpgtd   %3, %1
    pand      %1, %3
    pandn     %3, %2
    por       %1, %3
%endif
%endmacro

;------------------------------------------------------------------------------
; PMAXSD dst, src, scratch
;   dst = max(dst, src) on packed signed dwords.
;   The pre-SSE4 path builds the mask (dst > src) in scratch and blends with
;   pand/pandn/por, so scratch is clobbered there.
;------------------------------------------------------------------------------
%macro PMAXSD 3
%if cpuflag(sse4)
    pmaxsd    %1, %2
%else
    mova      %3, %1
    pcmpgtd   %3, %2
    pand      %1, %3
    pandn     %3, %2
    por       %1, %3
%endif
%endmacro

;------------------------------------------------------------------------------
; PMAXUW dst, src
;   dst = max(dst, src) on packed unsigned words.
;   Pre-SSE4: a saturating subtract followed by a saturating add of the same
;   operand yields the larger of the two.
;------------------------------------------------------------------------------
%macro PMAXUW 2
%if cpuflag(sse4)
    pmaxuw    %1, %2
%else
    psubusw   %1, %2
    paddusw   %1, %2
%endif
%endmacro

;------------------------------------------------------------------------------
; CHECK off_t1, off_t0
;   Evaluates one candidate pair of rows: [curq+t1+off_t1*2] against
;   [curq+t0+off_t0*2] (offsets are in 16-bit samples, hence the *2).
;
;   In:    m7 = 0 (used to zero-extend words to dwords)
;   Out:   m5 = candidate prediction, dwords: floor((a + b) / 2) taken from the
;               register after RSHIFT by 2
;          m2 = candidate score, dwords: sum of |a - b| from the unshifted
;               register and from the copies RSHIFTed by 2 and by 4
;   Clobb: m3, m4
;
;   NOTE(review): RSHIFT comes from the included x86util.asm and is not visible
;   here; these comments assume it shifts the whole register right by the given
;   number of bytes, so "2" and "4" step one and two samples - confirm there.
;------------------------------------------------------------------------------
%macro CHECK 2
    movu      m2, [curq+t1+%1*2]
    movu      m3, [curq+t0+%2*2]
    mova      m4, m2
    mova      m5, m2
    ; pavgw rounds up; subtracting (a ^ b) & 1 turns it into a floor average
    pxor      m4, m3
    pavgw     m5, m3
    pand      m4, [pw_1]
    psubusw   m5, m4
    RSHIFT    m5, 2
    punpcklwd m5, m7
    ; |a - b| on unsigned words: one of the two saturating differences is zero
    mova      m4, m2
    psubusw   m2, m3
    psubusw   m3, m4
    PMAXUW    m2, m3
    ; add up the absolute differences of three neighbouring positions
    mova      m3, m2
    mova      m4, m2
    RSHIFT    m3, 2
    RSHIFT    m4, 4
    punpcklwd m2, m7
    punpcklwd m3, m7
    punpcklwd m4, m7
    paddd     m2, m3
    paddd     m2, m4
%endmacro

;------------------------------------------------------------------------------
; SELECT_PRED
;   Per-lane blend shared by CHECK1 and CHECK2: m1 = m3 ? m5 : m1, where m3 is
;   an all-ones / all-zeros dword mask.
;   Clobb: m3, m5
;------------------------------------------------------------------------------
%macro SELECT_PRED 0
    pand      m5, m3
    pandn     m3, m1
    por       m3, m5
    mova      m1, m3
%endmacro

;------------------------------------------------------------------------------
; CHECK1
;   Accepts the candidate left by CHECK in every lane where its score (m2) is
;   lower than the best score so far (m0).
;
;   In:    m0 = best score, m1 = best prediction, m2/m5 = candidate score/pred
;   Out:   m0 = min(m0, m2), m1 updated, m6 = mask of lanes that improved
;          (consumed by the CHECK2 that follows)
;   Clobb: m3, m5
;------------------------------------------------------------------------------
%macro CHECK1 0
    mova      m3, m0
    pcmpgtd   m3, m2
    PMINSD    m0, m2, m6
    mova      m6, m3
    SELECT_PRED
%endmacro

;------------------------------------------------------------------------------
; CHECK2
;   Same as CHECK1, but the candidate may only win in lanes where the preceding
;   CHECK1 improved the score. m6 holds that mask (-1 or 0), so (m6 + 1) << 30
;   is 0 for improved lanes and 1 << 30 otherwise; it is added to the candidate
;   score before the comparison.
;   NOTE(review): this presumably relies on real scores staying far below
;   1 << 30 so that penalised lanes never win - confirm against the input range.
;
;   In:    m0, m1, m2, m5 as for CHECK1; m6 = mask from CHECK1
;   Out:   m0, m1 updated
;   Clobb: m2, m3, m4, m5, m6
;------------------------------------------------------------------------------
%macro CHECK2 0
    paddd     m6, [pd_1]
    pslld     m6, 30
    paddd     m2, m6
    mova      m3, m0
    pcmpgtd   m3, m2
    PMINSD    m0, m2, m4
    SELECT_PRED
%endmacro

; This version of CHECK2 has 3 fewer instructions on sets older than SSE4 but I
; am not sure whether it is any faster.  A rewrite or refactor of the filter
; code should make it possible to eliminate the move instruction at the end.  It
; exists to satisfy the expectation that the "score" values are in m1.

; %macro CHECK2 0
;     mova    m3, m0
;     pcmpgtd m0, m2
;     pand    m0, m6
;     mova    m6, m0
;     pand    m5, m6
;     pand    m2, m0
;     pandn   m6, m1
;     pandn   m0, m3
;     por     m6, m5
;     por     m0, m2
;     mova    m1, m6
; %endmacro

;------------------------------------------------------------------------------
; LOAD dst, mem
;   Loads half a register of 16-bit samples (movh) and zero-extends them to
;   dwords. Requires m7 = 0.
;------------------------------------------------------------------------------
%macro LOAD 2
    movh      %1, %2
    punpcklwd %1, m7
%endmacro

;------------------------------------------------------------------------------
; FILTER suffix, ptr_a, ptr_b
;   Loop body of the line filter. suffix makes the local labels unique per
;   expansion; ptr_a / ptr_b are the two pointers whose samples are averaged
;   for the temporal reference. t1 and t0 hold the mrefs and prefs offsets
;   respectively (see YADIF).
;
;   Names used in the comments (all dword lanes):
;     c = cur[t1], e = cur[t0], d = (ptr_a[0] + ptr_b[0]) >> 1
;     b = (ptr_a[2*t1] + ptr_b[2*t1]) >> 1, f = the same with t0
;
;   Stack slots: [rsp+0] = c, [rsp+16] = d, [rsp+32] = e, [rsp+48] = diff
;
;   Each iteration stores mmsize/2 bytes to dstq, advances the four pointers by
;   the same amount and subtracts mmsize/4 from the width argument (r4m),
;   looping while the result is > 0.
;------------------------------------------------------------------------------
%macro FILTER 3
.loop%1:
    pxor         m7, m7
    LOAD         m0, [curq+t1]
    LOAD         m1, [curq+t0]
    LOAD         m2, [%2]
    LOAD         m3, [%3]
    mova         m4, m3
    paddd        m3, m2
    psrad        m3, 1
    mova   [rsp+ 0], m0
    mova   [rsp+16], m3
    mova   [rsp+32], m1

    ; diff = max(|a - b| >> 1,
    ;            (|prev[t1] - c| + |prev[t0] - e|) >> 1,
    ;            (|next[t1] - c| + |next[t0] - e|) >> 1)
    psubd        m2, m4
    PABS         m2, m4
    LOAD         m3, [prevq+t1]
    LOAD         m4, [prevq+t0]
    psubd        m3, m0
    psubd        m4, m1
    PABS         m3, m5
    PABS         m4, m5
    paddd        m3, m4
    psrld        m2, 1
    psrld        m3, 1
    PMAXSD       m2, m3, m6
    LOAD         m3, [nextq+t1]
    LOAD         m4, [nextq+t0]
    psubd        m3, m0
    psubd        m4, m1
    PABS         m3, m5
    PABS         m4, m5
    paddd        m3, m4
    psrld        m3, 1
    PMAXSD       m2, m3, m6
    mova   [rsp+48], m2

    ; m1 = (c + e) >> 1 (initial prediction), m0 = |c - e|
    paddd        m1, m0
    paddd        m0, m0
    psubd        m0, m1
    psrld        m1, 1
    PABS         m0, m2

    ; initial score: m0 += |cur[t1-1] - cur[t0-1]| and the same difference
    ; taken from the register after RSHIFT by 4, then subtract 1
    movu         m2, [curq+t1-1*2]
    movu         m3, [curq+t0-1*2]
    mova         m4, m2
    psubusw      m2, m3
    psubusw      m3, m4
    PMAXUW       m2, m3
    mova         m3, m2
    RSHIFT       m3, 4
    punpcklwd    m2, m7
    punpcklwd    m3, m7
    paddd        m0, m2
    paddd        m0, m3
    psubd        m0, [pd_1]

    ; try the four offset pairs; each CHECK2 candidate can only win where the
    ; CHECK1 just before it did
    CHECK -2, 0
    CHECK1
    CHECK -3, 1
    CHECK2
    CHECK 0, -2
    CHECK1
    CHECK 1, -3
    CHECK2

    ; reload diff; the widening below is skipped when the 9th argument
    ; (mode, r8m) is >= 2
    mova         m6, [rsp+48]
    cmp   DWORD r8m, 2
    jge .end%1
    LOAD         m2, [%2+t1*2]
    LOAD         m4, [%3+t1*2]
    LOAD         m3, [%2+t0*2]
    LOAD         m5, [%3+t0*2]
    paddd        m2, m4
    paddd        m3, m5
    psrld        m2, 1
    psrld        m3, 1
    mova         m4, [rsp+ 0]
    mova         m5, [rsp+16]
    mova         m7, [rsp+32]
    ; m2 = b - c, m3 = f - e, m5 = d - c, m0 = d - e
    psubd        m2, m4
    psubd        m3, m7
    mova         m0, m5
    psubd        m5, m4
    psubd        m0, m7
    mova         m4, m2
    ; m2 = max(d - e, d - c, min(b - c, f - e))
    ; m3 = min(d - e, d - c, max(b - c, f - e))
    ; m7 no longer holds zero from here on; it serves as min/max scratch
    PMINSD       m2, m3, m7
    PMAXSD       m3, m4, m7
    PMAXSD       m2, m5, m7
    PMINSD       m3, m5, m7
    PMAXSD       m2, m0, m7
    PMINSD       m3, m0, m7
    ; diff = max(diff, m3, -m2)
    pxor         m4, m4
    PMAXSD       m6, m3, m7
    psubd        m4, m2
    PMAXSD       m6, m4, m7

.end%1:
    ; clamp the prediction to [d - diff, d + diff] and narrow to words
    mova         m2, [rsp+16]
    mova         m3, m2
    psubd        m2, m6
    paddd        m3, m6
    PMAXSD       m1, m2, m7
    PMINSD       m1, m3, m7
    PACK         m1

    movh     [dstq], m1
    add        dstq, mmsize/2
    add       prevq, mmsize/2
    add        curq, mmsize/2
    add       nextq, mmsize/2
    sub   DWORD r4m, mmsize/4
    jg .loop%1
%endmacro

;------------------------------------------------------------------------------
; YADIF
;   Emits yadif_filter_line_16bit for the instruction set currently selected by
;   INIT_XMM / INIT_MMX. Argument names, in order:
;     dst, prev, cur, next, w, prefs, mrefs, parity, mode
;   Only the first four are loaded into registers by cglobal; 80 bytes of stack
;   are requested for the FILTER spill slots.
;
;   t0 receives prefs (argument 5) and t1 receives mrefs (argument 6). On
;   x86-32 they are fetched with pointer-sized moves into r4/r5; on x86-64 they
;   are sign-extended from 32 bits into r5/r6.
;
;   The parity argument picks the field pair handed to FILTER:
;     parity != 0 -> (prev, cur), parity == 0 -> (cur, next)
;------------------------------------------------------------------------------
%macro YADIF 0
%if ARCH_X86_32
cglobal yadif_filter_line_16bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
                                              prefs, mrefs, parity, mode
    mov            r4, r5mp
    mov            r5, r6mp
    DECLARE_REG_TMP 4,5
%else
cglobal yadif_filter_line_16bit, 4, 7, 8, 80, dst, prev, cur, next, w, \
                                              prefs, mrefs, parity, mode
    movsxd         r5, DWORD r5m
    movsxd         r6, DWORD r6m
    DECLARE_REG_TMP 5,6
%endif

    cmp DWORD paritym, 0
    je .parity0
    FILTER 1, prevq, curq
    jmp .ret

.parity0:
    FILTER 0, curq, nextq

.ret:
    RET
%endmacro

; One copy of the function per supported instruction set; the MMX variant is
; only built for 32-bit targets.
INIT_XMM sse4
YADIF
INIT_XMM ssse3
YADIF
INIT_XMM sse2
YADIF
%if ARCH_X86_32
INIT_MMX mmxext
YADIF
%endif