1/*
2 * Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
3 *
4 * This file is part of Libav.
5 *
6 * Libav is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * Libav is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with Libav; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 */
20
/*
 * Register-width selection for this template.
 *
 * With COMPILE_TEMPLATE_SSE defined, the asm fragments below name the
 * %xmm registers and STEP is 8; otherwise they name the %mm registers and
 * STEP is 4. STEP is the number of bytes by which the filter loop advances
 * its pointers per iteration.
 *
 * All fragments are string literals in AT&T operand order (source first,
 * destination last) meant to be pasted into an __asm__ template.
 */
21#ifdef COMPILE_TEMPLATE_SSE
22#define MM "%%xmm"
23#define MOV  "movq"
24#define MOVQ "movdqa"
25#define MOVQU "movdqu"
26#define STEP 8
/* LOAD: read 8 bytes from mem into dst, then interleave them with register 7
 * (the filter zeroes register 7 first, so this widens bytes to 16-bit words). */
27#define LOAD(mem,dst) \
28            MOV"       "mem", "dst" \n\t"\
29            "punpcklbw "MM"7, "dst" \n\t"
/* PSRL1 / PSRL2: shift the whole register right by 1 / 2 bytes. */
30#define PSRL1(reg) "psrldq $1, "reg" \n\t"
31#define PSRL2(reg) "psrldq $2, "reg" \n\t"
/* PSHUF: note the parameter names are reversed relative to the data flow:
 * the register passed as `dst` is READ and the register passed as `src` is
 * WRITTEN. Result: src = dst shifted right by 2 bytes. */
32#define PSHUF(src,dst) "movdqa "dst", "src" \n\t"\
33                       "psrldq $2, "src"     \n\t"
34#else
35#define MM "%%mm"
36#define MOV  "movd"
37#define MOVQ "movq"
38#define MOVQU "movq"
39#define STEP 4
/* LOAD: read 4 bytes from mem into dst, then interleave them with register 7
 * (the filter zeroes register 7 first, so this widens bytes to 16-bit words). */
40#define LOAD(mem,dst) \
41            MOV"       "mem", "dst" \n\t"\
42            "punpcklbw "MM"7, "dst" \n\t"
/* PSRL1 / PSRL2: shift the 64-bit register right by 8 / 16 bits (1 / 2 bytes). */
43#define PSRL1(reg) "psrlq $8, "reg" \n\t"
44#define PSRL2(reg) "psrlq $16, "reg" \n\t"
/* PSHUF: as in the SSE variant, `dst` is read and `src` is written.
 * Immediate 9 selects source words 1,2,0,0, so the low two words of the
 * result are the source shifted right by 2 bytes. */
45#define PSHUF(src,dst) "pshufw $9, "dst", "src" \n\t"
46#endif
47
/*
 * PABS(tmp,dst): replace each signed 16-bit word of dst by its absolute value.
 *
 * With COMPILE_TEMPLATE_SSSE3 this is a single pabsw and tmp is not touched.
 * Otherwise tmp is clobbered: tmp = 0 - dst, then dst = max(dst, tmp)
 * (signed word maximum).
 */
48#ifdef COMPILE_TEMPLATE_SSSE3
49#define PABS(tmp,dst) \
50            "pabsw     "dst", "dst" \n\t"
51#else
52#define PABS(tmp,dst) \
53            "pxor     "tmp", "tmp" \n\t"\
54            "psubw    "dst", "tmp" \n\t"\
55            "pmaxsw   "tmp", "dst" \n\t"
56#endif
57
/*
 * CHECK(pj,mj): evaluate one candidate interpolation direction.
 *
 * pj and mj are byte displacements applied to (cur + mrefs) and
 * (cur + prefs) respectively; both locations are read with an unaligned
 * full-register load (MOVQU).
 *
 * Expects: register 7 == 0 (used by punpcklbw to widen bytes to words).
 * Produces:
 *   register 5 - per-pixel average of the two lines, truncated ((a+b)>>1),
 *                taken one byte after the loaded position and widened to
 *                words. pavgb rounds up; the pxor/pand/psubusb sequence
 *                removes the rounding bit (assumes pb_1 is a vector of
 *                byte 1s - it is defined outside this file, TODO confirm).
 *   register 2 - score: sum, as words, of the absolute byte differences
 *                between the two lines at byte offsets 0, 1 and 2 from the
 *                loaded position.
 * Clobbers: registers 3 and 4.
 */
58#define CHECK(pj,mj) \
59            MOVQU" "#pj"(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1+j] */\
60            MOVQU" "#mj"(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1-j] */\
61            MOVQ"      "MM"2, "MM"4 \n\t"\
62            MOVQ"      "MM"2, "MM"5 \n\t"\
63            "pxor      "MM"3, "MM"4 \n\t"\
64            "pavgb     "MM"3, "MM"5 \n\t"\
65            "pand     "MANGLE(pb_1)", "MM"4 \n\t"\
66            "psubusb   "MM"4, "MM"5 \n\t"\
67            PSRL1(MM"5")                 \
68            "punpcklbw "MM"7, "MM"5 \n\t" /* (cur[x-refs+j] + cur[x+refs-j])>>1 */\
69            MOVQ"      "MM"2, "MM"4 \n\t"\
70            "psubusb   "MM"3, "MM"2 \n\t"\
71            "psubusb   "MM"4, "MM"3 \n\t"\
72            "pmaxub    "MM"3, "MM"2 \n\t"\
73            MOVQ"      "MM"2, "MM"3 \n\t"\
74            MOVQ"      "MM"2, "MM"4 \n\t" /* ABS(cur[x-refs-1+j] - cur[x+refs-1-j]) */\
75            PSRL1(MM"3")                  /* ABS(cur[x-refs  +j] - cur[x+refs  -j]) */\
76            PSRL2(MM"4")                  /* ABS(cur[x-refs+1+j] - cur[x+refs+1-j]) */\
77            "punpcklbw "MM"7, "MM"2 \n\t"\
78            "punpcklbw "MM"7, "MM"3 \n\t"\
79            "punpcklbw "MM"7, "MM"4 \n\t"\
80            "paddw     "MM"3, "MM"2 \n\t"\
81            "paddw     "MM"4, "MM"2 \n\t" /* score */
82
/*
 * CHECK1: accept the candidate computed by CHECK where it scores better.
 *
 * Inputs:  register 0 = best score so far, register 1 = current prediction,
 *          register 2 = candidate score, register 5 = candidate prediction.
 * Per 16-bit lane, mask = (register 0 > register 2) as a signed compare.
 * Outputs: register 0 = min(register 0, register 2),
 *          register 1 = mask ? register 5 : register 1,
 *          register 6 = copy of mask (consumed by a following CHECK2).
 * Side effects: register 5 is overwritten with (register 5 & mask) and
 *               register 3 is clobbered.
 */
83#define CHECK1 \
84            MOVQ"      "MM"0, "MM"3 \n\t"\
85            "pcmpgtw   "MM"2, "MM"3 \n\t" /* if(score < spatial_score) */\
86            "pminsw    "MM"2, "MM"0 \n\t" /* spatial_score= score; */\
87            MOVQ"      "MM"3, "MM"6 \n\t"\
88            "pand      "MM"3, "MM"5 \n\t"\
89            "pandn     "MM"1, "MM"3 \n\t"\
90            "por       "MM"5, "MM"3 \n\t"\
91            MOVQ"      "MM"3, "MM"1 \n\t" /* spatial_pred= (cur[x-refs+j] + cur[x+refs-j])>>1; */
92
/*
 * CHECK2: like CHECK1, but only lets the candidate win in lanes where the
 * preceding CHECK1 accepted its candidate.
 *
 * Register 6 must hold the mask left by CHECK1 (all ones = accepted,
 * zero = rejected). It is turned into a penalty: (mask + pw_1) << 14, i.e.
 * 0 for accepted lanes and 0x4000 for rejected lanes (assumes pw_1 is a
 * vector of word 1s - it is defined outside this file, TODO confirm).
 * The penalty is added with signed saturation to the candidate score in
 * register 2 before the same compare/min/select sequence as CHECK1 runs.
 * Unlike CHECK1, the new mask is not saved; register 6 keeps the penalty.
 * Registers 3 and 5 are clobbered/modified as in CHECK1.
 */
93#define CHECK2 /* pretend not to have checked dir=2 if dir=1 was bad.\
94                  hurts both quality and speed, but matches the C version. */\
95            "paddw    "MANGLE(pw_1)", "MM"6 \n\t"\
96            "psllw     $14,   "MM"6 \n\t"\
97            "paddsw    "MM"6, "MM"2 \n\t"\
98            MOVQ"      "MM"0, "MM"3 \n\t"\
99            "pcmpgtw   "MM"2, "MM"3 \n\t"\
100            "pminsw    "MM"2, "MM"0 \n\t"\
101            "pand      "MM"3, "MM"5 \n\t"\
102            "pandn     "MM"1, "MM"3 \n\t"\
103            "por       "MM"5, "MM"3 \n\t"\
104            MOVQ"      "MM"3, "MM"1 \n\t"
105
/**
 * Filter one line, STEP bytes per loop iteration, using inline asm.
 *
 * For every group of STEP bytes the asm computes a temporal average d of
 * prev2/next2, a temporal difference bound `diff`, and a spatial prediction
 * chosen among five directions on the lines cur+mrefs and cur+prefs; the
 * stored result is the spatial prediction clipped to [d - diff, d + diff].
 *
 * @param dst    output; STEP bytes are stored per iteration
 * @param prev   input pointer, read at offsets mrefs/prefs (and as prev2 when
 *               parity is non-zero) - presumably the previous picture, TODO confirm
 * @param cur    input pointer, read at offsets mrefs/prefs with displacements
 *               from -3 to +1, and as prev2 or next2 depending on parity
 * @param next   input pointer, read at offsets mrefs/prefs (and as next2 when
 *               parity is zero) - presumably the next picture, TODO confirm
 * @param w      loop bound: iterations run for x = 0, STEP, 2*STEP, ... < w
 * @param prefs  byte offset added to the pointers (existing comments call the
 *               result "x+refs")
 * @param mrefs  byte offset added to the pointers (existing comments call the
 *               result "x-refs")
 * @param parity non-zero: prev2 = prev, next2 = cur;
 *               zero:     prev2 = cur,  next2 = next
 * @param mode   when mode < 2 the additional check against the lines at
 *               2*mrefs and 2*prefs of prev2/next2 is performed; otherwise
 *               it is skipped
 *
 * NOTE(review): when w is not a multiple of STEP the last iteration still
 * stores a full STEP bytes, and the MOVQU loads at displacements -3..+1 read
 * a full register; callers presumably provide padding - confirm.
 * NOTE(review): neither asm statement declares clobbers for the vector
 * registers it modifies, and the second statement relies on register 1
 * still holding the result of the first.
 */
106void RENAME(ff_yadif_filter_line)(uint8_t *dst,
107                                  uint8_t *prev, uint8_t *cur, uint8_t *next,
108                                  int w, int prefs, int mrefs, int parity, int mode)
109{
    /* 16-byte aligned spill slots written and re-read by the asm:
     * tmp0 = c, tmp1 = d, tmp2 = e, tmp3 = diff. */
110    DECLARE_ALIGNED(16, uint8_t, tmp0[16]);
111    DECLARE_ALIGNED(16, uint8_t, tmp1[16]);
112    DECLARE_ALIGNED(16, uint8_t, tmp2[16]);
113    DECLARE_ALIGNED(16, uint8_t, tmp3[16]);
114    int x;
115
/* FILTER expands to the complete per-line loop. It is instantiated twice
 * below with different string definitions of prev2/next2, which are pasted
 * into the asm operand names. */
116#define FILTER\
117    for(x=0; x<w; x+=STEP){\
118        __asm__ volatile(\
119            "pxor      "MM"7, "MM"7 \n\t" /* zero, used to widen bytes to words */\
120            LOAD("(%[cur],%[mrefs])", MM"0") /* c = cur[x-refs] */\
121            LOAD("(%[cur],%[prefs])", MM"1") /* e = cur[x+refs] */\
122            LOAD("(%["prev2"])", MM"2") /* prev2[x] */\
123            LOAD("(%["next2"])", MM"3") /* next2[x] */\
124            MOVQ"      "MM"3, "MM"4 \n\t"\
125            "paddw     "MM"2, "MM"3 \n\t"\
126            "psraw     $1,    "MM"3 \n\t" /* d = (prev2[x] + next2[x])>>1 */\
127            MOVQ"      "MM"0, %[tmp0] \n\t" /* c */\
128            MOVQ"      "MM"3, %[tmp1] \n\t" /* d */\
129            MOVQ"      "MM"1, %[tmp2] \n\t" /* e */\
130            "psubw     "MM"4, "MM"2 \n\t"\
131            PABS(      MM"4", MM"2") /* temporal_diff0 */\
132            LOAD("(%[prev],%[mrefs])", MM"3") /* prev[x-refs] */\
133            LOAD("(%[prev],%[prefs])", MM"4") /* prev[x+refs] */\
134            "psubw     "MM"0, "MM"3 \n\t"\
135            "psubw     "MM"1, "MM"4 \n\t"\
136            PABS(      MM"5", MM"3")\
137            PABS(      MM"5", MM"4")\
138            "paddw     "MM"4, "MM"3 \n\t" /* temporal_diff1 */\
139            "psrlw     $1,    "MM"2 \n\t"\
140            "psrlw     $1,    "MM"3 \n\t"\
141            "pmaxsw    "MM"3, "MM"2 \n\t"\
142            LOAD("(%[next],%[mrefs])", MM"3") /* next[x-refs] */\
143            LOAD("(%[next],%[prefs])", MM"4") /* next[x+refs] */\
144            "psubw     "MM"0, "MM"3 \n\t"\
145            "psubw     "MM"1, "MM"4 \n\t"\
146            PABS(      MM"5", MM"3")\
147            PABS(      MM"5", MM"4")\
148            "paddw     "MM"4, "MM"3 \n\t" /* temporal_diff2 */\
149            "psrlw     $1,    "MM"3 \n\t"\
150            "pmaxsw    "MM"3, "MM"2 \n\t"\
151            MOVQ"      "MM"2, %[tmp3] \n\t" /* diff */\
152\
153            "paddw     "MM"0, "MM"1 \n\t" /* c+e */\
154            "paddw     "MM"0, "MM"0 \n\t" /* 2*c */\
155            "psubw     "MM"1, "MM"0 \n\t" /* 2*c - (c+e) = c-e */\
156            "psrlw     $1,    "MM"1 \n\t" /* spatial_pred */\
157            PABS(      MM"2", MM"0")      /* ABS(c-e) */\
158\
159            MOVQU" -1(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1] */\
160            MOVQU" -1(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1] */\
161            MOVQ"      "MM"2, "MM"4 \n\t"\
162            "psubusb   "MM"3, "MM"2 \n\t"\
163            "psubusb   "MM"4, "MM"3 \n\t"\
164            "pmaxub    "MM"3, "MM"2 \n\t"\
165            PSHUF(MM"3", MM"2") \
166            "punpcklbw "MM"7, "MM"2 \n\t" /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\
167            "punpcklbw "MM"7, "MM"3 \n\t" /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\
168            "paddw     "MM"2, "MM"0 \n\t"\
169            "paddw     "MM"3, "MM"0 \n\t"\
170            "psubw    "MANGLE(pw_1)", "MM"0 \n\t" /* spatial_score */\
171\
172            CHECK(-2,0)\
173            CHECK1\
174            CHECK(-3,1)\
175            CHECK2\
176            CHECK(0,-2)\
177            CHECK1\
178            CHECK(1,-3)\
179            CHECK2\
180\
181            /* if(p->mode<2) ... */\
182            MOVQ"    %[tmp3], "MM"6 \n\t" /* diff */\
183            "cmpl      $2, %[mode] \n\t"\
184            "jge       1f \n\t"\
185            LOAD("(%["prev2"],%[mrefs],2)", MM"2") /* prev2[x-2*refs] */\
186            LOAD("(%["next2"],%[mrefs],2)", MM"4") /* next2[x-2*refs] */\
187            LOAD("(%["prev2"],%[prefs],2)", MM"3") /* prev2[x+2*refs] */\
188            LOAD("(%["next2"],%[prefs],2)", MM"5") /* next2[x+2*refs] */\
189            "paddw     "MM"4, "MM"2 \n\t"\
190            "paddw     "MM"5, "MM"3 \n\t"\
191            "psrlw     $1,    "MM"2 \n\t" /* b */\
192            "psrlw     $1,    "MM"3 \n\t" /* f */\
193            MOVQ"    %[tmp0], "MM"4 \n\t" /* c */\
194            MOVQ"    %[tmp1], "MM"5 \n\t" /* d */\
195            MOVQ"    %[tmp2], "MM"7 \n\t" /* e */\
196            "psubw     "MM"4, "MM"2 \n\t" /* b-c */\
197            "psubw     "MM"7, "MM"3 \n\t" /* f-e */\
198            MOVQ"      "MM"5, "MM"0 \n\t"\
199            "psubw     "MM"4, "MM"5 \n\t" /* d-c */\
200            "psubw     "MM"7, "MM"0 \n\t" /* d-e */\
201            MOVQ"      "MM"2, "MM"4 \n\t"\
202            "pminsw    "MM"3, "MM"2 \n\t"\
203            "pmaxsw    "MM"4, "MM"3 \n\t"\
204            "pmaxsw    "MM"5, "MM"2 \n\t"\
205            "pminsw    "MM"5, "MM"3 \n\t"\
206            "pmaxsw    "MM"0, "MM"2 \n\t" /* max */\
207            "pminsw    "MM"0, "MM"3 \n\t" /* min */\
208            "pxor      "MM"4, "MM"4 \n\t"\
209            "pmaxsw    "MM"3, "MM"6 \n\t"\
210            "psubw     "MM"2, "MM"4 \n\t" /* -max */\
211            "pmaxsw    "MM"4, "MM"6 \n\t" /* diff= MAX3(diff, min, -max); */\
212            "1: \n\t"\
213\
214            MOVQ"    %[tmp1], "MM"2 \n\t" /* d */\
215            MOVQ"      "MM"2, "MM"3 \n\t"\
216            "psubw     "MM"6, "MM"2 \n\t" /* d-diff */\
217            "paddw     "MM"6, "MM"3 \n\t" /* d+diff */\
218            "pmaxsw    "MM"2, "MM"1 \n\t"\
219            "pminsw    "MM"3, "MM"1 \n\t" /* d = clip(spatial_pred, d-diff, d+diff); */\
220            "packuswb  "MM"1, "MM"1 \n\t" /* words back to bytes, unsigned saturation */\
221\
222            :[tmp0]"=m"(tmp0),\
223             [tmp1]"=m"(tmp1),\
224             [tmp2]"=m"(tmp2),\
225             [tmp3]"=m"(tmp3)\
226            :[prev] "r"(prev),\
227             [cur]  "r"(cur),\
228             [next] "r"(next),\
229             [prefs]"r"((x86_reg)prefs),\
230             [mrefs]"r"((x86_reg)mrefs),\
231             [mode] "g"(mode)\
232        );\
233        __asm__ volatile(MOV" "MM"1, %0" :"=m"(*dst)); /* store the STEP result bytes left in register 1 */\
234        dst += STEP;\
235        prev+= STEP;\
236        cur += STEP;\
237        next+= STEP;\
238    }
239
    /* parity selects which two of prev/cur/next act as prev2/next2. */
240    if (parity) {
241#define prev2 "prev"
242#define next2 "cur"
243        FILTER
244#undef prev2
245#undef next2
246    } else {
247#define prev2 "cur"
248#define next2 "next"
249        FILTER
250#undef prev2
251#undef next2
252    }
253}
/* Drop every helper macro defined above - presumably so this template can be
 * included again with a different COMPILE_TEMPLATE_* / RENAME configuration
 * (the including file is not visible here; TODO confirm). */
254#undef STEP
255#undef MM
256#undef MOV
257#undef MOVQ
258#undef MOVQU
259#undef PSHUF
260#undef PSRL1
261#undef PSRL2
262#undef LOAD
263#undef PABS
264#undef CHECK
265#undef CHECK1
266#undef CHECK2
267#undef FILTER
268
269