1/* 2 * Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at> 3 * 4 * This file is part of Libav. 5 * 6 * Libav is free software; you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License as published by 8 * the Free Software Foundation; either version 2 of the License, or 9 * (at your option) any later version. 10 * 11 * Libav is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public License along 17 * with Libav; if not, write to the Free Software Foundation, Inc., 18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 19 */ 20 21#ifdef COMPILE_TEMPLATE_SSE 22#define MM "%%xmm" 23#define MOV "movq" 24#define MOVQ "movdqa" 25#define MOVQU "movdqu" 26#define STEP 8 27#define LOAD(mem,dst) \ 28 MOV" "mem", "dst" \n\t"\ 29 "punpcklbw "MM"7, "dst" \n\t" 30#define PSRL1(reg) "psrldq $1, "reg" \n\t" 31#define PSRL2(reg) "psrldq $2, "reg" \n\t" 32#define PSHUF(src,dst) "movdqa "dst", "src" \n\t"\ 33 "psrldq $2, "src" \n\t" 34#else 35#define MM "%%mm" 36#define MOV "movd" 37#define MOVQ "movq" 38#define MOVQU "movq" 39#define STEP 4 40#define LOAD(mem,dst) \ 41 MOV" "mem", "dst" \n\t"\ 42 "punpcklbw "MM"7, "dst" \n\t" 43#define PSRL1(reg) "psrlq $8, "reg" \n\t" 44#define PSRL2(reg) "psrlq $16, "reg" \n\t" 45#define PSHUF(src,dst) "pshufw $9, "dst", "src" \n\t" 46#endif 47 48#ifdef COMPILE_TEMPLATE_SSSE3 49#define PABS(tmp,dst) \ 50 "pabsw "dst", "dst" \n\t" 51#else 52#define PABS(tmp,dst) \ 53 "pxor "tmp", "tmp" \n\t"\ 54 "psubw "dst", "tmp" \n\t"\ 55 "pmaxsw "tmp", "dst" \n\t" 56#endif 57 58#define CHECK(pj,mj) \ 59 MOVQU" "#pj"(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1+j] */\ 60 MOVQU" "#mj"(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1-j] */\ 61 MOVQ" "MM"2, "MM"4 \n\t"\ 62 MOVQ" "MM"2, "MM"5 \n\t"\ 63 "pxor "MM"3, "MM"4 \n\t"\ 64 "pavgb "MM"3, "MM"5 \n\t"\ 65 "pand "MANGLE(pb_1)", "MM"4 \n\t"\ 66 "psubusb "MM"4, "MM"5 \n\t"\ 67 PSRL1(MM"5") \ 68 "punpcklbw "MM"7, "MM"5 \n\t" /* (cur[x-refs+j] + cur[x+refs-j])>>1 */\ 69 MOVQ" "MM"2, "MM"4 \n\t"\ 70 "psubusb "MM"3, "MM"2 \n\t"\ 71 "psubusb "MM"4, "MM"3 \n\t"\ 72 "pmaxub "MM"3, "MM"2 \n\t"\ 73 MOVQ" "MM"2, "MM"3 \n\t"\ 74 MOVQ" "MM"2, "MM"4 \n\t" /* ABS(cur[x-refs-1+j] - cur[x+refs-1-j]) */\ 75 PSRL1(MM"3") /* ABS(cur[x-refs +j] - cur[x+refs -j]) */\ 76 PSRL2(MM"4") /* ABS(cur[x-refs+1+j] - cur[x+refs+1-j]) */\ 77 "punpcklbw "MM"7, "MM"2 \n\t"\ 78 "punpcklbw "MM"7, "MM"3 \n\t"\ 79 "punpcklbw "MM"7, "MM"4 \n\t"\ 80 "paddw "MM"3, "MM"2 \n\t"\ 81 "paddw "MM"4, "MM"2 \n\t" /* score */ 82 83#define CHECK1 \ 84 MOVQ" "MM"0, "MM"3 \n\t"\ 85 "pcmpgtw "MM"2, "MM"3 \n\t" /* if(score < spatial_score) */\ 86 "pminsw "MM"2, "MM"0 \n\t" /* spatial_score= score; */\ 87 MOVQ" "MM"3, "MM"6 \n\t"\ 88 "pand "MM"3, "MM"5 \n\t"\ 89 "pandn "MM"1, "MM"3 \n\t"\ 90 "por "MM"5, "MM"3 \n\t"\ 91 MOVQ" "MM"3, "MM"1 \n\t" /* spatial_pred= (cur[x-refs+j] + cur[x+refs-j])>>1; */ 92 93#define CHECK2 /* pretend not to have checked dir=2 if dir=1 was bad.\ 94 hurts both quality and speed, but matches the C version. */\ 95 "paddw "MANGLE(pw_1)", "MM"6 \n\t"\ 96 "psllw $14, "MM"6 \n\t"\ 97 "paddsw "MM"6, "MM"2 \n\t"\ 98 MOVQ" "MM"0, "MM"3 \n\t"\ 99 "pcmpgtw "MM"2, "MM"3 \n\t"\ 100 "pminsw "MM"2, "MM"0 \n\t"\ 101 "pand "MM"3, "MM"5 \n\t"\ 102 "pandn "MM"1, "MM"3 \n\t"\ 103 "por "MM"5, "MM"3 \n\t"\ 104 MOVQ" "MM"3, "MM"1 \n\t" 105 106void RENAME(ff_yadif_filter_line)(uint8_t *dst, 107 uint8_t *prev, uint8_t *cur, uint8_t *next, 108 int w, int prefs, int mrefs, int parity, int mode) 109{ 110 DECLARE_ALIGNED(16, uint8_t, tmp0[16]); 111 DECLARE_ALIGNED(16, uint8_t, tmp1[16]); 112 DECLARE_ALIGNED(16, uint8_t, tmp2[16]); 113 DECLARE_ALIGNED(16, uint8_t, tmp3[16]); 114 int x; 115 116#define FILTER\ 117 for(x=0; x<w; x+=STEP){\ 118 __asm__ volatile(\ 119 "pxor "MM"7, "MM"7 \n\t"\ 120 LOAD("(%[cur],%[mrefs])", MM"0") /* c = cur[x-refs] */\ 121 LOAD("(%[cur],%[prefs])", MM"1") /* e = cur[x+refs] */\ 122 LOAD("(%["prev2"])", MM"2") /* prev2[x] */\ 123 LOAD("(%["next2"])", MM"3") /* next2[x] */\ 124 MOVQ" "MM"3, "MM"4 \n\t"\ 125 "paddw "MM"2, "MM"3 \n\t"\ 126 "psraw $1, "MM"3 \n\t" /* d = (prev2[x] + next2[x])>>1 */\ 127 MOVQ" "MM"0, %[tmp0] \n\t" /* c */\ 128 MOVQ" "MM"3, %[tmp1] \n\t" /* d */\ 129 MOVQ" "MM"1, %[tmp2] \n\t" /* e */\ 130 "psubw "MM"4, "MM"2 \n\t"\ 131 PABS( MM"4", MM"2") /* temporal_diff0 */\ 132 LOAD("(%[prev],%[mrefs])", MM"3") /* prev[x-refs] */\ 133 LOAD("(%[prev],%[prefs])", MM"4") /* prev[x+refs] */\ 134 "psubw "MM"0, "MM"3 \n\t"\ 135 "psubw "MM"1, "MM"4 \n\t"\ 136 PABS( MM"5", MM"3")\ 137 PABS( MM"5", MM"4")\ 138 "paddw "MM"4, "MM"3 \n\t" /* temporal_diff1 */\ 139 "psrlw $1, "MM"2 \n\t"\ 140 "psrlw $1, "MM"3 \n\t"\ 141 "pmaxsw "MM"3, "MM"2 \n\t"\ 142 LOAD("(%[next],%[mrefs])", MM"3") /* next[x-refs] */\ 143 LOAD("(%[next],%[prefs])", MM"4") /* next[x+refs] */\ 144 "psubw "MM"0, "MM"3 \n\t"\ 145 "psubw "MM"1, "MM"4 \n\t"\ 146 PABS( MM"5", MM"3")\ 147 PABS( MM"5", MM"4")\ 148 "paddw "MM"4, "MM"3 \n\t" /* temporal_diff2 */\ 149 "psrlw $1, "MM"3 \n\t"\ 150 "pmaxsw "MM"3, "MM"2 \n\t"\ 151 MOVQ" "MM"2, %[tmp3] \n\t" /* diff */\ 152\ 153 "paddw "MM"0, "MM"1 \n\t"\ 154 "paddw "MM"0, "MM"0 \n\t"\ 155 "psubw "MM"1, "MM"0 \n\t"\ 156 "psrlw $1, "MM"1 \n\t" /* spatial_pred */\ 157 PABS( MM"2", MM"0") /* ABS(c-e) */\ 158\ 159 MOVQU" -1(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1] */\ 160 MOVQU" -1(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1] */\ 161 MOVQ" "MM"2, "MM"4 \n\t"\ 162 "psubusb "MM"3, "MM"2 \n\t"\ 163 "psubusb "MM"4, "MM"3 \n\t"\ 164 "pmaxub "MM"3, "MM"2 \n\t"\ 165 PSHUF(MM"3", MM"2") \ 166 "punpcklbw "MM"7, "MM"2 \n\t" /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\ 167 "punpcklbw "MM"7, "MM"3 \n\t" /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\ 168 "paddw "MM"2, "MM"0 \n\t"\ 169 "paddw "MM"3, "MM"0 \n\t"\ 170 "psubw "MANGLE(pw_1)", "MM"0 \n\t" /* spatial_score */\ 171\ 172 CHECK(-2,0)\ 173 CHECK1\ 174 CHECK(-3,1)\ 175 CHECK2\ 176 CHECK(0,-2)\ 177 CHECK1\ 178 CHECK(1,-3)\ 179 CHECK2\ 180\ 181 /* if(p->mode<2) ... */\ 182 MOVQ" %[tmp3], "MM"6 \n\t" /* diff */\ 183 "cmpl $2, %[mode] \n\t"\ 184 "jge 1f \n\t"\ 185 LOAD("(%["prev2"],%[mrefs],2)", MM"2") /* prev2[x-2*refs] */\ 186 LOAD("(%["next2"],%[mrefs],2)", MM"4") /* next2[x-2*refs] */\ 187 LOAD("(%["prev2"],%[prefs],2)", MM"3") /* prev2[x+2*refs] */\ 188 LOAD("(%["next2"],%[prefs],2)", MM"5") /* next2[x+2*refs] */\ 189 "paddw "MM"4, "MM"2 \n\t"\ 190 "paddw "MM"5, "MM"3 \n\t"\ 191 "psrlw $1, "MM"2 \n\t" /* b */\ 192 "psrlw $1, "MM"3 \n\t" /* f */\ 193 MOVQ" %[tmp0], "MM"4 \n\t" /* c */\ 194 MOVQ" %[tmp1], "MM"5 \n\t" /* d */\ 195 MOVQ" %[tmp2], "MM"7 \n\t" /* e */\ 196 "psubw "MM"4, "MM"2 \n\t" /* b-c */\ 197 "psubw "MM"7, "MM"3 \n\t" /* f-e */\ 198 MOVQ" "MM"5, "MM"0 \n\t"\ 199 "psubw "MM"4, "MM"5 \n\t" /* d-c */\ 200 "psubw "MM"7, "MM"0 \n\t" /* d-e */\ 201 MOVQ" "MM"2, "MM"4 \n\t"\ 202 "pminsw "MM"3, "MM"2 \n\t"\ 203 "pmaxsw "MM"4, "MM"3 \n\t"\ 204 "pmaxsw "MM"5, "MM"2 \n\t"\ 205 "pminsw "MM"5, "MM"3 \n\t"\ 206 "pmaxsw "MM"0, "MM"2 \n\t" /* max */\ 207 "pminsw "MM"0, "MM"3 \n\t" /* min */\ 208 "pxor "MM"4, "MM"4 \n\t"\ 209 "pmaxsw "MM"3, "MM"6 \n\t"\ 210 "psubw "MM"2, "MM"4 \n\t" /* -max */\ 211 "pmaxsw "MM"4, "MM"6 \n\t" /* diff= MAX3(diff, min, -max); */\ 212 "1: \n\t"\ 213\ 214 MOVQ" %[tmp1], "MM"2 \n\t" /* d */\ 215 MOVQ" "MM"2, "MM"3 \n\t"\ 216 "psubw "MM"6, "MM"2 \n\t" /* d-diff */\ 217 "paddw "MM"6, "MM"3 \n\t" /* d+diff */\ 218 "pmaxsw "MM"2, "MM"1 \n\t"\ 219 "pminsw "MM"3, "MM"1 \n\t" /* d = clip(spatial_pred, d-diff, d+diff); */\ 220 "packuswb "MM"1, "MM"1 \n\t"\ 221\ 222 :[tmp0]"=m"(tmp0),\ 223 [tmp1]"=m"(tmp1),\ 224 [tmp2]"=m"(tmp2),\ 225 [tmp3]"=m"(tmp3)\ 226 :[prev] "r"(prev),\ 227 [cur] "r"(cur),\ 228 [next] "r"(next),\ 229 [prefs]"r"((x86_reg)prefs),\ 230 [mrefs]"r"((x86_reg)mrefs),\ 231 [mode] "g"(mode)\ 232 );\ 233 __asm__ volatile(MOV" "MM"1, %0" :"=m"(*dst));\ 234 dst += STEP;\ 235 prev+= STEP;\ 236 cur += STEP;\ 237 next+= STEP;\ 238 } 239 240 if (parity) { 241#define prev2 "prev" 242#define next2 "cur" 243 FILTER 244#undef prev2 245#undef next2 246 } else { 247#define prev2 "cur" 248#define next2 "next" 249 FILTER 250#undef prev2 251#undef next2 252 } 253} 254#undef STEP 255#undef MM 256#undef MOV 257#undef MOVQ 258#undef MOVQU 259#undef PSHUF 260#undef PSRL1 261#undef PSRL2 262#undef LOAD 263#undef PABS 264#undef CHECK 265#undef CHECK1 266#undef CHECK2 267#undef FILTER 268 269