/*
 * Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
 * Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * MMX optimized version of (put|avg)_h264_chroma_mc8.
 * H264_CHROMA_MC8_TMPL must be defined to the desired function name.
 * H264_CHROMA_OP must be defined to be empty for put and to pavgb/pavgusb for avg.
 * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function.
 */
static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg)
{
    DECLARE_ALIGNED_8(uint64_t, AA);
    DECLARE_ALIGNED_8(uint64_t, DD);
    int i;

    if(y==0 && x==0) {
        /* no filter needed */
        H264_CHROMA_MC8_MV0(dst, src, stride, h);
        return;
    }

    assert(x<8 && y<8 && x>=0 && y>=0);

    if(y==0 || x==0)
    {
        /* 1-dimensional filter only */
        const int dxy = x ? 1 : stride;

        __asm__ volatile(
            "movd %0, %%mm5\n\t"
            "movq %1, %%mm4\n\t"
            "movq %2, %%mm6\n\t"         /* mm6 = rnd >> 3 */
            "punpcklwd %%mm5, %%mm5\n\t"
            "punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x+y (one of x,y is 0) */
            "pxor %%mm7, %%mm7\n\t"
            "psubw %%mm5, %%mm4\n\t"     /* mm4 = A = 8-B */
            :: "rm"(x+y), "m"(ff_pw_8), "m"(*(rnd_reg+1)));

        for(i=0; i<h; i++) {
            __asm__ volatile(
                /* mm0 = src[0..7], mm2 = src[dxy..dxy+7] */
                "movq %0, %%mm0\n\t"
                "movq %1, %%mm2\n\t"
                :: "m"(src[0]), "m"(src[dxy]));

            __asm__ volatile(
                /* [mm0,mm1] = A * src[0..7] */
                /* [mm2,mm3] = B * src[dxy..dxy+7] */
                "movq %%mm0, %%mm1\n\t"
                "movq %%mm2, %%mm3\n\t"
                "punpcklbw %%mm7, %%mm0\n\t"
                "punpckhbw %%mm7, %%mm1\n\t"
                "punpcklbw %%mm7, %%mm2\n\t"
                "punpckhbw %%mm7, %%mm3\n\t"
                "pmullw %%mm4, %%mm0\n\t"
                "pmullw %%mm4, %%mm1\n\t"
                "pmullw %%mm5, %%mm2\n\t"
                "pmullw %%mm5, %%mm3\n\t"

                /* dst[0..7] = (A * src[0..7] + B * src[dxy..dxy+7] + (rnd >> 3)) >> 3 */
                "paddw %%mm6, %%mm0\n\t"
                "paddw %%mm6, %%mm1\n\t"
                "paddw %%mm2, %%mm0\n\t"
                "paddw %%mm3, %%mm1\n\t"
                "psrlw $3, %%mm0\n\t"
                "psrlw $3, %%mm1\n\t"
                "packuswb %%mm1, %%mm0\n\t"
                H264_CHROMA_OP(%0, %%mm0)
                "movq %%mm0, %0\n\t"
                : "=m" (dst[0]));

            src += stride;
            dst += stride;
        }
        return;
    }

    /* general case, bilinear */
    __asm__ volatile("movd %2, %%mm4\n\t"
                     "movd %3, %%mm6\n\t"
                     "punpcklwd %%mm4, %%mm4\n\t"
                     "punpcklwd %%mm6, %%mm6\n\t"
                     "punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */
                     "punpckldq %%mm6, %%mm6\n\t" /* mm6 = y words */
                     "movq %%mm4, %%mm5\n\t"
                     "pmullw %%mm6, %%mm4\n\t"    /* mm4 = x * y */
                     "psllw $3, %%mm5\n\t"
                     "psllw $3, %%mm6\n\t"
                     "movq %%mm5, %%mm7\n\t"
                     "paddw %%mm6, %%mm7\n\t"
                     "movq %%mm4, %1\n\t"         /* DD = x * y */
                     "psubw %%mm4, %%mm5\n\t"     /* mm5 = B = 8x - xy */
                     "psubw %%mm4, %%mm6\n\t"     /* mm6 = C = 8y - xy */
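                     /* The four bilinear weights: A = (8-x)(8-y),
                      * B = x(8-y) = 8x-xy, C = (8-x)y = 8y-xy, D = xy.
                      * They sum to 64, so each output pixel is
                      * (A*s[0] + B*s[1] + C*s[stride] + D*s[stride+1] + rnd) >> 6. */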
"paddw %4, %%mm4\n\t" 113 "psubw %%mm7, %%mm4\n\t" /* mm4 = A = xy - (8x+8y) + 64 */ 114 "pxor %%mm7, %%mm7\n\t" 115 "movq %%mm4, %0\n\t" 116 : "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64)); 117 118 __asm__ volatile( 119 /* mm0 = src[0..7], mm1 = src[1..8] */ 120 "movq %0, %%mm0\n\t" 121 "movq %1, %%mm1\n\t" 122 : : "m" (src[0]), "m" (src[1])); 123 124 for(i=0; i<h; i++) { 125 src += stride; 126 127 __asm__ volatile( 128 /* mm2 = A * src[0..3] + B * src[1..4] */ 129 /* mm3 = A * src[4..7] + B * src[5..8] */ 130 "movq %%mm0, %%mm2\n\t" 131 "movq %%mm1, %%mm3\n\t" 132 "punpckhbw %%mm7, %%mm0\n\t" 133 "punpcklbw %%mm7, %%mm1\n\t" 134 "punpcklbw %%mm7, %%mm2\n\t" 135 "punpckhbw %%mm7, %%mm3\n\t" 136 "pmullw %0, %%mm0\n\t" 137 "pmullw %0, %%mm2\n\t" 138 "pmullw %%mm5, %%mm1\n\t" 139 "pmullw %%mm5, %%mm3\n\t" 140 "paddw %%mm1, %%mm2\n\t" 141 "paddw %%mm0, %%mm3\n\t" 142 : : "m" (AA)); 143 144 __asm__ volatile( 145 /* [mm2,mm3] += C * src[0..7] */ 146 "movq %0, %%mm0\n\t" 147 "movq %%mm0, %%mm1\n\t" 148 "punpcklbw %%mm7, %%mm0\n\t" 149 "punpckhbw %%mm7, %%mm1\n\t" 150 "pmullw %%mm6, %%mm0\n\t" 151 "pmullw %%mm6, %%mm1\n\t" 152 "paddw %%mm0, %%mm2\n\t" 153 "paddw %%mm1, %%mm3\n\t" 154 : : "m" (src[0])); 155 156 __asm__ volatile( 157 /* [mm2,mm3] += D * src[1..8] */ 158 "movq %1, %%mm1\n\t" 159 "movq %%mm1, %%mm0\n\t" 160 "movq %%mm1, %%mm4\n\t" 161 "punpcklbw %%mm7, %%mm0\n\t" 162 "punpckhbw %%mm7, %%mm4\n\t" 163 "pmullw %2, %%mm0\n\t" 164 "pmullw %2, %%mm4\n\t" 165 "paddw %%mm0, %%mm2\n\t" 166 "paddw %%mm4, %%mm3\n\t" 167 "movq %0, %%mm0\n\t" 168 : : "m" (src[0]), "m" (src[1]), "m" (DD)); 169 170 __asm__ volatile( 171 /* dst[0..7] = ([mm2,mm3] + rnd) >> 6 */ 172 "paddw %1, %%mm2\n\t" 173 "paddw %1, %%mm3\n\t" 174 "psrlw $6, %%mm2\n\t" 175 "psrlw $6, %%mm3\n\t" 176 "packuswb %%mm3, %%mm2\n\t" 177 H264_CHROMA_OP(%0, %%mm2) 178 "movq %%mm2, %0\n\t" 179 : "=m" (dst[0]) : "m" (*rnd_reg)); 180 dst+= stride; 181 } 182} 183 184static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg) 185{ 186 __asm__ volatile( 187 "pxor %%mm7, %%mm7 \n\t" 188 "movd %5, %%mm2 \n\t" 189 "movd %6, %%mm3 \n\t" 190 "movq "MANGLE(ff_pw_8)", %%mm4\n\t" 191 "movq "MANGLE(ff_pw_8)", %%mm5\n\t" 192 "punpcklwd %%mm2, %%mm2 \n\t" 193 "punpcklwd %%mm3, %%mm3 \n\t" 194 "punpcklwd %%mm2, %%mm2 \n\t" 195 "punpcklwd %%mm3, %%mm3 \n\t" 196 "psubw %%mm2, %%mm4 \n\t" 197 "psubw %%mm3, %%mm5 \n\t" 198 199 "movd (%1), %%mm0 \n\t" 200 "movd 1(%1), %%mm6 \n\t" 201 "add %3, %1 \n\t" 202 "punpcklbw %%mm7, %%mm0 \n\t" 203 "punpcklbw %%mm7, %%mm6 \n\t" 204 "pmullw %%mm4, %%mm0 \n\t" 205 "pmullw %%mm2, %%mm6 \n\t" 206 "paddw %%mm0, %%mm6 \n\t" 207 208 "1: \n\t" 209 "movd (%1), %%mm0 \n\t" 210 "movd 1(%1), %%mm1 \n\t" 211 "add %3, %1 \n\t" 212 "punpcklbw %%mm7, %%mm0 \n\t" 213 "punpcklbw %%mm7, %%mm1 \n\t" 214 "pmullw %%mm4, %%mm0 \n\t" 215 "pmullw %%mm2, %%mm1 \n\t" 216 "paddw %%mm0, %%mm1 \n\t" 217 "movq %%mm1, %%mm0 \n\t" 218 "pmullw %%mm5, %%mm6 \n\t" 219 "pmullw %%mm3, %%mm1 \n\t" 220 "paddw %4, %%mm6 \n\t" 221 "paddw %%mm6, %%mm1 \n\t" 222 "psrlw $6, %%mm1 \n\t" 223 "packuswb %%mm1, %%mm1 \n\t" 224 H264_CHROMA_OP4((%0), %%mm1, %%mm6) 225 "movd %%mm1, (%0) \n\t" 226 "add %3, %0 \n\t" 227 "movd (%1), %%mm6 \n\t" 228 "movd 1(%1), %%mm1 \n\t" 229 "add %3, %1 \n\t" 230 "punpcklbw %%mm7, %%mm6 \n\t" 231 "punpcklbw %%mm7, %%mm1 \n\t" 232 "pmullw %%mm4, %%mm6 \n\t" 233 "pmullw %%mm2, %%mm1 \n\t" 234 "paddw %%mm6, %%mm1 \n\t" 235 "movq %%mm1, %%mm6 \n\t" 236 
"pmullw %%mm5, %%mm0 \n\t" 237 "pmullw %%mm3, %%mm1 \n\t" 238 "paddw %4, %%mm0 \n\t" 239 "paddw %%mm0, %%mm1 \n\t" 240 "psrlw $6, %%mm1 \n\t" 241 "packuswb %%mm1, %%mm1 \n\t" 242 H264_CHROMA_OP4((%0), %%mm1, %%mm0) 243 "movd %%mm1, (%0) \n\t" 244 "add %3, %0 \n\t" 245 "sub $2, %2 \n\t" 246 "jnz 1b \n\t" 247 : "+r"(dst), "+r"(src), "+r"(h) 248 : "r"((x86_reg)stride), "m"(*rnd_reg), "m"(x), "m"(y) 249 ); 250} 251 252#ifdef H264_CHROMA_MC2_TMPL 253static void H264_CHROMA_MC2_TMPL(uint8_t *dst/*align 2*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) 254{ 255 int tmp = ((1<<16)-1)*x + 8; 256 int CD= tmp*y; 257 int AB= (tmp<<3) - CD; 258 __asm__ volatile( 259 /* mm5 = {A,B,A,B} */ 260 /* mm6 = {C,D,C,D} */ 261 "movd %0, %%mm5\n\t" 262 "movd %1, %%mm6\n\t" 263 "punpckldq %%mm5, %%mm5\n\t" 264 "punpckldq %%mm6, %%mm6\n\t" 265 "pxor %%mm7, %%mm7\n\t" 266 /* mm0 = src[0,1,1,2] */ 267 "movd %2, %%mm2\n\t" 268 "punpcklbw %%mm7, %%mm2\n\t" 269 "pshufw $0x94, %%mm2, %%mm2\n\t" 270 :: "r"(AB), "r"(CD), "m"(src[0])); 271 272 273 __asm__ volatile( 274 "1:\n\t" 275 "add %4, %1\n\t" 276 /* mm1 = A * src[0,1] + B * src[1,2] */ 277 "movq %%mm2, %%mm1\n\t" 278 "pmaddwd %%mm5, %%mm1\n\t" 279 /* mm0 = src[0,1,1,2] */ 280 "movd (%1), %%mm0\n\t" 281 "punpcklbw %%mm7, %%mm0\n\t" 282 "pshufw $0x94, %%mm0, %%mm0\n\t" 283 /* mm1 += C * src[0,1] + D * src[1,2] */ 284 "movq %%mm0, %%mm2\n\t" 285 "pmaddwd %%mm6, %%mm0\n\t" 286 "paddw %3, %%mm1\n\t" 287 "paddw %%mm0, %%mm1\n\t" 288 /* dst[0,1] = pack((mm1 + 32) >> 6) */ 289 "psrlw $6, %%mm1\n\t" 290 "packssdw %%mm7, %%mm1\n\t" 291 "packuswb %%mm7, %%mm1\n\t" 292 H264_CHROMA_OP4((%0), %%mm1, %%mm3) 293 "movd %%mm1, %%esi\n\t" 294 "movw %%si, (%0)\n\t" 295 "add %4, %0\n\t" 296 "sub $1, %2\n\t" 297 "jnz 1b\n\t" 298 : "+r" (dst), "+r"(src), "+r"(h) 299 : "m" (ff_pw_32), "r"((x86_reg)stride) 300 : "%esi"); 301 302} 303#endif 304 305