/*
 * Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
 * Copyright (c) 2006 Stefan Gehrer <stefan.gehrer@gmx.de>
 *
 * MMX-optimized DSP functions, based on H.264 optimizations by
 * Michael Niedermayer and Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/common.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "dsputil_mmx.h"

/*****************************************************************************
 *
 * inverse transform
 *
 ****************************************************************************/

static inline void cavs_idct8_1d(int16_t *block, uint64_t bias)
{
    __asm__ volatile(
        "movq 112(%0), %%mm4        \n\t" /* mm4 = src7 */
        "movq  16(%0), %%mm5        \n\t" /* mm5 = src1 */
        "movq  80(%0), %%mm2        \n\t" /* mm2 = src5 */
        "movq  48(%0), %%mm7        \n\t" /* mm7 = src3 */
        "movq %%mm4, %%mm0          \n\t"
        "movq %%mm5, %%mm3          \n\t"
        "movq %%mm2, %%mm6          \n\t"
        "movq %%mm7, %%mm1          \n\t"

        "paddw %%mm4, %%mm4         \n\t" /* mm4 = 2*src7 */
        "paddw %%mm3, %%mm3         \n\t" /* mm3 = 2*src1 */
        "paddw %%mm6, %%mm6         \n\t" /* mm6 = 2*src5 */
        "paddw %%mm1, %%mm1         \n\t" /* mm1 = 2*src3 */
        "paddw %%mm4, %%mm0         \n\t" /* mm0 = 3*src7 */
        "paddw %%mm3, %%mm5         \n\t" /* mm5 = 3*src1 */
        "paddw %%mm6, %%mm2         \n\t" /* mm2 = 3*src5 */
        "paddw %%mm1, %%mm7         \n\t" /* mm7 = 3*src3 */
        "psubw %%mm4, %%mm5         \n\t" /* mm5 = 3*src1 - 2*src7 = a0 */
        "paddw %%mm6, %%mm7         \n\t" /* mm7 = 3*src3 + 2*src5 = a1 */
        "psubw %%mm2, %%mm1         \n\t" /* mm1 = 2*src3 - 3*src5 = a2 */
        "paddw %%mm0, %%mm3         \n\t" /* mm3 = 2*src1 + 3*src7 = a3 */

        "movq %%mm5, %%mm4          \n\t"
        "movq %%mm7, %%mm6          \n\t"
        "movq %%mm3, %%mm0          \n\t"
        "movq %%mm1, %%mm2          \n\t"
        SUMSUB_BA( %%mm7, %%mm5 )         /* mm7 = a0 + a1  mm5 = a0 - a1 */
        "paddw %%mm3, %%mm7         \n\t" /* mm7 = a0 + a1 + a3 */
        "paddw %%mm1, %%mm5         \n\t" /* mm5 = a0 - a1 + a2 */
        "paddw %%mm7, %%mm7         \n\t"
        "paddw %%mm5, %%mm5         \n\t"
        "paddw %%mm6, %%mm7         \n\t" /* mm7 = b4 */
        "paddw %%mm4, %%mm5         \n\t" /* mm5 = b5 */

        SUMSUB_BA( %%mm1, %%mm3 )         /* mm1 = a3 + a2  mm3 = a3 - a2 */
        "psubw %%mm1, %%mm4         \n\t" /* mm4 = a0 - a2 - a3 */
        "movq %%mm4, %%mm1          \n\t" /* mm1 = a0 - a2 - a3 */
        "psubw %%mm6, %%mm3         \n\t" /* mm3 = a3 - a2 - a1 */
        "paddw %%mm1, %%mm1         \n\t"
        "paddw %%mm3, %%mm3         \n\t"
        "psubw %%mm2, %%mm1         \n\t" /* mm1 = b7 */
        "paddw %%mm0, %%mm3         \n\t" /* mm3 = b6 */

        "movq 32(%0), %%mm2         \n\t" /* mm2 = src2 */
        "movq 96(%0), %%mm6         \n\t" /* mm6 = src6 */
        "movq %%mm2, %%mm4          \n\t"
        "movq %%mm6, %%mm0          \n\t"
        "psllw $2, %%mm4            \n\t" /* mm4 = 4*src2 */
        "psllw $2, %%mm6            \n\t" /* mm6 = 4*src6 */
        "paddw %%mm4, %%mm2         \n\t" /* mm2 = 5*src2 */
        "paddw %%mm6, %%mm0         \n\t" /* mm0 = 5*src6 */
        "paddw %%mm2, %%mm2         \n\t"
"paddw %%mm0, %%mm0 \n\t" 92 "psubw %%mm0, %%mm4 \n\t" /* mm4 = 4*src2 - 10*src6 = a7 */ 93 "paddw %%mm2, %%mm6 \n\t" /* mm6 = 4*src6 + 10*src2 = a6 */ 94 95 "movq (%0), %%mm2 \n\t" /* mm2 = src0 */ 96 "movq 64(%0), %%mm0 \n\t" /* mm0 = src4 */ 97 SUMSUB_BA( %%mm0, %%mm2 ) /* mm0 = src0+src4 mm2 = src0-src4 */ 98 "psllw $3, %%mm0 \n\t" 99 "psllw $3, %%mm2 \n\t" 100 "paddw %1, %%mm0 \n\t" /* add rounding bias */ 101 "paddw %1, %%mm2 \n\t" /* add rounding bias */ 102 103 SUMSUB_BA( %%mm6, %%mm0 ) /* mm6 = a4 + a6 mm0 = a4 - a6 */ 104 SUMSUB_BA( %%mm4, %%mm2 ) /* mm4 = a5 + a7 mm2 = a5 - a7 */ 105 SUMSUB_BA( %%mm7, %%mm6 ) /* mm7 = dst0 mm6 = dst7 */ 106 SUMSUB_BA( %%mm5, %%mm4 ) /* mm5 = dst1 mm4 = dst6 */ 107 SUMSUB_BA( %%mm3, %%mm2 ) /* mm3 = dst2 mm2 = dst5 */ 108 SUMSUB_BA( %%mm1, %%mm0 ) /* mm1 = dst3 mm0 = dst4 */ 109 :: "r"(block), "m"(bias) 110 ); 111} 112 113static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride) 114{ 115 int i; 116 DECLARE_ALIGNED_8(int16_t, b2[64]); 117 118 for(i=0; i<2; i++){ 119 DECLARE_ALIGNED_8(uint64_t, tmp); 120 121 cavs_idct8_1d(block+4*i, ff_pw_4); 122 123 __asm__ volatile( 124 "psraw $3, %%mm7 \n\t" 125 "psraw $3, %%mm6 \n\t" 126 "psraw $3, %%mm5 \n\t" 127 "psraw $3, %%mm4 \n\t" 128 "psraw $3, %%mm3 \n\t" 129 "psraw $3, %%mm2 \n\t" 130 "psraw $3, %%mm1 \n\t" 131 "psraw $3, %%mm0 \n\t" 132 "movq %%mm7, %0 \n\t" 133 TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 ) 134 "movq %%mm0, 8(%1) \n\t" 135 "movq %%mm6, 24(%1) \n\t" 136 "movq %%mm7, 40(%1) \n\t" 137 "movq %%mm4, 56(%1) \n\t" 138 "movq %0, %%mm7 \n\t" 139 TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 ) 140 "movq %%mm7, (%1) \n\t" 141 "movq %%mm1, 16(%1) \n\t" 142 "movq %%mm0, 32(%1) \n\t" 143 "movq %%mm3, 48(%1) \n\t" 144 : "=m"(tmp) 145 : "r"(b2+32*i) 146 : "memory" 147 ); 148 } 149 150 for(i=0; i<2; i++){ 151 cavs_idct8_1d(b2+4*i, ff_pw_64.a); 152 153 __asm__ volatile( 154 "psraw $7, %%mm7 \n\t" 155 "psraw $7, %%mm6 \n\t" 156 "psraw $7, %%mm5 \n\t" 157 "psraw $7, %%mm4 \n\t" 158 "psraw $7, %%mm3 \n\t" 159 "psraw $7, %%mm2 \n\t" 160 "psraw $7, %%mm1 \n\t" 161 "psraw $7, %%mm0 \n\t" 162 "movq %%mm7, (%0) \n\t" 163 "movq %%mm5, 16(%0) \n\t" 164 "movq %%mm3, 32(%0) \n\t" 165 "movq %%mm1, 48(%0) \n\t" 166 "movq %%mm0, 64(%0) \n\t" 167 "movq %%mm2, 80(%0) \n\t" 168 "movq %%mm4, 96(%0) \n\t" 169 "movq %%mm6, 112(%0) \n\t" 170 :: "r"(b2+4*i) 171 : "memory" 172 ); 173 } 174 175 add_pixels_clamped_mmx(b2, dst, stride); 176} 177 178/***************************************************************************** 179 * 180 * motion compensation 181 * 182 ****************************************************************************/ 183 184/* vertical filter [-1 -2 96 42 -7 0] */ 185#define QPEL_CAVSV1(A,B,C,D,E,F,OP) \ 186 "movd (%0), "#F" \n\t"\ 187 "movq "#C", %%mm6 \n\t"\ 188 "pmullw %5, %%mm6 \n\t"\ 189 "movq "#D", %%mm7 \n\t"\ 190 "pmullw %6, %%mm7 \n\t"\ 191 "psllw $3, "#E" \n\t"\ 192 "psubw "#E", %%mm6 \n\t"\ 193 "psraw $3, "#E" \n\t"\ 194 "paddw %%mm7, %%mm6 \n\t"\ 195 "paddw "#E", %%mm6 \n\t"\ 196 "paddw "#B", "#B" \n\t"\ 197 "pxor %%mm7, %%mm7 \n\t"\ 198 "add %2, %0 \n\t"\ 199 "punpcklbw %%mm7, "#F" \n\t"\ 200 "psubw "#B", %%mm6 \n\t"\ 201 "psraw $1, "#B" \n\t"\ 202 "psubw "#A", %%mm6 \n\t"\ 203 "paddw %4, %%mm6 \n\t"\ 204 "psraw $7, %%mm6 \n\t"\ 205 "packuswb %%mm6, %%mm6 \n\t"\ 206 OP(%%mm6, (%1), A, d) \ 207 "add %3, %1 \n\t" 208 209/* vertical filter [ 0 -1 5 5 -1 0] */ 210#define QPEL_CAVSV2(A,B,C,D,E,F,OP) \ 211 "movd (%0), "#F" \n\t"\ 212 "movq "#C", %%mm6 \n\t"\ 213 "paddw "#D", 
%%mm6 \n\t"\ 214 "pmullw %5, %%mm6 \n\t"\ 215 "add %2, %0 \n\t"\ 216 "punpcklbw %%mm7, "#F" \n\t"\ 217 "psubw "#B", %%mm6 \n\t"\ 218 "psubw "#E", %%mm6 \n\t"\ 219 "paddw %4, %%mm6 \n\t"\ 220 "psraw $3, %%mm6 \n\t"\ 221 "packuswb %%mm6, %%mm6 \n\t"\ 222 OP(%%mm6, (%1), A, d) \ 223 "add %3, %1 \n\t" 224 225/* vertical filter [ 0 -7 42 96 -2 -1] */ 226#define QPEL_CAVSV3(A,B,C,D,E,F,OP) \ 227 "movd (%0), "#F" \n\t"\ 228 "movq "#C", %%mm6 \n\t"\ 229 "pmullw %6, %%mm6 \n\t"\ 230 "movq "#D", %%mm7 \n\t"\ 231 "pmullw %5, %%mm7 \n\t"\ 232 "psllw $3, "#B" \n\t"\ 233 "psubw "#B", %%mm6 \n\t"\ 234 "psraw $3, "#B" \n\t"\ 235 "paddw %%mm7, %%mm6 \n\t"\ 236 "paddw "#B", %%mm6 \n\t"\ 237 "paddw "#E", "#E" \n\t"\ 238 "pxor %%mm7, %%mm7 \n\t"\ 239 "add %2, %0 \n\t"\ 240 "punpcklbw %%mm7, "#F" \n\t"\ 241 "psubw "#E", %%mm6 \n\t"\ 242 "psraw $1, "#E" \n\t"\ 243 "psubw "#F", %%mm6 \n\t"\ 244 "paddw %4, %%mm6 \n\t"\ 245 "psraw $7, %%mm6 \n\t"\ 246 "packuswb %%mm6, %%mm6 \n\t"\ 247 OP(%%mm6, (%1), A, d) \ 248 "add %3, %1 \n\t" 249 250 251#define QPEL_CAVSVNUM(VOP,OP,ADD,MUL1,MUL2)\ 252 int w= 2;\ 253 src -= 2*srcStride;\ 254 \ 255 while(w--){\ 256 __asm__ volatile(\ 257 "pxor %%mm7, %%mm7 \n\t"\ 258 "movd (%0), %%mm0 \n\t"\ 259 "add %2, %0 \n\t"\ 260 "movd (%0), %%mm1 \n\t"\ 261 "add %2, %0 \n\t"\ 262 "movd (%0), %%mm2 \n\t"\ 263 "add %2, %0 \n\t"\ 264 "movd (%0), %%mm3 \n\t"\ 265 "add %2, %0 \n\t"\ 266 "movd (%0), %%mm4 \n\t"\ 267 "add %2, %0 \n\t"\ 268 "punpcklbw %%mm7, %%mm0 \n\t"\ 269 "punpcklbw %%mm7, %%mm1 \n\t"\ 270 "punpcklbw %%mm7, %%mm2 \n\t"\ 271 "punpcklbw %%mm7, %%mm3 \n\t"\ 272 "punpcklbw %%mm7, %%mm4 \n\t"\ 273 VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ 274 VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ 275 VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ 276 VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ 277 VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\ 278 VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ 279 VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ 280 VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ 281 \ 282 : "+a"(src), "+c"(dst)\ 283 : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1), "m"(MUL2)\ 284 : "memory"\ 285 );\ 286 if(h==16){\ 287 __asm__ volatile(\ 288 VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ 289 VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ 290 VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\ 291 VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ 292 VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ 293 VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ 294 VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ 295 VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ 296 \ 297 : "+a"(src), "+c"(dst)\ 298 : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1), "m"(MUL2)\ 299 : "memory"\ 300 );\ 301 }\ 302 src += 4-(h+5)*srcStride;\ 303 dst += 4-h*dstStride;\ 304 } 305 306#define QPEL_CAVS(OPNAME, OP, MMX)\ 307static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 308 int h=8;\ 309 __asm__ volatile(\ 310 "pxor %%mm7, %%mm7 \n\t"\ 311 "movq %5, %%mm6 \n\t"\ 312 "1: \n\t"\ 313 "movq (%0), %%mm0 \n\t"\ 314 "movq 1(%0), %%mm2 \n\t"\ 315 "movq %%mm0, %%mm1 \n\t"\ 316 "movq %%mm2, %%mm3 \n\t"\ 317 "punpcklbw %%mm7, %%mm0 \n\t"\ 318 "punpckhbw %%mm7, %%mm1 \n\t"\ 319 "punpcklbw %%mm7, %%mm2 \n\t"\ 320 "punpckhbw %%mm7, %%mm3 \n\t"\ 321 "paddw %%mm2, %%mm0 \n\t"\ 322 "paddw %%mm3, %%mm1 \n\t"\ 323 "pmullw %%mm6, %%mm0 \n\t"\ 324 "pmullw %%mm6, %%mm1 \n\t"\ 
325 "movq -1(%0), %%mm2 \n\t"\ 326 "movq 2(%0), %%mm4 \n\t"\ 327 "movq %%mm2, %%mm3 \n\t"\ 328 "movq %%mm4, %%mm5 \n\t"\ 329 "punpcklbw %%mm7, %%mm2 \n\t"\ 330 "punpckhbw %%mm7, %%mm3 \n\t"\ 331 "punpcklbw %%mm7, %%mm4 \n\t"\ 332 "punpckhbw %%mm7, %%mm5 \n\t"\ 333 "paddw %%mm4, %%mm2 \n\t"\ 334 "paddw %%mm3, %%mm5 \n\t"\ 335 "psubw %%mm2, %%mm0 \n\t"\ 336 "psubw %%mm5, %%mm1 \n\t"\ 337 "movq %6, %%mm5 \n\t"\ 338 "paddw %%mm5, %%mm0 \n\t"\ 339 "paddw %%mm5, %%mm1 \n\t"\ 340 "psraw $3, %%mm0 \n\t"\ 341 "psraw $3, %%mm1 \n\t"\ 342 "packuswb %%mm1, %%mm0 \n\t"\ 343 OP(%%mm0, (%1),%%mm5, q) \ 344 "add %3, %0 \n\t"\ 345 "add %4, %1 \n\t"\ 346 "decl %2 \n\t"\ 347 " jnz 1b \n\t"\ 348 : "+a"(src), "+c"(dst), "+m"(h)\ 349 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_4)\ 350 : "memory"\ 351 );\ 352}\ 353\ 354static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ 355 QPEL_CAVSVNUM(QPEL_CAVSV1,OP,ff_pw_64,ff_pw_96,ff_pw_42) \ 356}\ 357\ 358static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ 359 QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_5) \ 360}\ 361\ 362static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ 363 QPEL_CAVSVNUM(QPEL_CAVSV3,OP,ff_pw_64,ff_pw_96,ff_pw_42) \ 364}\ 365\ 366static void OPNAME ## cavs_qpel8_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 367 OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 8);\ 368}\ 369static void OPNAME ## cavs_qpel16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 370 OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 16);\ 371 OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ 372}\ 373\ 374static void OPNAME ## cavs_qpel8_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 375 OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 8);\ 376}\ 377static void OPNAME ## cavs_qpel16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 378 OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 16);\ 379 OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ 380}\ 381\ 382static void OPNAME ## cavs_qpel8_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 383 OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 8);\ 384}\ 385static void OPNAME ## cavs_qpel16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 386 OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 16);\ 387 OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ 388}\ 389\ 390static void OPNAME ## cavs_qpel16_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 391 OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\ 392 OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\ 393 src += 8*srcStride;\ 394 dst += 8*dstStride;\ 395 OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\ 396 OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\ 397}\ 398 399#define CAVS_MC(OPNAME, SIZE, MMX) \ 400static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 401 OPNAME ## cavs_qpel ## SIZE ## _h_ ## MMX(dst, src, stride, stride);\ 402}\ 403\ 404static void 
#define CAVS_MC(OPNAME, SIZE, MMX) \
static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## cavs_qpel ## SIZE ## _h_ ## MMX(dst, src, stride, stride);\
}\
\
static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## cavs_qpel ## SIZE ## _v1_ ## MMX(dst, src, stride, stride);\
}\
\
static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## cavs_qpel ## SIZE ## _v2_ ## MMX(dst, src, stride, stride);\
}\
\
static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## cavs_qpel ## SIZE ## _v3_ ## MMX(dst, src, stride, stride);\
}\

#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "   \n\t"\
"pavgusb " #temp ", " #a "        \n\t"\
"mov" #size " " #a ", " #b "      \n\t"
#define AVG_MMX2_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "   \n\t"\
"pavgb " #temp ", " #a "          \n\t"\
"mov" #size " " #a ", " #b "      \n\t"

QPEL_CAVS(put_, PUT_OP, 3dnow)
QPEL_CAVS(avg_, AVG_3DNOW_OP, 3dnow)
QPEL_CAVS(put_, PUT_OP, mmx2)
QPEL_CAVS(avg_, AVG_MMX2_OP, mmx2)

CAVS_MC(put_, 8, 3dnow)
CAVS_MC(put_, 16,3dnow)
CAVS_MC(avg_, 8, 3dnow)
CAVS_MC(avg_, 16,3dnow)
CAVS_MC(put_, 8, mmx2)
CAVS_MC(put_, 16,mmx2)
CAVS_MC(avg_, 8, mmx2)
CAVS_MC(avg_, 16,mmx2)

void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);

void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx) {
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_mmx2; \
    c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_mmx2; \

    dspfunc(put_cavs_qpel, 0, 16);
    dspfunc(put_cavs_qpel, 1, 8);
    dspfunc(avg_cavs_qpel, 0, 16);
    dspfunc(avg_cavs_qpel, 1, 8);
#undef dspfunc
    c->cavs_idct8_add = cavs_idct8_add_mmx;
}

void ff_cavsdsp_init_3dnow(DSPContext* c, AVCodecContext *avctx) {
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_3dnow; \
    c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_3dnow; \

    dspfunc(put_cavs_qpel, 0, 16);
    dspfunc(put_cavs_qpel, 1, 8);
    dspfunc(avg_cavs_qpel, 0, 16);
    dspfunc(avg_cavs_qpel, 1, 8);
#undef dspfunc
    c->cavs_idct8_add = cavs_idct8_add_mmx;
}
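
#if 0
/* The _pixels_tab index used above encodes the sub-pel position as
 * 4*dy + dx, which is why mc20 (dx=2, dy=0) lands at [2] and mc01/mc02/mc03
 * (dx=0, dy=1..3) land at [4], [8] and [12]; only these positions and the
 * mc00 copy (declared above, defined elsewhere) are accelerated here.
 * Illustrative sketch of a caller, not compiled; the function name is
 * made up. */
static void mc_example(DSPContext *c, uint8_t *dst, uint8_t *src,
                       int stride, int dx, int dy)
{
    /* [0] holds the 16x16 functions, [1] the 8x8 ones */
    c->put_cavs_qpel_pixels_tab[0][4*dy + dx](dst, src, stride);
}
#endif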