1/* 2 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt 3 * 4 * This file is part of FFmpeg. 5 * 6 * FFmpeg is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * FFmpeg is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with FFmpeg; if not, write to the Free Software 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19 */ 20 21#include "dsputil_mmx.h" 22 23DECLARE_ALIGNED_8 (static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL; 24DECLARE_ALIGNED_8 (static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL; 25 26/***********************************/ 27/* IDCT */ 28 29#define SUMSUB_BADC( a, b, c, d ) \ 30 "paddw "#b", "#a" \n\t"\ 31 "paddw "#d", "#c" \n\t"\ 32 "paddw "#b", "#b" \n\t"\ 33 "paddw "#d", "#d" \n\t"\ 34 "psubw "#a", "#b" \n\t"\ 35 "psubw "#c", "#d" \n\t" 36 37#define SUMSUBD2_AB( a, b, t ) \ 38 "movq "#b", "#t" \n\t"\ 39 "psraw $1 , "#b" \n\t"\ 40 "paddw "#a", "#b" \n\t"\ 41 "psraw $1 , "#a" \n\t"\ 42 "psubw "#t", "#a" \n\t" 43 44#define IDCT4_1D( s02, s13, d02, d13, t ) \ 45 SUMSUB_BA ( s02, d02 )\ 46 SUMSUBD2_AB( s13, d13, t )\ 47 SUMSUB_BADC( d13, s02, s13, d02 ) 48 49#define STORE_DIFF_4P( p, t, z ) \ 50 "psraw $6, "#p" \n\t"\ 51 "movd (%0), "#t" \n\t"\ 52 "punpcklbw "#z", "#t" \n\t"\ 53 "paddsw "#t", "#p" \n\t"\ 54 "packuswb "#z", "#p" \n\t"\ 55 "movd "#p", (%0) \n\t" 56 57static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride) 58{ 59 /* Load dct coeffs */ 60 __asm__ volatile( 61 "movq (%0), %%mm0 \n\t" 62 "movq 8(%0), %%mm1 \n\t" 63 "movq 16(%0), %%mm2 \n\t" 64 "movq 24(%0), %%mm3 \n\t" 65 :: "r"(block) ); 66 67 __asm__ volatile( 68 /* mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 */ 69 IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 ) 70 71 "movq %0, %%mm6 \n\t" 72 /* in: 1,4,0,2 out: 1,2,3,0 */ 73 TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 ) 74 75 "paddw %%mm6, %%mm3 \n\t" 76 77 /* mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 */ 78 IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 ) 79 80 "pxor %%mm7, %%mm7 \n\t" 81 :: "m"(ff_pw_32)); 82 83 __asm__ volatile( 84 STORE_DIFF_4P( %%mm0, %%mm1, %%mm7) 85 "add %1, %0 \n\t" 86 STORE_DIFF_4P( %%mm2, %%mm1, %%mm7) 87 "add %1, %0 \n\t" 88 STORE_DIFF_4P( %%mm3, %%mm1, %%mm7) 89 "add %1, %0 \n\t" 90 STORE_DIFF_4P( %%mm4, %%mm1, %%mm7) 91 : "+r"(dst) 92 : "r" ((x86_reg)stride) 93 ); 94} 95 96static inline void h264_idct8_1d(int16_t *block) 97{ 98 __asm__ volatile( 99 "movq 112(%0), %%mm7 \n\t" 100 "movq 80(%0), %%mm0 \n\t" 101 "movq 48(%0), %%mm3 \n\t" 102 "movq 16(%0), %%mm5 \n\t" 103 104 "movq %%mm0, %%mm4 \n\t" 105 "movq %%mm5, %%mm1 \n\t" 106 "psraw $1, %%mm4 \n\t" 107 "psraw $1, %%mm1 \n\t" 108 "paddw %%mm0, %%mm4 \n\t" 109 "paddw %%mm5, %%mm1 \n\t" 110 "paddw %%mm7, %%mm4 \n\t" 111 "paddw %%mm0, %%mm1 \n\t" 112 "psubw %%mm5, %%mm4 \n\t" 113 "paddw %%mm3, %%mm1 \n\t" 114 115 "psubw %%mm3, %%mm5 \n\t" 116 "psubw %%mm3, %%mm0 \n\t" 117 "paddw %%mm7, %%mm5 \n\t" 118 "psubw %%mm7, %%mm0 \n\t" 119 "psraw $1, %%mm3 \n\t" 120 "psraw $1, %%mm7 \n\t" 121 "psubw %%mm3, %%mm5 \n\t" 122 "psubw %%mm7, %%mm0 \n\t" 123 124 "movq %%mm4, %%mm3 \n\t" 125 "movq %%mm1, %%mm7 \n\t" 126 "psraw $2, %%mm1 \n\t" 127 "psraw $2, %%mm3 \n\t" 128 "paddw %%mm5, %%mm3 \n\t" 129 "psraw $2, %%mm5 \n\t" 130 "paddw %%mm0, %%mm1 \n\t" 131 "psraw $2, %%mm0 \n\t" 132 "psubw %%mm4, %%mm5 \n\t" 133 "psubw %%mm0, %%mm7 \n\t" 134 135 "movq 32(%0), %%mm2 \n\t" 136 "movq 96(%0), %%mm6 \n\t" 137 "movq %%mm2, %%mm4 \n\t" 138 "movq %%mm6, %%mm0 \n\t" 139 "psraw $1, %%mm4 \n\t" 140 "psraw $1, %%mm6 \n\t" 141 "psubw %%mm0, %%mm4 \n\t" 142 "paddw %%mm2, %%mm6 \n\t" 143 144 "movq (%0), %%mm2 \n\t" 145 "movq 64(%0), %%mm0 \n\t" 146 SUMSUB_BA( %%mm0, %%mm2 ) 147 SUMSUB_BA( %%mm6, %%mm0 ) 148 SUMSUB_BA( %%mm4, %%mm2 ) 149 SUMSUB_BA( %%mm7, %%mm6 ) 150 SUMSUB_BA( %%mm5, %%mm4 ) 151 SUMSUB_BA( %%mm3, %%mm2 ) 152 SUMSUB_BA( %%mm1, %%mm0 ) 153 :: "r"(block) 154 ); 155} 156 157static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride) 158{ 159 int i; 160 int16_t __attribute__ ((aligned(8))) b2[64]; 161 162 block[0] += 32; 163 164 for(i=0; i<2; i++){ 165 DECLARE_ALIGNED_8(uint64_t, tmp); 166 167 h264_idct8_1d(block+4*i); 168 169 __asm__ volatile( 170 "movq %%mm7, %0 \n\t" 171 TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 ) 172 "movq %%mm0, 8(%1) \n\t" 173 "movq %%mm6, 24(%1) \n\t" 174 "movq %%mm7, 40(%1) \n\t" 175 "movq %%mm4, 56(%1) \n\t" 176 "movq %0, %%mm7 \n\t" 177 TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 ) 178 "movq %%mm7, (%1) \n\t" 179 "movq %%mm1, 16(%1) \n\t" 180 "movq %%mm0, 32(%1) \n\t" 181 "movq %%mm3, 48(%1) \n\t" 182 : "=m"(tmp) 183 : "r"(b2+32*i) 184 : "memory" 185 ); 186 } 187 188 for(i=0; i<2; i++){ 189 h264_idct8_1d(b2+4*i); 190 191 __asm__ volatile( 192 "psraw $6, %%mm7 \n\t" 193 "psraw $6, %%mm6 \n\t" 194 "psraw $6, %%mm5 \n\t" 195 "psraw $6, %%mm4 \n\t" 196 "psraw $6, %%mm3 \n\t" 197 "psraw $6, %%mm2 \n\t" 198 "psraw $6, %%mm1 \n\t" 199 "psraw $6, %%mm0 \n\t" 200 201 "movq %%mm7, (%0) \n\t" 202 "movq %%mm5, 16(%0) \n\t" 203 "movq %%mm3, 32(%0) \n\t" 204 "movq %%mm1, 48(%0) \n\t" 205 "movq %%mm0, 64(%0) \n\t" 206 "movq %%mm2, 80(%0) \n\t" 207 "movq %%mm4, 96(%0) \n\t" 208 "movq %%mm6, 112(%0) \n\t" 209 :: "r"(b2+4*i) 210 : "memory" 211 ); 212 } 213 214 add_pixels_clamped_mmx(b2, dst, stride); 215} 216 217#define STORE_DIFF_8P( p, d, t, z )\ 218 "movq "#d", "#t" \n"\ 219 "psraw $6, "#p" \n"\ 220 "punpcklbw "#z", "#t" \n"\ 221 "paddsw "#t", "#p" \n"\ 222 "packuswb "#p", "#p" \n"\ 223 "movq "#p", "#d" \n" 224 225#define H264_IDCT8_1D_SSE2(a,b,c,d,e,f,g,h)\ 226 "movdqa "#c", "#a" \n"\ 227 "movdqa "#g", "#e" \n"\ 228 "psraw $1, "#c" \n"\ 229 "psraw $1, "#g" \n"\ 230 "psubw "#e", "#c" \n"\ 231 "paddw "#a", "#g" \n"\ 232 "movdqa "#b", "#e" \n"\ 233 "psraw $1, "#e" \n"\ 234 "paddw "#b", "#e" \n"\ 235 "paddw "#d", "#e" \n"\ 236 "paddw "#f", "#e" \n"\ 237 "movdqa "#f", "#a" \n"\ 238 "psraw $1, "#a" \n"\ 239 "paddw "#f", "#a" \n"\ 240 "paddw "#h", "#a" \n"\ 241 "psubw "#b", "#a" \n"\ 242 "psubw "#d", "#b" \n"\ 243 "psubw "#d", "#f" \n"\ 244 "paddw "#h", "#b" \n"\ 245 "psubw "#h", "#f" \n"\ 246 "psraw $1, "#d" \n"\ 247 "psraw $1, "#h" \n"\ 248 "psubw "#d", "#b" \n"\ 249 "psubw "#h", "#f" \n"\ 250 "movdqa "#e", "#d" \n"\ 251 "movdqa "#a", "#h" \n"\ 252 "psraw $2, "#d" \n"\ 253 "psraw $2, "#h" \n"\ 254 "paddw "#f", "#d" \n"\ 255 "paddw "#b", "#h" \n"\ 256 "psraw $2, "#f" \n"\ 257 "psraw $2, "#b" \n"\ 258 "psubw "#f", "#e" \n"\ 259 "psubw "#a", "#b" \n"\ 260 "movdqa 0x00(%1), "#a" \n"\ 261 "movdqa 0x40(%1), "#f" \n"\ 262 SUMSUB_BA(f, a)\ 263 SUMSUB_BA(g, f)\ 264 SUMSUB_BA(c, a)\ 265 SUMSUB_BA(e, g)\ 266 SUMSUB_BA(b, c)\ 267 SUMSUB_BA(h, a)\ 268 SUMSUB_BA(d, f) 269 270static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride) 271{ 272 __asm__ volatile( 273 "movdqa 0x10(%1), %%xmm1 \n" 274 "movdqa 0x20(%1), %%xmm2 \n" 275 "movdqa 0x30(%1), %%xmm3 \n" 276 "movdqa 0x50(%1), %%xmm5 \n" 277 "movdqa 0x60(%1), %%xmm6 \n" 278 "movdqa 0x70(%1), %%xmm7 \n" 279 H264_IDCT8_1D_SSE2(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7) 280 TRANSPOSE8(%%xmm4, %%xmm1, %%xmm7, %%xmm3, %%xmm5, %%xmm0, %%xmm2, %%xmm6, (%1)) 281 "paddw %4, %%xmm4 \n" 282 "movdqa %%xmm4, 0x00(%1) \n" 283 "movdqa %%xmm2, 0x40(%1) \n" 284 H264_IDCT8_1D_SSE2(%%xmm4, %%xmm0, %%xmm6, %%xmm3, %%xmm2, %%xmm5, %%xmm7, %%xmm1) 285 "movdqa %%xmm6, 0x60(%1) \n" 286 "movdqa %%xmm7, 0x70(%1) \n" 287 "pxor %%xmm7, %%xmm7 \n" 288 STORE_DIFF_8P(%%xmm2, (%0), %%xmm6, %%xmm7) 289 STORE_DIFF_8P(%%xmm0, (%0,%2), %%xmm6, %%xmm7) 290 STORE_DIFF_8P(%%xmm1, (%0,%2,2), %%xmm6, %%xmm7) 291 STORE_DIFF_8P(%%xmm3, (%0,%3), %%xmm6, %%xmm7) 292 "lea (%0,%2,4), %0 \n" 293 STORE_DIFF_8P(%%xmm5, (%0), %%xmm6, %%xmm7) 294 STORE_DIFF_8P(%%xmm4, (%0,%2), %%xmm6, %%xmm7) 295 "movdqa 0x60(%1), %%xmm0 \n" 296 "movdqa 0x70(%1), %%xmm1 \n" 297 STORE_DIFF_8P(%%xmm0, (%0,%2,2), %%xmm6, %%xmm7) 298 STORE_DIFF_8P(%%xmm1, (%0,%3), %%xmm6, %%xmm7) 299 :"+r"(dst) 300 :"r"(block), "r"((x86_reg)stride), "r"((x86_reg)3L*stride), "m"(ff_pw_32) 301 ); 302} 303 304static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) 305{ 306 int dc = (block[0] + 32) >> 6; 307 __asm__ volatile( 308 "movd %0, %%mm0 \n\t" 309 "pshufw $0, %%mm0, %%mm0 \n\t" 310 "pxor %%mm1, %%mm1 \n\t" 311 "psubw %%mm0, %%mm1 \n\t" 312 "packuswb %%mm0, %%mm0 \n\t" 313 "packuswb %%mm1, %%mm1 \n\t" 314 ::"r"(dc) 315 ); 316 __asm__ volatile( 317 "movd %0, %%mm2 \n\t" 318 "movd %1, %%mm3 \n\t" 319 "movd %2, %%mm4 \n\t" 320 "movd %3, %%mm5 \n\t" 321 "paddusb %%mm0, %%mm2 \n\t" 322 "paddusb %%mm0, %%mm3 \n\t" 323 "paddusb %%mm0, %%mm4 \n\t" 324 "paddusb %%mm0, %%mm5 \n\t" 325 "psubusb %%mm1, %%mm2 \n\t" 326 "psubusb %%mm1, %%mm3 \n\t" 327 "psubusb %%mm1, %%mm4 \n\t" 328 "psubusb %%mm1, %%mm5 \n\t" 329 "movd %%mm2, %0 \n\t" 330 "movd %%mm3, %1 \n\t" 331 "movd %%mm4, %2 \n\t" 332 "movd %%mm5, %3 \n\t" 333 :"+m"(*(uint32_t*)(dst+0*stride)), 334 "+m"(*(uint32_t*)(dst+1*stride)), 335 "+m"(*(uint32_t*)(dst+2*stride)), 336 "+m"(*(uint32_t*)(dst+3*stride)) 337 ); 338} 339 340static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) 341{ 342 int dc = (block[0] + 32) >> 6; 343 int y; 344 __asm__ volatile( 345 "movd %0, %%mm0 \n\t" 346 "pshufw $0, %%mm0, %%mm0 \n\t" 347 "pxor %%mm1, %%mm1 \n\t" 348 "psubw %%mm0, %%mm1 \n\t" 349 "packuswb %%mm0, %%mm0 \n\t" 350 "packuswb %%mm1, %%mm1 \n\t" 351 ::"r"(dc) 352 ); 353 for(y=2; y--; dst += 4*stride){ 354 __asm__ volatile( 355 "movq %0, %%mm2 \n\t" 356 "movq %1, %%mm3 \n\t" 357 "movq %2, %%mm4 \n\t" 358 "movq %3, %%mm5 \n\t" 359 "paddusb %%mm0, %%mm2 \n\t" 360 "paddusb %%mm0, %%mm3 \n\t" 361 "paddusb %%mm0, %%mm4 \n\t" 362 "paddusb %%mm0, %%mm5 \n\t" 363 "psubusb %%mm1, %%mm2 \n\t" 364 "psubusb %%mm1, %%mm3 \n\t" 365 "psubusb %%mm1, %%mm4 \n\t" 366 "psubusb %%mm1, %%mm5 \n\t" 367 "movq %%mm2, %0 \n\t" 368 "movq %%mm3, %1 \n\t" 369 "movq %%mm4, %2 \n\t" 370 "movq %%mm5, %3 \n\t" 371 :"+m"(*(uint64_t*)(dst+0*stride)), 372 "+m"(*(uint64_t*)(dst+1*stride)), 373 "+m"(*(uint64_t*)(dst+2*stride)), 374 "+m"(*(uint64_t*)(dst+3*stride)) 375 ); 376 } 377} 378 379//FIXME this table is a duplicate from h264data.h, and will be removed once the tables from, h264 have been split 380static const uint8_t scan8[16 + 2*4]={ 381 4+1*8, 5+1*8, 4+2*8, 5+2*8, 382 6+1*8, 7+1*8, 6+2*8, 7+2*8, 383 4+3*8, 5+3*8, 4+4*8, 5+4*8, 384 6+3*8, 7+3*8, 6+4*8, 7+4*8, 385 1+1*8, 2+1*8, 386 1+2*8, 2+2*8, 387 1+4*8, 2+4*8, 388 1+5*8, 2+5*8, 389}; 390 391static void ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 392 int i; 393 for(i=0; i<16; i++){ 394 if(nnzc[ scan8[i] ]) 395 ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride); 396 } 397} 398 399static void ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 400 int i; 401 for(i=0; i<16; i+=4){ 402 if(nnzc[ scan8[i] ]) 403 ff_h264_idct8_add_mmx(dst + block_offset[i], block + i*16, stride); 404 } 405} 406 407 408static void ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 409 int i; 410 for(i=0; i<16; i++){ 411 int nnz = nnzc[ scan8[i] ]; 412 if(nnz){ 413 if(nnz==1 && block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); 414 else ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride); 415 } 416 } 417} 418 419static void ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 420 int i; 421 for(i=0; i<16; i++){ 422 if(nnzc[ scan8[i] ] || block[i*16]) 423 ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride); 424 } 425} 426 427static void ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 428 int i; 429 for(i=0; i<16; i++){ 430 if(nnzc[ scan8[i] ]) ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride); 431 else if(block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); 432 } 433} 434 435static void ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 436 int i; 437 for(i=0; i<16; i+=4){ 438 int nnz = nnzc[ scan8[i] ]; 439 if(nnz){ 440 if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); 441 else ff_h264_idct8_add_mmx (dst + block_offset[i], block + i*16, stride); 442 } 443 } 444} 445 446static void ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 447 int i; 448 for(i=0; i<16; i+=4){ 449 int nnz = nnzc[ scan8[i] ]; 450 if(nnz){ 451 if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); 452 else ff_h264_idct8_add_sse2 (dst + block_offset[i], block + i*16, stride); 453 } 454 } 455} 456 457static void ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 458 int i; 459 for(i=16; i<16+8; i++){ 460 if(nnzc[ scan8[i] ] || block[i*16]) 461 ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride); 462 } 463} 464 465static void ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 466 int i; 467 for(i=16; i<16+8; i++){ 468 if(nnzc[ scan8[i] ]) 469 ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride); 470 else if(block[i*16]) 471 ff_h264_idct_dc_add_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride); 472 } 473} 474 475#if CONFIG_GPL && HAVE_YASM 476static void ff_h264_idct_dc_add8_mmx2(uint8_t *dst, int16_t *block, int stride) 477{ 478 __asm__ volatile( 479 "movd %0, %%mm0 \n\t" // 0 0 X D 480 "punpcklwd %1, %%mm0 \n\t" // x X d D 481 "paddsw %2, %%mm0 \n\t" 482 "psraw $6, %%mm0 \n\t" 483 "punpcklwd %%mm0, %%mm0 \n\t" // d d D D 484 "pxor %%mm1, %%mm1 \n\t" // 0 0 0 0 485 "psubw %%mm0, %%mm1 \n\t" // -d-d-D-D 486 "packuswb %%mm1, %%mm0 \n\t" // -d-d-D-D d d D D 487 "pshufw $0xFA, %%mm0, %%mm1 \n\t" // -d-d-d-d-D-D-D-D 488 "punpcklwd %%mm0, %%mm0 \n\t" // d d d d D D D D 489 ::"m"(block[ 0]), 490 "m"(block[16]), 491 "m"(ff_pw_32) 492 ); 493 __asm__ volatile( 494 "movq %0, %%mm2 \n\t" 495 "movq %1, %%mm3 \n\t" 496 "movq %2, %%mm4 \n\t" 497 "movq %3, %%mm5 \n\t" 498 "paddusb %%mm0, %%mm2 \n\t" 499 "paddusb %%mm0, %%mm3 \n\t" 500 "paddusb %%mm0, %%mm4 \n\t" 501 "paddusb %%mm0, %%mm5 \n\t" 502 "psubusb %%mm1, %%mm2 \n\t" 503 "psubusb %%mm1, %%mm3 \n\t" 504 "psubusb %%mm1, %%mm4 \n\t" 505 "psubusb %%mm1, %%mm5 \n\t" 506 "movq %%mm2, %0 \n\t" 507 "movq %%mm3, %1 \n\t" 508 "movq %%mm4, %2 \n\t" 509 "movq %%mm5, %3 \n\t" 510 :"+m"(*(uint64_t*)(dst+0*stride)), 511 "+m"(*(uint64_t*)(dst+1*stride)), 512 "+m"(*(uint64_t*)(dst+2*stride)), 513 "+m"(*(uint64_t*)(dst+3*stride)) 514 ); 515} 516 517extern void ff_x264_add8x4_idct_sse2(uint8_t *dst, int16_t *block, int stride); 518 519static void ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 520 int i; 521 for(i=0; i<16; i+=2) 522 if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ]) 523 ff_x264_add8x4_idct_sse2 (dst + block_offset[i], block + i*16, stride); 524} 525 526static void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 527 int i; 528 for(i=0; i<16; i+=2){ 529 if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ]) 530 ff_x264_add8x4_idct_sse2 (dst + block_offset[i], block + i*16, stride); 531 else if(block[i*16]|block[i*16+16]) 532 ff_h264_idct_dc_add8_mmx2(dst + block_offset[i], block + i*16, stride); 533 } 534} 535 536static void ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 537 int i; 538 for(i=16; i<16+8; i+=2){ 539 if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ]) 540 ff_x264_add8x4_idct_sse2 (dest[(i&4)>>2] + block_offset[i], block + i*16, stride); 541 else if(block[i*16]|block[i*16+16]) 542 ff_h264_idct_dc_add8_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride); 543 } 544} 545#endif 546 547/***********************************/ 548/* deblocking */ 549 550// out: o = |x-y|>a 551// clobbers: t 552#define DIFF_GT_MMX(x,y,a,o,t)\ 553 "movq "#y", "#t" \n\t"\ 554 "movq "#x", "#o" \n\t"\ 555 "psubusb "#x", "#t" \n\t"\ 556 "psubusb "#y", "#o" \n\t"\ 557 "por "#t", "#o" \n\t"\ 558 "psubusb "#a", "#o" \n\t" 559 560// out: o = |x-y|>a 561// clobbers: t 562#define DIFF_GT2_MMX(x,y,a,o,t)\ 563 "movq "#y", "#t" \n\t"\ 564 "movq "#x", "#o" \n\t"\ 565 "psubusb "#x", "#t" \n\t"\ 566 "psubusb "#y", "#o" \n\t"\ 567 "psubusb "#a", "#t" \n\t"\ 568 "psubusb "#a", "#o" \n\t"\ 569 "pcmpeqb "#t", "#o" \n\t"\ 570 571// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 572// out: mm5=beta-1, mm7=mask 573// clobbers: mm4,mm6 574#define H264_DEBLOCK_MASK(alpha1, beta1) \ 575 "pshufw $0, "#alpha1", %%mm4 \n\t"\ 576 "pshufw $0, "#beta1 ", %%mm5 \n\t"\ 577 "packuswb %%mm4, %%mm4 \n\t"\ 578 "packuswb %%mm5, %%mm5 \n\t"\ 579 DIFF_GT_MMX(%%mm1, %%mm2, %%mm4, %%mm7, %%mm6) /* |p0-q0| > alpha-1 */\ 580 DIFF_GT_MMX(%%mm0, %%mm1, %%mm5, %%mm4, %%mm6) /* |p1-p0| > beta-1 */\ 581 "por %%mm4, %%mm7 \n\t"\ 582 DIFF_GT_MMX(%%mm3, %%mm2, %%mm5, %%mm4, %%mm6) /* |q1-q0| > beta-1 */\ 583 "por %%mm4, %%mm7 \n\t"\ 584 "pxor %%mm6, %%mm6 \n\t"\ 585 "pcmpeqb %%mm6, %%mm7 \n\t" 586 587// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) 588// out: mm1=p0' mm2=q0' 589// clobbers: mm0,3-6 590#define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\ 591 "movq %%mm1 , %%mm5 \n\t"\ 592 "pxor %%mm2 , %%mm5 \n\t" /* p0^q0*/\ 593 "pand "#pb_01" , %%mm5 \n\t" /* (p0^q0)&1*/\ 594 "pcmpeqb %%mm4 , %%mm4 \n\t"\ 595 "pxor %%mm4 , %%mm3 \n\t"\ 596 "pavgb %%mm0 , %%mm3 \n\t" /* (p1 - q1 + 256)>>1*/\ 597 "pavgb "MANGLE(ff_pb_3)" , %%mm3 \n\t" /*(((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2*/\ 598 "pxor %%mm1 , %%mm4 \n\t"\ 599 "pavgb %%mm2 , %%mm4 \n\t" /* (q0 - p0 + 256)>>1*/\ 600 "pavgb %%mm5 , %%mm3 \n\t"\ 601 "paddusb %%mm4 , %%mm3 \n\t" /* d+128+33*/\ 602 "movq "MANGLE(ff_pb_A1)" , %%mm6 \n\t"\ 603 "psubusb %%mm3 , %%mm6 \n\t"\ 604 "psubusb "MANGLE(ff_pb_A1)" , %%mm3 \n\t"\ 605 "pminub %%mm7 , %%mm6 \n\t"\ 606 "pminub %%mm7 , %%mm3 \n\t"\ 607 "psubusb %%mm6 , %%mm1 \n\t"\ 608 "psubusb %%mm3 , %%mm2 \n\t"\ 609 "paddusb %%mm3 , %%mm1 \n\t"\ 610 "paddusb %%mm6 , %%mm2 \n\t" 611 612// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) %8=ff_bone 613// out: (q1addr) = av_clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) 614// clobbers: q2, tmp, tc0 615#define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\ 616 "movq %%mm1, "#tmp" \n\t"\ 617 "pavgb %%mm2, "#tmp" \n\t"\ 618 "pavgb "#tmp", "#q2" \n\t" /* avg(p2,avg(p0,q0)) */\ 619 "pxor "q2addr", "#tmp" \n\t"\ 620 "pand %8, "#tmp" \n\t" /* (p2^avg(p0,q0))&1 */\ 621 "psubusb "#tmp", "#q2" \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\ 622 "movq "#p1", "#tmp" \n\t"\ 623 "psubusb "#tc0", "#tmp" \n\t"\ 624 "paddusb "#p1", "#tc0" \n\t"\ 625 "pmaxub "#tmp", "#q2" \n\t"\ 626 "pminub "#tc0", "#q2" \n\t"\ 627 "movq "#q2", "q1addr" \n\t" 628 629static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) 630{ 631 DECLARE_ALIGNED_8(uint64_t, tmp0[2]); 632 633 __asm__ volatile( 634 "movq (%1,%3), %%mm0 \n\t" //p1 635 "movq (%1,%3,2), %%mm1 \n\t" //p0 636 "movq (%2), %%mm2 \n\t" //q0 637 "movq (%2,%3), %%mm3 \n\t" //q1 638 H264_DEBLOCK_MASK(%6, %7) 639 640 "movd %5, %%mm4 \n\t" 641 "punpcklbw %%mm4, %%mm4 \n\t" 642 "punpcklwd %%mm4, %%mm4 \n\t" 643 "pcmpeqb %%mm3, %%mm3 \n\t" 644 "movq %%mm4, %%mm6 \n\t" 645 "pcmpgtb %%mm3, %%mm4 \n\t" 646 "movq %%mm6, 8+%0 \n\t" 647 "pand %%mm4, %%mm7 \n\t" 648 "movq %%mm7, %0 \n\t" 649 650 /* filter p1 */ 651 "movq (%1), %%mm3 \n\t" //p2 652 DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1 653 "pand %%mm7, %%mm6 \n\t" // mask & |p2-p0|<beta 654 "pand 8+%0, %%mm7 \n\t" // mask & tc0 655 "movq %%mm7, %%mm4 \n\t" 656 "psubb %%mm6, %%mm7 \n\t" 657 "pand %%mm4, %%mm6 \n\t" // mask & |p2-p0|<beta & tc0 658 H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%1)", "(%1,%3)", %%mm6, %%mm4) 659 660 /* filter q1 */ 661 "movq (%2,%3,2), %%mm4 \n\t" //q2 662 DIFF_GT2_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1 663 "pand %0, %%mm6 \n\t" 664 "movq 8+%0, %%mm5 \n\t" // can be merged with the and below but is slower then 665 "pand %%mm6, %%mm5 \n\t" 666 "psubb %%mm6, %%mm7 \n\t" 667 "movq (%2,%3), %%mm3 \n\t" 668 H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%2,%3,2)", "(%2,%3)", %%mm5, %%mm6) 669 670 /* filter p0, q0 */ 671 H264_DEBLOCK_P0_Q0(%8, unused) 672 "movq %%mm1, (%1,%3,2) \n\t" 673 "movq %%mm2, (%2) \n\t" 674 675 : "=m"(*tmp0) 676 : "r"(pix-3*stride), "r"(pix), "r"((x86_reg)stride), 677 "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1), 678 "m"(ff_bone) 679 ); 680} 681 682static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) 683{ 684 if((tc0[0] & tc0[1]) >= 0) 685 h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0); 686 if((tc0[2] & tc0[3]) >= 0) 687 h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2); 688} 689static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) 690{ 691 //FIXME: could cut some load/stores by merging transpose with filter 692 // also, it only needs to transpose 6x8 693 DECLARE_ALIGNED_8(uint8_t, trans[8*8]); 694 int i; 695 for(i=0; i<2; i++, pix+=8*stride, tc0+=2) { 696 if((tc0[0] & tc0[1]) < 0) 697 continue; 698 transpose4x4(trans, pix-4, 8, stride); 699 transpose4x4(trans +4*8, pix, 8, stride); 700 transpose4x4(trans+4, pix-4+4*stride, 8, stride); 701 transpose4x4(trans+4+4*8, pix +4*stride, 8, stride); 702 h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0); 703 transpose4x4(pix-2, trans +2*8, stride, 8); 704 transpose4x4(pix-2+4*stride, trans+4+2*8, stride, 8); 705 } 706} 707 708static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) 709{ 710 __asm__ volatile( 711 "movq (%0), %%mm0 \n\t" //p1 712 "movq (%0,%2), %%mm1 \n\t" //p0 713 "movq (%1), %%mm2 \n\t" //q0 714 "movq (%1,%2), %%mm3 \n\t" //q1 715 H264_DEBLOCK_MASK(%4, %5) 716 "movd %3, %%mm6 \n\t" 717 "punpcklbw %%mm6, %%mm6 \n\t" 718 "pand %%mm6, %%mm7 \n\t" // mm7 = tc&mask 719 H264_DEBLOCK_P0_Q0(%6, %7) 720 "movq %%mm1, (%0,%2) \n\t" 721 "movq %%mm2, (%1) \n\t" 722 723 :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride), 724 "r"(*(uint32_t*)tc0), 725 "m"(alpha1), "m"(beta1), "m"(ff_bone), "m"(ff_pb_3F) 726 ); 727} 728 729static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) 730{ 731 h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0); 732} 733 734static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) 735{ 736 //FIXME: could cut some load/stores by merging transpose with filter 737 DECLARE_ALIGNED_8(uint8_t, trans[8*4]); 738 transpose4x4(trans, pix-2, 8, stride); 739 transpose4x4(trans+4, pix-2+4*stride, 8, stride); 740 h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0); 741 transpose4x4(pix-2, trans, stride, 8); 742 transpose4x4(pix-2+4*stride, trans+4, stride, 8); 743} 744 745// p0 = (p0 + q1 + 2*p1 + 2) >> 2 746#define H264_FILTER_CHROMA4(p0, p1, q1, one) \ 747 "movq "#p0", %%mm4 \n\t"\ 748 "pxor "#q1", %%mm4 \n\t"\ 749 "pand "#one", %%mm4 \n\t" /* mm4 = (p0^q1)&1 */\ 750 "pavgb "#q1", "#p0" \n\t"\ 751 "psubusb %%mm4, "#p0" \n\t"\ 752 "pavgb "#p1", "#p0" \n\t" /* dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) */\ 753 754static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1) 755{ 756 __asm__ volatile( 757 "movq (%0), %%mm0 \n\t" 758 "movq (%0,%2), %%mm1 \n\t" 759 "movq (%1), %%mm2 \n\t" 760 "movq (%1,%2), %%mm3 \n\t" 761 H264_DEBLOCK_MASK(%3, %4) 762 "movq %%mm1, %%mm5 \n\t" 763 "movq %%mm2, %%mm6 \n\t" 764 H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5) //p0' 765 H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5) //q0' 766 "psubb %%mm5, %%mm1 \n\t" 767 "psubb %%mm6, %%mm2 \n\t" 768 "pand %%mm7, %%mm1 \n\t" 769 "pand %%mm7, %%mm2 \n\t" 770 "paddb %%mm5, %%mm1 \n\t" 771 "paddb %%mm6, %%mm2 \n\t" 772 "movq %%mm1, (%0,%2) \n\t" 773 "movq %%mm2, (%1) \n\t" 774 :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride), 775 "m"(alpha1), "m"(beta1), "m"(ff_bone) 776 ); 777} 778 779static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta) 780{ 781 h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1); 782} 783 784static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta) 785{ 786 //FIXME: could cut some load/stores by merging transpose with filter 787 DECLARE_ALIGNED_8(uint8_t, trans[8*4]); 788 transpose4x4(trans, pix-2, 8, stride); 789 transpose4x4(trans+4, pix-2+4*stride, 8, stride); 790 h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1); 791 transpose4x4(pix-2, trans, stride, 8); 792 transpose4x4(pix-2+4*stride, trans+4, stride, 8); 793} 794 795static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2], 796 int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) { 797 int dir; 798 __asm__ volatile( 799 "pxor %%mm7, %%mm7 \n\t" 800 "movq %0, %%mm6 \n\t" 801 "movq %1, %%mm5 \n\t" 802 "movq %2, %%mm4 \n\t" 803 ::"m"(ff_pb_1), "m"(ff_pb_3), "m"(ff_pb_7) 804 ); 805 if(field) 806 __asm__ volatile( 807 "movq %0, %%mm5 \n\t" 808 "movq %1, %%mm4 \n\t" 809 ::"m"(ff_pb_3_1), "m"(ff_pb_7_3) 810 ); 811 812 // could do a special case for dir==0 && edges==1, but it only reduces the 813 // average filter time by 1.2% 814 for( dir=1; dir>=0; dir-- ) { 815 const int d_idx = dir ? -8 : -1; 816 const int mask_mv = dir ? mask_mv1 : mask_mv0; 817 DECLARE_ALIGNED_8(const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL; 818 int b_idx, edge, l; 819 for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) { 820 __asm__ volatile( 821 "pand %0, %%mm0 \n\t" 822 ::"m"(mask_dir) 823 ); 824 if(!(mask_mv & edge)) { 825 __asm__ volatile("pxor %%mm0, %%mm0 \n\t":); 826 for( l = bidir; l >= 0; l-- ) { 827 __asm__ volatile( 828 "movd %0, %%mm1 \n\t" 829 "punpckldq %1, %%mm1 \n\t" 830 "movq %%mm1, %%mm2 \n\t" 831 "psrlw $7, %%mm2 \n\t" 832 "pand %%mm6, %%mm2 \n\t" 833 "por %%mm2, %%mm1 \n\t" // ref_cache with -2 mapped to -1 834 "punpckldq %%mm1, %%mm2 \n\t" 835 "pcmpeqb %%mm2, %%mm1 \n\t" 836 "paddb %%mm6, %%mm1 \n\t" 837 "punpckhbw %%mm7, %%mm1 \n\t" // ref[b] != ref[bn] 838 "por %%mm1, %%mm0 \n\t" 839 840 "movq %2, %%mm1 \n\t" 841 "movq %3, %%mm2 \n\t" 842 "psubw %4, %%mm1 \n\t" 843 "psubw %5, %%mm2 \n\t" 844 "packsswb %%mm2, %%mm1 \n\t" 845 "paddb %%mm5, %%mm1 \n\t" 846 "pminub %%mm4, %%mm1 \n\t" 847 "pcmpeqb %%mm4, %%mm1 \n\t" // abs(mv[b] - mv[bn]) >= limit 848 "por %%mm1, %%mm0 \n\t" 849 ::"m"(ref[l][b_idx]), 850 "m"(ref[l][b_idx+d_idx]), 851 "m"(mv[l][b_idx][0]), 852 "m"(mv[l][b_idx+2][0]), 853 "m"(mv[l][b_idx+d_idx][0]), 854 "m"(mv[l][b_idx+d_idx+2][0]) 855 ); 856 } 857 } 858 __asm__ volatile( 859 "movd %0, %%mm1 \n\t" 860 "por %1, %%mm1 \n\t" 861 "punpcklbw %%mm7, %%mm1 \n\t" 862 "pcmpgtw %%mm7, %%mm1 \n\t" // nnz[b] || nnz[bn] 863 ::"m"(nnz[b_idx]), 864 "m"(nnz[b_idx+d_idx]) 865 ); 866 __asm__ volatile( 867 "pcmpeqw %%mm7, %%mm0 \n\t" 868 "pcmpeqw %%mm7, %%mm0 \n\t" 869 "psrlw $15, %%mm0 \n\t" // nonzero -> 1 870 "psrlw $14, %%mm1 \n\t" 871 "movq %%mm0, %%mm2 \n\t" 872 "por %%mm1, %%mm2 \n\t" 873 "psrlw $1, %%mm1 \n\t" 874 "pandn %%mm2, %%mm1 \n\t" 875 "movq %%mm1, %0 \n\t" 876 :"=m"(*bS[dir][edge]) 877 ::"memory" 878 ); 879 } 880 edges = 4; 881 step = 1; 882 } 883 __asm__ volatile( 884 "movq (%0), %%mm0 \n\t" 885 "movq 8(%0), %%mm1 \n\t" 886 "movq 16(%0), %%mm2 \n\t" 887 "movq 24(%0), %%mm3 \n\t" 888 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4) 889 "movq %%mm0, (%0) \n\t" 890 "movq %%mm3, 8(%0) \n\t" 891 "movq %%mm4, 16(%0) \n\t" 892 "movq %%mm2, 24(%0) \n\t" 893 ::"r"(bS[0]) 894 :"memory" 895 ); 896} 897 898/***********************************/ 899/* motion compensation */ 900 901#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\ 902 "mov"#q" "#C", "#T" \n\t"\ 903 "mov"#d" (%0), "#F" \n\t"\ 904 "paddw "#D", "#T" \n\t"\ 905 "psllw $2, "#T" \n\t"\ 906 "psubw "#B", "#T" \n\t"\ 907 "psubw "#E", "#T" \n\t"\ 908 "punpcklbw "#Z", "#F" \n\t"\ 909 "pmullw %4, "#T" \n\t"\ 910 "paddw %5, "#A" \n\t"\ 911 "add %2, %0 \n\t"\ 912 "paddw "#F", "#A" \n\t"\ 913 "paddw "#A", "#T" \n\t"\ 914 "psraw $5, "#T" \n\t"\ 915 "packuswb "#T", "#T" \n\t"\ 916 OP(T, (%1), A, d)\ 917 "add %3, %1 \n\t" 918 919#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\ 920 "mov"#q" "#C", "#T" \n\t"\ 921 "mov"#d" (%0), "#F" \n\t"\ 922 "paddw "#D", "#T" \n\t"\ 923 "psllw $2, "#T" \n\t"\ 924 "paddw %4, "#A" \n\t"\ 925 "psubw "#B", "#T" \n\t"\ 926 "psubw "#E", "#T" \n\t"\ 927 "punpcklbw "#Z", "#F" \n\t"\ 928 "pmullw %3, "#T" \n\t"\ 929 "paddw "#F", "#A" \n\t"\ 930 "add %2, %0 \n\t"\ 931 "paddw "#A", "#T" \n\t"\ 932 "mov"#q" "#T", "#OF"(%1) \n\t" 933 934#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q) 935#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q) 936#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa) 937#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa) 938 939 940#define QPEL_H264(OPNAME, OP, MMX)\ 941static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 942 int h=4;\ 943\ 944 __asm__ volatile(\ 945 "pxor %%mm7, %%mm7 \n\t"\ 946 "movq %5, %%mm4 \n\t"\ 947 "movq %6, %%mm5 \n\t"\ 948 "1: \n\t"\ 949 "movd -1(%0), %%mm1 \n\t"\ 950 "movd (%0), %%mm2 \n\t"\ 951 "movd 1(%0), %%mm3 \n\t"\ 952 "movd 2(%0), %%mm0 \n\t"\ 953 "punpcklbw %%mm7, %%mm1 \n\t"\ 954 "punpcklbw %%mm7, %%mm2 \n\t"\ 955 "punpcklbw %%mm7, %%mm3 \n\t"\ 956 "punpcklbw %%mm7, %%mm0 \n\t"\ 957 "paddw %%mm0, %%mm1 \n\t"\ 958 "paddw %%mm3, %%mm2 \n\t"\ 959 "movd -2(%0), %%mm0 \n\t"\ 960 "movd 3(%0), %%mm3 \n\t"\ 961 "punpcklbw %%mm7, %%mm0 \n\t"\ 962 "punpcklbw %%mm7, %%mm3 \n\t"\ 963 "paddw %%mm3, %%mm0 \n\t"\ 964 "psllw $2, %%mm2 \n\t"\ 965 "psubw %%mm1, %%mm2 \n\t"\ 966 "pmullw %%mm4, %%mm2 \n\t"\ 967 "paddw %%mm5, %%mm0 \n\t"\ 968 "paddw %%mm2, %%mm0 \n\t"\ 969 "psraw $5, %%mm0 \n\t"\ 970 "packuswb %%mm0, %%mm0 \n\t"\ 971 OP(%%mm0, (%1),%%mm6, d)\ 972 "add %3, %0 \n\t"\ 973 "add %4, %1 \n\t"\ 974 "decl %2 \n\t"\ 975 " jnz 1b \n\t"\ 976 : "+a"(src), "+c"(dst), "+g"(h)\ 977 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ 978 : "memory"\ 979 );\ 980}\ 981static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ 982 int h=4;\ 983 __asm__ volatile(\ 984 "pxor %%mm7, %%mm7 \n\t"\ 985 "movq %0, %%mm4 \n\t"\ 986 "movq %1, %%mm5 \n\t"\ 987 :: "m"(ff_pw_5), "m"(ff_pw_16)\ 988 );\ 989 do{\ 990 __asm__ volatile(\ 991 "movd -1(%0), %%mm1 \n\t"\ 992 "movd (%0), %%mm2 \n\t"\ 993 "movd 1(%0), %%mm3 \n\t"\ 994 "movd 2(%0), %%mm0 \n\t"\ 995 "punpcklbw %%mm7, %%mm1 \n\t"\ 996 "punpcklbw %%mm7, %%mm2 \n\t"\ 997 "punpcklbw %%mm7, %%mm3 \n\t"\ 998 "punpcklbw %%mm7, %%mm0 \n\t"\ 999 "paddw %%mm0, %%mm1 \n\t"\ 1000 "paddw %%mm3, %%mm2 \n\t"\ 1001 "movd -2(%0), %%mm0 \n\t"\ 1002 "movd 3(%0), %%mm3 \n\t"\ 1003 "punpcklbw %%mm7, %%mm0 \n\t"\ 1004 "punpcklbw %%mm7, %%mm3 \n\t"\ 1005 "paddw %%mm3, %%mm0 \n\t"\ 1006 "psllw $2, %%mm2 \n\t"\ 1007 "psubw %%mm1, %%mm2 \n\t"\ 1008 "pmullw %%mm4, %%mm2 \n\t"\ 1009 "paddw %%mm5, %%mm0 \n\t"\ 1010 "paddw %%mm2, %%mm0 \n\t"\ 1011 "movd (%2), %%mm3 \n\t"\ 1012 "psraw $5, %%mm0 \n\t"\ 1013 "packuswb %%mm0, %%mm0 \n\t"\ 1014 PAVGB" %%mm3, %%mm0 \n\t"\ 1015 OP(%%mm0, (%1),%%mm6, d)\ 1016 "add %4, %0 \n\t"\ 1017 "add %4, %1 \n\t"\ 1018 "add %3, %2 \n\t"\ 1019 : "+a"(src), "+c"(dst), "+d"(src2)\ 1020 : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\ 1021 : "memory"\ 1022 );\ 1023 }while(--h);\ 1024}\ 1025static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 1026 src -= 2*srcStride;\ 1027 __asm__ volatile(\ 1028 "pxor %%mm7, %%mm7 \n\t"\ 1029 "movd (%0), %%mm0 \n\t"\ 1030 "add %2, %0 \n\t"\ 1031 "movd (%0), %%mm1 \n\t"\ 1032 "add %2, %0 \n\t"\ 1033 "movd (%0), %%mm2 \n\t"\ 1034 "add %2, %0 \n\t"\ 1035 "movd (%0), %%mm3 \n\t"\ 1036 "add %2, %0 \n\t"\ 1037 "movd (%0), %%mm4 \n\t"\ 1038 "add %2, %0 \n\t"\ 1039 "punpcklbw %%mm7, %%mm0 \n\t"\ 1040 "punpcklbw %%mm7, %%mm1 \n\t"\ 1041 "punpcklbw %%mm7, %%mm2 \n\t"\ 1042 "punpcklbw %%mm7, %%mm3 \n\t"\ 1043 "punpcklbw %%mm7, %%mm4 \n\t"\ 1044 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ 1045 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ 1046 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ 1047 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ 1048 \ 1049 : "+a"(src), "+c"(dst)\ 1050 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ 1051 : "memory"\ 1052 );\ 1053}\ 1054static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ 1055 int h=4;\ 1056 int w=3;\ 1057 src -= 2*srcStride+2;\ 1058 while(w--){\ 1059 __asm__ volatile(\ 1060 "pxor %%mm7, %%mm7 \n\t"\ 1061 "movd (%0), %%mm0 \n\t"\ 1062 "add %2, %0 \n\t"\ 1063 "movd (%0), %%mm1 \n\t"\ 1064 "add %2, %0 \n\t"\ 1065 "movd (%0), %%mm2 \n\t"\ 1066 "add %2, %0 \n\t"\ 1067 "movd (%0), %%mm3 \n\t"\ 1068 "add %2, %0 \n\t"\ 1069 "movd (%0), %%mm4 \n\t"\ 1070 "add %2, %0 \n\t"\ 1071 "punpcklbw %%mm7, %%mm0 \n\t"\ 1072 "punpcklbw %%mm7, %%mm1 \n\t"\ 1073 "punpcklbw %%mm7, %%mm2 \n\t"\ 1074 "punpcklbw %%mm7, %%mm3 \n\t"\ 1075 "punpcklbw %%mm7, %%mm4 \n\t"\ 1076 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\ 1077 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\ 1078 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\ 1079 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\ 1080 \ 1081 : "+a"(src)\ 1082 : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\ 1083 : "memory"\ 1084 );\ 1085 tmp += 4;\ 1086 src += 4 - 9*srcStride;\ 1087 }\ 1088 tmp -= 3*4;\ 1089 __asm__ volatile(\ 1090 "1: \n\t"\ 1091 "movq (%0), %%mm0 \n\t"\ 1092 "paddw 10(%0), %%mm0 \n\t"\ 1093 "movq 2(%0), %%mm1 \n\t"\ 1094 "paddw 8(%0), %%mm1 \n\t"\ 1095 "movq 4(%0), %%mm2 \n\t"\ 1096 "paddw 6(%0), %%mm2 \n\t"\ 1097 "psubw %%mm1, %%mm0 \n\t"/*a-b (abccba)*/\ 1098 "psraw $2, %%mm0 \n\t"/*(a-b)/4 */\ 1099 "psubw %%mm1, %%mm0 \n\t"/*(a-b)/4-b */\ 1100 "paddsw %%mm2, %%mm0 \n\t"\ 1101 "psraw $2, %%mm0 \n\t"/*((a-b)/4-b+c)/4 */\ 1102 "paddw %%mm2, %%mm0 \n\t"/*(a-5*b+20*c)/16 */\ 1103 "psraw $6, %%mm0 \n\t"\ 1104 "packuswb %%mm0, %%mm0 \n\t"\ 1105 OP(%%mm0, (%1),%%mm7, d)\ 1106 "add $24, %0 \n\t"\ 1107 "add %3, %1 \n\t"\ 1108 "decl %2 \n\t"\ 1109 " jnz 1b \n\t"\ 1110 : "+a"(tmp), "+c"(dst), "+g"(h)\ 1111 : "S"((x86_reg)dstStride)\ 1112 : "memory"\ 1113 );\ 1114}\ 1115\ 1116static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 1117 int h=8;\ 1118 __asm__ volatile(\ 1119 "pxor %%mm7, %%mm7 \n\t"\ 1120 "movq %5, %%mm6 \n\t"\ 1121 "1: \n\t"\ 1122 "movq (%0), %%mm0 \n\t"\ 1123 "movq 1(%0), %%mm2 \n\t"\ 1124 "movq %%mm0, %%mm1 \n\t"\ 1125 "movq %%mm2, %%mm3 \n\t"\ 1126 "punpcklbw %%mm7, %%mm0 \n\t"\ 1127 "punpckhbw %%mm7, %%mm1 \n\t"\ 1128 "punpcklbw %%mm7, %%mm2 \n\t"\ 1129 "punpckhbw %%mm7, %%mm3 \n\t"\ 1130 "paddw %%mm2, %%mm0 \n\t"\ 1131 "paddw %%mm3, %%mm1 \n\t"\ 1132 "psllw $2, %%mm0 \n\t"\ 1133 "psllw $2, %%mm1 \n\t"\ 1134 "movq -1(%0), %%mm2 \n\t"\ 1135 "movq 2(%0), %%mm4 \n\t"\ 1136 "movq %%mm2, %%mm3 \n\t"\ 1137 "movq %%mm4, %%mm5 \n\t"\ 1138 "punpcklbw %%mm7, %%mm2 \n\t"\ 1139 "punpckhbw %%mm7, %%mm3 \n\t"\ 1140 "punpcklbw %%mm7, %%mm4 \n\t"\ 1141 "punpckhbw %%mm7, %%mm5 \n\t"\ 1142 "paddw %%mm4, %%mm2 \n\t"\ 1143 "paddw %%mm3, %%mm5 \n\t"\ 1144 "psubw %%mm2, %%mm0 \n\t"\ 1145 "psubw %%mm5, %%mm1 \n\t"\ 1146 "pmullw %%mm6, %%mm0 \n\t"\ 1147 "pmullw %%mm6, %%mm1 \n\t"\ 1148 "movd -2(%0), %%mm2 \n\t"\ 1149 "movd 7(%0), %%mm5 \n\t"\ 1150 "punpcklbw %%mm7, %%mm2 \n\t"\ 1151 "punpcklbw %%mm7, %%mm5 \n\t"\ 1152 "paddw %%mm3, %%mm2 \n\t"\ 1153 "paddw %%mm5, %%mm4 \n\t"\ 1154 "movq %6, %%mm5 \n\t"\ 1155 "paddw %%mm5, %%mm2 \n\t"\ 1156 "paddw %%mm5, %%mm4 \n\t"\ 1157 "paddw %%mm2, %%mm0 \n\t"\ 1158 "paddw %%mm4, %%mm1 \n\t"\ 1159 "psraw $5, %%mm0 \n\t"\ 1160 "psraw $5, %%mm1 \n\t"\ 1161 "packuswb %%mm1, %%mm0 \n\t"\ 1162 OP(%%mm0, (%1),%%mm5, q)\ 1163 "add %3, %0 \n\t"\ 1164 "add %4, %1 \n\t"\ 1165 "decl %2 \n\t"\ 1166 " jnz 1b \n\t"\ 1167 : "+a"(src), "+c"(dst), "+g"(h)\ 1168 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ 1169 : "memory"\ 1170 );\ 1171}\ 1172\ 1173static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ 1174 int h=8;\ 1175 __asm__ volatile(\ 1176 "pxor %%mm7, %%mm7 \n\t"\ 1177 "movq %0, %%mm6 \n\t"\ 1178 :: "m"(ff_pw_5)\ 1179 );\ 1180 do{\ 1181 __asm__ volatile(\ 1182 "movq (%0), %%mm0 \n\t"\ 1183 "movq 1(%0), %%mm2 \n\t"\ 1184 "movq %%mm0, %%mm1 \n\t"\ 1185 "movq %%mm2, %%mm3 \n\t"\ 1186 "punpcklbw %%mm7, %%mm0 \n\t"\ 1187 "punpckhbw %%mm7, %%mm1 \n\t"\ 1188 "punpcklbw %%mm7, %%mm2 \n\t"\ 1189 "punpckhbw %%mm7, %%mm3 \n\t"\ 1190 "paddw %%mm2, %%mm0 \n\t"\ 1191 "paddw %%mm3, %%mm1 \n\t"\ 1192 "psllw $2, %%mm0 \n\t"\ 1193 "psllw $2, %%mm1 \n\t"\ 1194 "movq -1(%0), %%mm2 \n\t"\ 1195 "movq 2(%0), %%mm4 \n\t"\ 1196 "movq %%mm2, %%mm3 \n\t"\ 1197 "movq %%mm4, %%mm5 \n\t"\ 1198 "punpcklbw %%mm7, %%mm2 \n\t"\ 1199 "punpckhbw %%mm7, %%mm3 \n\t"\ 1200 "punpcklbw %%mm7, %%mm4 \n\t"\ 1201 "punpckhbw %%mm7, %%mm5 \n\t"\ 1202 "paddw %%mm4, %%mm2 \n\t"\ 1203 "paddw %%mm3, %%mm5 \n\t"\ 1204 "psubw %%mm2, %%mm0 \n\t"\ 1205 "psubw %%mm5, %%mm1 \n\t"\ 1206 "pmullw %%mm6, %%mm0 \n\t"\ 1207 "pmullw %%mm6, %%mm1 \n\t"\ 1208 "movd -2(%0), %%mm2 \n\t"\ 1209 "movd 7(%0), %%mm5 \n\t"\ 1210 "punpcklbw %%mm7, %%mm2 \n\t"\ 1211 "punpcklbw %%mm7, %%mm5 \n\t"\ 1212 "paddw %%mm3, %%mm2 \n\t"\ 1213 "paddw %%mm5, %%mm4 \n\t"\ 1214 "movq %5, %%mm5 \n\t"\ 1215 "paddw %%mm5, %%mm2 \n\t"\ 1216 "paddw %%mm5, %%mm4 \n\t"\ 1217 "paddw %%mm2, %%mm0 \n\t"\ 1218 "paddw %%mm4, %%mm1 \n\t"\ 1219 "psraw $5, %%mm0 \n\t"\ 1220 "psraw $5, %%mm1 \n\t"\ 1221 "movq (%2), %%mm4 \n\t"\ 1222 "packuswb %%mm1, %%mm0 \n\t"\ 1223 PAVGB" %%mm4, %%mm0 \n\t"\ 1224 OP(%%mm0, (%1),%%mm5, q)\ 1225 "add %4, %0 \n\t"\ 1226 "add %4, %1 \n\t"\ 1227 "add %3, %2 \n\t"\ 1228 : "+a"(src), "+c"(dst), "+d"(src2)\ 1229 : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ 1230 "m"(ff_pw_16)\ 1231 : "memory"\ 1232 );\ 1233 }while(--h);\ 1234}\ 1235\ 1236static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ 1237 int w= 2;\ 1238 src -= 2*srcStride;\ 1239 \ 1240 while(w--){\ 1241 __asm__ volatile(\ 1242 "pxor %%mm7, %%mm7 \n\t"\ 1243 "movd (%0), %%mm0 \n\t"\ 1244 "add %2, %0 \n\t"\ 1245 "movd (%0), %%mm1 \n\t"\ 1246 "add %2, %0 \n\t"\ 1247 "movd (%0), %%mm2 \n\t"\ 1248 "add %2, %0 \n\t"\ 1249 "movd (%0), %%mm3 \n\t"\ 1250 "add %2, %0 \n\t"\ 1251 "movd (%0), %%mm4 \n\t"\ 1252 "add %2, %0 \n\t"\ 1253 "punpcklbw %%mm7, %%mm0 \n\t"\ 1254 "punpcklbw %%mm7, %%mm1 \n\t"\ 1255 "punpcklbw %%mm7, %%mm2 \n\t"\ 1256 "punpcklbw %%mm7, %%mm3 \n\t"\ 1257 "punpcklbw %%mm7, %%mm4 \n\t"\ 1258 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ 1259 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ 1260 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ 1261 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ 1262 QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\ 1263 QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ 1264 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ 1265 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ 1266 \ 1267 : "+a"(src), "+c"(dst)\ 1268 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ 1269 : "memory"\ 1270 );\ 1271 if(h==16){\ 1272 __asm__ volatile(\ 1273 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ 1274 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ 1275 QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\ 1276 QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ 1277 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ 1278 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ 1279 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ 1280 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ 1281 \ 1282 : "+a"(src), "+c"(dst)\ 1283 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ 1284 : "memory"\ 1285 );\ 1286 }\ 1287 src += 4-(h+5)*srcStride;\ 1288 dst += 4-h*dstStride;\ 1289 }\ 1290}\ 1291static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\ 1292 int w = (size+8)>>2;\ 1293 src -= 2*srcStride+2;\ 1294 while(w--){\ 1295 __asm__ volatile(\ 1296 "pxor %%mm7, %%mm7 \n\t"\ 1297 "movd (%0), %%mm0 \n\t"\ 1298 "add %2, %0 \n\t"\ 1299 "movd (%0), %%mm1 \n\t"\ 1300 "add %2, %0 \n\t"\ 1301 "movd (%0), %%mm2 \n\t"\ 1302 "add %2, %0 \n\t"\ 1303 "movd (%0), %%mm3 \n\t"\ 1304 "add %2, %0 \n\t"\ 1305 "movd (%0), %%mm4 \n\t"\ 1306 "add %2, %0 \n\t"\ 1307 "punpcklbw %%mm7, %%mm0 \n\t"\ 1308 "punpcklbw %%mm7, %%mm1 \n\t"\ 1309 "punpcklbw %%mm7, %%mm2 \n\t"\ 1310 "punpcklbw %%mm7, %%mm3 \n\t"\ 1311 "punpcklbw %%mm7, %%mm4 \n\t"\ 1312 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\ 1313 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\ 1314 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\ 1315 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\ 1316 QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\ 1317 QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\ 1318 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\ 1319 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\ 1320 : "+a"(src)\ 1321 : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\ 1322 : "memory"\ 1323 );\ 1324 if(size==16){\ 1325 __asm__ volatile(\ 1326 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\ 1327 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\ 1328 QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\ 1329 QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\ 1330 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\ 1331 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\ 1332 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\ 1333 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\ 1334 : "+a"(src)\ 1335 : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\ 1336 : "memory"\ 1337 );\ 1338 }\ 1339 tmp += 4;\ 1340 src += 4 - (size+5)*srcStride;\ 1341 }\ 1342}\ 1343static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ 1344 int w = size>>4;\ 1345 do{\ 1346 int h = size;\ 1347 __asm__ volatile(\ 1348 "1: \n\t"\ 1349 "movq (%0), %%mm0 \n\t"\ 1350 "movq 8(%0), %%mm3 \n\t"\ 1351 "movq 2(%0), %%mm1 \n\t"\ 1352 "movq 10(%0), %%mm4 \n\t"\ 1353 "paddw %%mm4, %%mm0 \n\t"\ 1354 "paddw %%mm3, %%mm1 \n\t"\ 1355 "paddw 18(%0), %%mm3 \n\t"\ 1356 "paddw 16(%0), %%mm4 \n\t"\ 1357 "movq 4(%0), %%mm2 \n\t"\ 1358 "movq 12(%0), %%mm5 \n\t"\ 1359 "paddw 6(%0), %%mm2 \n\t"\ 1360 "paddw 14(%0), %%mm5 \n\t"\ 1361 "psubw %%mm1, %%mm0 \n\t"\ 1362 "psubw %%mm4, %%mm3 \n\t"\ 1363 "psraw $2, %%mm0 \n\t"\ 1364 "psraw $2, %%mm3 \n\t"\ 1365 "psubw %%mm1, %%mm0 \n\t"\ 1366 "psubw %%mm4, %%mm3 \n\t"\ 1367 "paddsw %%mm2, %%mm0 \n\t"\ 1368 "paddsw %%mm5, %%mm3 \n\t"\ 1369 "psraw $2, %%mm0 \n\t"\ 1370 "psraw $2, %%mm3 \n\t"\ 1371 "paddw %%mm2, %%mm0 \n\t"\ 1372 "paddw %%mm5, %%mm3 \n\t"\ 1373 "psraw $6, %%mm0 \n\t"\ 1374 "psraw $6, %%mm3 \n\t"\ 1375 "packuswb %%mm3, %%mm0 \n\t"\ 1376 OP(%%mm0, (%1),%%mm7, q)\ 1377 "add $48, %0 \n\t"\ 1378 "add %3, %1 \n\t"\ 1379 "decl %2 \n\t"\ 1380 " jnz 1b \n\t"\ 1381 : "+a"(tmp), "+c"(dst), "+g"(h)\ 1382 : "S"((x86_reg)dstStride)\ 1383 : "memory"\ 1384 );\ 1385 tmp += 8 - size*24;\ 1386 dst += 8 - size*dstStride;\ 1387 }while(w--);\ 1388}\ 1389\ 1390static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 1391 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\ 1392}\ 1393static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 1394 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ 1395 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ 1396}\ 1397\ 1398static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 1399 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ 1400 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ 1401 src += 8*srcStride;\ 1402 dst += 8*dstStride;\ 1403 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ 1404 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ 1405}\ 1406\ 1407static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ 1408 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ 1409 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ 1410 src += 8*dstStride;\ 1411 dst += 8*dstStride;\ 1412 src2 += 8*src2Stride;\ 1413 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ 1414 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ 1415}\ 1416\ 1417static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ 1418 put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\ 1419 OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ 1420}\ 1421static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ 1422 OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\ 1423}\ 1424\ 1425static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ 1426 OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\ 1427}\ 1428\ 1429static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ 1430{\ 1431 __asm__ volatile(\ 1432 "movq (%1), %%mm0 \n\t"\ 1433 "movq 24(%1), %%mm1 \n\t"\ 1434 "psraw $5, %%mm0 \n\t"\ 1435 "psraw $5, %%mm1 \n\t"\ 1436 "packuswb %%mm0, %%mm0 \n\t"\ 1437 "packuswb %%mm1, %%mm1 \n\t"\ 1438 PAVGB" (%0), %%mm0 \n\t"\ 1439 PAVGB" (%0,%3), %%mm1 \n\t"\ 1440 OP(%%mm0, (%2), %%mm4, d)\ 1441 OP(%%mm1, (%2,%4), %%mm5, d)\ 1442 "lea (%0,%3,2), %0 \n\t"\ 1443 "lea (%2,%4,2), %2 \n\t"\ 1444 "movq 48(%1), %%mm0 \n\t"\ 1445 "movq 72(%1), %%mm1 \n\t"\ 1446 "psraw $5, %%mm0 \n\t"\ 1447 "psraw $5, %%mm1 \n\t"\ 1448 "packuswb %%mm0, %%mm0 \n\t"\ 1449 "packuswb %%mm1, %%mm1 \n\t"\ 1450 PAVGB" (%0), %%mm0 \n\t"\ 1451 PAVGB" (%0,%3), %%mm1 \n\t"\ 1452 OP(%%mm0, (%2), %%mm4, d)\ 1453 OP(%%mm1, (%2,%4), %%mm5, d)\ 1454 :"+a"(src8), "+c"(src16), "+d"(dst)\ 1455 :"S"((x86_reg)src8Stride), "D"((x86_reg)dstStride)\ 1456 :"memory");\ 1457}\ 1458static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ 1459{\ 1460 do{\ 1461 __asm__ volatile(\ 1462 "movq (%1), %%mm0 \n\t"\ 1463 "movq 8(%1), %%mm1 \n\t"\ 1464 "movq 48(%1), %%mm2 \n\t"\ 1465 "movq 8+48(%1), %%mm3 \n\t"\ 1466 "psraw $5, %%mm0 \n\t"\ 1467 "psraw $5, %%mm1 \n\t"\ 1468 "psraw $5, %%mm2 \n\t"\ 1469 "psraw $5, %%mm3 \n\t"\ 1470 "packuswb %%mm1, %%mm0 \n\t"\ 1471 "packuswb %%mm3, %%mm2 \n\t"\ 1472 PAVGB" (%0), %%mm0 \n\t"\ 1473 PAVGB" (%0,%3), %%mm2 \n\t"\ 1474 OP(%%mm0, (%2), %%mm5, q)\ 1475 OP(%%mm2, (%2,%4), %%mm5, q)\ 1476 ::"a"(src8), "c"(src16), "d"(dst),\ 1477 "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\ 1478 :"memory");\ 1479 src8 += 2L*src8Stride;\ 1480 src16 += 48;\ 1481 dst += 2L*dstStride;\ 1482 }while(h-=2);\ 1483}\ 1484static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ 1485{\ 1486 OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\ 1487 OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\ 1488}\ 1489 1490 1491#if ARCH_X86_64 1492#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ 1493static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ 1494 int h=16;\ 1495 __asm__ volatile(\ 1496 "pxor %%xmm15, %%xmm15 \n\t"\ 1497 "movdqa %6, %%xmm14 \n\t"\ 1498 "movdqa %7, %%xmm13 \n\t"\ 1499 "1: \n\t"\ 1500 "lddqu 3(%0), %%xmm1 \n\t"\ 1501 "lddqu -5(%0), %%xmm7 \n\t"\ 1502 "movdqa %%xmm1, %%xmm0 \n\t"\ 1503 "punpckhbw %%xmm15, %%xmm1 \n\t"\ 1504 "punpcklbw %%xmm15, %%xmm0 \n\t"\ 1505 "punpcklbw %%xmm15, %%xmm7 \n\t"\ 1506 "movdqa %%xmm1, %%xmm2 \n\t"\ 1507 "movdqa %%xmm0, %%xmm6 \n\t"\ 1508 "movdqa %%xmm1, %%xmm3 \n\t"\ 1509 "movdqa %%xmm0, %%xmm8 \n\t"\ 1510 "movdqa %%xmm1, %%xmm4 \n\t"\ 1511 "movdqa %%xmm0, %%xmm9 \n\t"\ 1512 "movdqa %%xmm1, %%xmm5 \n\t"\ 1513 "movdqa %%xmm0, %%xmm10 \n\t"\ 1514 "palignr $6, %%xmm0, %%xmm5 \n\t"\ 1515 "palignr $6, %%xmm7, %%xmm10\n\t"\ 1516 "palignr $8, %%xmm0, %%xmm4 \n\t"\ 1517 "palignr $8, %%xmm7, %%xmm9 \n\t"\ 1518 "palignr $10,%%xmm0, %%xmm3 \n\t"\ 1519 "palignr $10,%%xmm7, %%xmm8 \n\t"\ 1520 "paddw %%xmm1, %%xmm5 \n\t"\ 1521 "paddw %%xmm0, %%xmm10 \n\t"\ 1522 "palignr $12,%%xmm0, %%xmm2 \n\t"\ 1523 "palignr $12,%%xmm7, %%xmm6 \n\t"\ 1524 "palignr $14,%%xmm0, %%xmm1 \n\t"\ 1525 "palignr $14,%%xmm7, %%xmm0 \n\t"\ 1526 "paddw %%xmm3, %%xmm2 \n\t"\ 1527 "paddw %%xmm8, %%xmm6 \n\t"\ 1528 "paddw %%xmm4, %%xmm1 \n\t"\ 1529 "paddw %%xmm9, %%xmm0 \n\t"\ 1530 "psllw $2, %%xmm2 \n\t"\ 1531 "psllw $2, %%xmm6 \n\t"\ 1532 "psubw %%xmm1, %%xmm2 \n\t"\ 1533 "psubw %%xmm0, %%xmm6 \n\t"\ 1534 "paddw %%xmm13,%%xmm5 \n\t"\ 1535 "paddw %%xmm13,%%xmm10 \n\t"\ 1536 "pmullw %%xmm14,%%xmm2 \n\t"\ 1537 "pmullw %%xmm14,%%xmm6 \n\t"\ 1538 "lddqu (%2), %%xmm3 \n\t"\ 1539 "paddw %%xmm5, %%xmm2 \n\t"\ 1540 "paddw %%xmm10,%%xmm6 \n\t"\ 1541 "psraw $5, %%xmm2 \n\t"\ 1542 "psraw $5, %%xmm6 \n\t"\ 1543 "packuswb %%xmm2,%%xmm6 \n\t"\ 1544 "pavgb %%xmm3, %%xmm6 \n\t"\ 1545 OP(%%xmm6, (%1), %%xmm4, dqa)\ 1546 "add %5, %0 \n\t"\ 1547 "add %5, %1 \n\t"\ 1548 "add %4, %2 \n\t"\ 1549 "decl %3 \n\t"\ 1550 "jg 1b \n\t"\ 1551 : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\ 1552 : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ 1553 "m"(ff_pw_5), "m"(ff_pw_16)\ 1554 : "memory"\ 1555 );\ 1556} 1557#else // ARCH_X86_64 1558#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ 1559static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ 1560 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ 1561 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ 1562 src += 8*dstStride;\ 1563 dst += 8*dstStride;\ 1564 src2 += 8*src2Stride;\ 1565 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ 1566 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ 1567} 1568#endif // ARCH_X86_64 1569 1570#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\ 1571static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ 1572 int h=8;\ 1573 __asm__ volatile(\ 1574 "pxor %%xmm7, %%xmm7 \n\t"\ 1575 "movdqa %0, %%xmm6 \n\t"\ 1576 :: "m"(ff_pw_5)\ 1577 );\ 1578 do{\ 1579 __asm__ volatile(\ 1580 "lddqu -5(%0), %%xmm1 \n\t"\ 1581 "movdqa %%xmm1, %%xmm0 \n\t"\ 1582 "punpckhbw %%xmm7, %%xmm1 \n\t"\ 1583 "punpcklbw %%xmm7, %%xmm0 \n\t"\ 1584 "movdqa %%xmm1, %%xmm2 \n\t"\ 1585 "movdqa %%xmm1, %%xmm3 \n\t"\ 1586 "movdqa %%xmm1, %%xmm4 \n\t"\ 1587 "movdqa %%xmm1, %%xmm5 \n\t"\ 1588 "palignr $6, %%xmm0, %%xmm5 \n\t"\ 1589 "palignr $8, %%xmm0, %%xmm4 \n\t"\ 1590 "palignr $10,%%xmm0, %%xmm3 \n\t"\ 1591 "paddw %%xmm1, %%xmm5 \n\t"\ 1592 "palignr $12,%%xmm0, %%xmm2 \n\t"\ 1593 "palignr $14,%%xmm0, %%xmm1 \n\t"\ 1594 "paddw %%xmm3, %%xmm2 \n\t"\ 1595 "paddw %%xmm4, %%xmm1 \n\t"\ 1596 "psllw $2, %%xmm2 \n\t"\ 1597 "movq (%2), %%xmm3 \n\t"\ 1598 "psubw %%xmm1, %%xmm2 \n\t"\ 1599 "paddw %5, %%xmm5 \n\t"\ 1600 "pmullw %%xmm6, %%xmm2 \n\t"\ 1601 "paddw %%xmm5, %%xmm2 \n\t"\ 1602 "psraw $5, %%xmm2 \n\t"\ 1603 "packuswb %%xmm2, %%xmm2 \n\t"\ 1604 "pavgb %%xmm3, %%xmm2 \n\t"\ 1605 OP(%%xmm2, (%1), %%xmm4, q)\ 1606 "add %4, %0 \n\t"\ 1607 "add %4, %1 \n\t"\ 1608 "add %3, %2 \n\t"\ 1609 : "+a"(src), "+c"(dst), "+d"(src2)\ 1610 : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ 1611 "m"(ff_pw_16)\ 1612 : "memory"\ 1613 );\ 1614 }while(--h);\ 1615}\ 1616QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ 1617\ 1618static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 1619 int h=8;\ 1620 __asm__ volatile(\ 1621 "pxor %%xmm7, %%xmm7 \n\t"\ 1622 "movdqa %5, %%xmm6 \n\t"\ 1623 "1: \n\t"\ 1624 "lddqu -5(%0), %%xmm1 \n\t"\ 1625 "movdqa %%xmm1, %%xmm0 \n\t"\ 1626 "punpckhbw %%xmm7, %%xmm1 \n\t"\ 1627 "punpcklbw %%xmm7, %%xmm0 \n\t"\ 1628 "movdqa %%xmm1, %%xmm2 \n\t"\ 1629 "movdqa %%xmm1, %%xmm3 \n\t"\ 1630 "movdqa %%xmm1, %%xmm4 \n\t"\ 1631 "movdqa %%xmm1, %%xmm5 \n\t"\ 1632 "palignr $6, %%xmm0, %%xmm5 \n\t"\ 1633 "palignr $8, %%xmm0, %%xmm4 \n\t"\ 1634 "palignr $10,%%xmm0, %%xmm3 \n\t"\ 1635 "paddw %%xmm1, %%xmm5 \n\t"\ 1636 "palignr $12,%%xmm0, %%xmm2 \n\t"\ 1637 "palignr $14,%%xmm0, %%xmm1 \n\t"\ 1638 "paddw %%xmm3, %%xmm2 \n\t"\ 1639 "paddw %%xmm4, %%xmm1 \n\t"\ 1640 "psllw $2, %%xmm2 \n\t"\ 1641 "psubw %%xmm1, %%xmm2 \n\t"\ 1642 "paddw %6, %%xmm5 \n\t"\ 1643 "pmullw %%xmm6, %%xmm2 \n\t"\ 1644 "paddw %%xmm5, %%xmm2 \n\t"\ 1645 "psraw $5, %%xmm2 \n\t"\ 1646 "packuswb %%xmm2, %%xmm2 \n\t"\ 1647 OP(%%xmm2, (%1), %%xmm4, q)\ 1648 "add %3, %0 \n\t"\ 1649 "add %4, %1 \n\t"\ 1650 "decl %2 \n\t"\ 1651 " jnz 1b \n\t"\ 1652 : "+a"(src), "+c"(dst), "+g"(h)\ 1653 : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride),\ 1654 "m"(ff_pw_5), "m"(ff_pw_16)\ 1655 : "memory"\ 1656 );\ 1657}\ 1658static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 1659 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ 1660 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ 1661 src += 8*srcStride;\ 1662 dst += 8*dstStride;\ 1663 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ 1664 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ 1665}\ 1666 1667#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\ 1668static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ 1669 src -= 2*srcStride;\ 1670 \ 1671 __asm__ volatile(\ 1672 "pxor %%xmm7, %%xmm7 \n\t"\ 1673 "movq (%0), %%xmm0 \n\t"\ 1674 "add %2, %0 \n\t"\ 1675 "movq (%0), %%xmm1 \n\t"\ 1676 "add %2, %0 \n\t"\ 1677 "movq (%0), %%xmm2 \n\t"\ 1678 "add %2, %0 \n\t"\ 1679 "movq (%0), %%xmm3 \n\t"\ 1680 "add %2, %0 \n\t"\ 1681 "movq (%0), %%xmm4 \n\t"\ 1682 "add %2, %0 \n\t"\ 1683 "punpcklbw %%xmm7, %%xmm0 \n\t"\ 1684 "punpcklbw %%xmm7, %%xmm1 \n\t"\ 1685 "punpcklbw %%xmm7, %%xmm2 \n\t"\ 1686 "punpcklbw %%xmm7, %%xmm3 \n\t"\ 1687 "punpcklbw %%xmm7, %%xmm4 \n\t"\ 1688 QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ 1689 QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ 1690 QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ 1691 QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ 1692 QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\ 1693 QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\ 1694 QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ 1695 QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ 1696 \ 1697 : "+a"(src), "+c"(dst)\ 1698 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ 1699 : "memory"\ 1700 );\ 1701 if(h==16){\ 1702 __asm__ volatile(\ 1703 QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ 1704 QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ 1705 QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\ 1706 QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\ 1707 QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ 1708 QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ 1709 QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ 1710 QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ 1711 \ 1712 : "+a"(src), "+c"(dst)\ 1713 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ 1714 : "memory"\ 1715 );\ 1716 }\ 1717}\ 1718static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 1719 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\ 1720}\ 1721static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 1722 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ 1723 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ 1724} 1725 1726static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){ 1727 int w = (size+8)>>3; 1728 src -= 2*srcStride+2; 1729 while(w--){ 1730 __asm__ volatile( 1731 "pxor %%xmm7, %%xmm7 \n\t" 1732 "movq (%0), %%xmm0 \n\t" 1733 "add %2, %0 \n\t" 1734 "movq (%0), %%xmm1 \n\t" 1735 "add %2, %0 \n\t" 1736 "movq (%0), %%xmm2 \n\t" 1737 "add %2, %0 \n\t" 1738 "movq (%0), %%xmm3 \n\t" 1739 "add %2, %0 \n\t" 1740 "movq (%0), %%xmm4 \n\t" 1741 "add %2, %0 \n\t" 1742 "punpcklbw %%xmm7, %%xmm0 \n\t" 1743 "punpcklbw %%xmm7, %%xmm1 \n\t" 1744 "punpcklbw %%xmm7, %%xmm2 \n\t" 1745 "punpcklbw %%xmm7, %%xmm3 \n\t" 1746 "punpcklbw %%xmm7, %%xmm4 \n\t" 1747 QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48) 1748 QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48) 1749 QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48) 1750 QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48) 1751 QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48) 1752 QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48) 1753 QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48) 1754 QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48) 1755 : "+a"(src) 1756 : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16) 1757 : "memory" 1758 ); 1759 if(size==16){ 1760 __asm__ volatile( 1761 QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 8*48) 1762 QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 9*48) 1763 QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48) 1764 QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48) 1765 QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48) 1766 QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48) 1767 QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48) 1768 QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48) 1769 : "+a"(src) 1770 : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16) 1771 : "memory" 1772 ); 1773 } 1774 tmp += 8; 1775 src += 8 - (size+5)*srcStride; 1776 } 1777} 1778 1779#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\ 1780static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ 1781 int h = size;\ 1782 if(size == 16){\ 1783 __asm__ volatile(\ 1784 "1: \n\t"\ 1785 "movdqa 32(%0), %%xmm4 \n\t"\ 1786 "movdqa 16(%0), %%xmm5 \n\t"\ 1787 "movdqa (%0), %%xmm7 \n\t"\ 1788 "movdqa %%xmm4, %%xmm3 \n\t"\ 1789 "movdqa %%xmm4, %%xmm2 \n\t"\ 1790 "movdqa %%xmm4, %%xmm1 \n\t"\ 1791 "movdqa %%xmm4, %%xmm0 \n\t"\ 1792 "palignr $10, %%xmm5, %%xmm0 \n\t"\ 1793 "palignr $8, %%xmm5, %%xmm1 \n\t"\ 1794 "palignr $6, %%xmm5, %%xmm2 \n\t"\ 1795 "palignr $4, %%xmm5, %%xmm3 \n\t"\ 1796 "palignr $2, %%xmm5, %%xmm4 \n\t"\ 1797 "paddw %%xmm5, %%xmm0 \n\t"\ 1798 "paddw %%xmm4, %%xmm1 \n\t"\ 1799 "paddw %%xmm3, %%xmm2 \n\t"\ 1800 "movdqa %%xmm5, %%xmm6 \n\t"\ 1801 "movdqa %%xmm5, %%xmm4 \n\t"\ 1802 "movdqa %%xmm5, %%xmm3 \n\t"\ 1803 "palignr $8, %%xmm7, %%xmm4 \n\t"\ 1804 "palignr $2, %%xmm7, %%xmm6 \n\t"\ 1805 "palignr $10, %%xmm7, %%xmm3 \n\t"\ 1806 "paddw %%xmm6, %%xmm4 \n\t"\ 1807 "movdqa %%xmm5, %%xmm6 \n\t"\ 1808 "palignr $6, %%xmm7, %%xmm5 \n\t"\ 1809 "palignr $4, %%xmm7, %%xmm6 \n\t"\ 1810 "paddw %%xmm7, %%xmm3 \n\t"\ 1811 "paddw %%xmm6, %%xmm5 \n\t"\ 1812 \ 1813 "psubw %%xmm1, %%xmm0 \n\t"\ 1814 "psubw %%xmm4, %%xmm3 \n\t"\ 1815 "psraw $2, %%xmm0 \n\t"\ 1816 "psraw $2, %%xmm3 \n\t"\ 1817 "psubw %%xmm1, %%xmm0 \n\t"\ 1818 "psubw %%xmm4, %%xmm3 \n\t"\ 1819 "paddw %%xmm2, %%xmm0 \n\t"\ 1820 "paddw %%xmm5, %%xmm3 \n\t"\ 1821 "psraw $2, %%xmm0 \n\t"\ 1822 "psraw $2, %%xmm3 \n\t"\ 1823 "paddw %%xmm2, %%xmm0 \n\t"\ 1824 "paddw %%xmm5, %%xmm3 \n\t"\ 1825 "psraw $6, %%xmm0 \n\t"\ 1826 "psraw $6, %%xmm3 \n\t"\ 1827 "packuswb %%xmm0, %%xmm3 \n\t"\ 1828 OP(%%xmm3, (%1), %%xmm7, dqa)\ 1829 "add $48, %0 \n\t"\ 1830 "add %3, %1 \n\t"\ 1831 "decl %2 \n\t"\ 1832 " jnz 1b \n\t"\ 1833 : "+a"(tmp), "+c"(dst), "+g"(h)\ 1834 : "S"((x86_reg)dstStride)\ 1835 : "memory"\ 1836 );\ 1837 }else{\ 1838 __asm__ volatile(\ 1839 "1: \n\t"\ 1840 "movdqa 16(%0), %%xmm1 \n\t"\ 1841 "movdqa (%0), %%xmm0 \n\t"\ 1842 "movdqa %%xmm1, %%xmm2 \n\t"\ 1843 "movdqa %%xmm1, %%xmm3 \n\t"\ 1844 "movdqa %%xmm1, %%xmm4 \n\t"\ 1845 "movdqa %%xmm1, %%xmm5 \n\t"\ 1846 "palignr $10, %%xmm0, %%xmm5 \n\t"\ 1847 "palignr $8, %%xmm0, %%xmm4 \n\t"\ 1848 "palignr $6, %%xmm0, %%xmm3 \n\t"\ 1849 "palignr $4, %%xmm0, %%xmm2 \n\t"\ 1850 "palignr $2, %%xmm0, %%xmm1 \n\t"\ 1851 "paddw %%xmm5, %%xmm0 \n\t"\ 1852 "paddw %%xmm4, %%xmm1 \n\t"\ 1853 "paddw %%xmm3, %%xmm2 \n\t"\ 1854 "psubw %%xmm1, %%xmm0 \n\t"\ 1855 "psraw $2, %%xmm0 \n\t"\ 1856 "psubw %%xmm1, %%xmm0 \n\t"\ 1857 "paddw %%xmm2, %%xmm0 \n\t"\ 1858 "psraw $2, %%xmm0 \n\t"\ 1859 "paddw %%xmm2, %%xmm0 \n\t"\ 1860 "psraw $6, %%xmm0 \n\t"\ 1861 "packuswb %%xmm0, %%xmm0 \n\t"\ 1862 OP(%%xmm0, (%1), %%xmm7, q)\ 1863 "add $48, %0 \n\t"\ 1864 "add %3, %1 \n\t"\ 1865 "decl %2 \n\t"\ 1866 " jnz 1b \n\t"\ 1867 : "+a"(tmp), "+c"(dst), "+g"(h)\ 1868 : "S"((x86_reg)dstStride)\ 1869 : "memory"\ 1870 );\ 1871 }\ 1872} 1873 1874#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\ 1875static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ 1876 put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\ 1877 OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ 1878}\ 1879static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ 1880 OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\ 1881}\ 1882static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ 1883 OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\ 1884}\ 1885 1886#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2 1887#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2 1888#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2 1889#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2 1890#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2 1891#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2 1892#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2 1893#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2 1894 1895#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2 1896#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2 1897#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2 1898#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2 1899#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2 1900#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2 1901#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2 1902#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2 1903 1904#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2 1905#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2 1906#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2 1907#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2 1908 1909#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2 1910#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2 1911#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2 1912#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2 1913 1914#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2 1915#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2 1916 1917#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \ 1918H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\ 1919H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\ 1920H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\ 1921H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\ 1922 1923static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){ 1924 put_pixels16_sse2(dst, src, stride, 16); 1925} 1926static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){ 1927 avg_pixels16_sse2(dst, src, stride, 16); 1928} 1929#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2 1930#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2 1931 1932#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \ 1933static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ 1934 OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\ 1935}\ 1936 1937#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \ 1938static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1939 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\ 1940}\ 1941\ 1942static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1943 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\ 1944}\ 1945\ 1946static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1947 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\ 1948}\ 1949 1950#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \ 1951static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1952 DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\ 1953 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ 1954 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\ 1955}\ 1956\ 1957static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1958 OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\ 1959}\ 1960\ 1961static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1962 DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\ 1963 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ 1964 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\ 1965}\ 1966 1967#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \ 1968static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1969 DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\ 1970 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ 1971 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ 1972}\ 1973\ 1974static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1975 DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\ 1976 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ 1977 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ 1978}\ 1979\ 1980static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1981 DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\ 1982 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ 1983 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ 1984}\ 1985\ 1986static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1987 DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\ 1988 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ 1989 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ 1990}\ 1991\ 1992static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1993 DECLARE_ALIGNED(ALIGN, uint16_t, temp[SIZE*(SIZE<8?12:24)]);\ 1994 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\ 1995}\ 1996\ 1997static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1998 DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\ 1999 uint8_t * const halfHV= temp;\ 2000 int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ 2001 assert(((int)temp & 7) == 0);\ 2002 put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ 2003 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\ 2004}\ 2005\ 2006static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 2007 DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\ 2008 uint8_t * const halfHV= temp;\ 2009 int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ 2010 assert(((int)temp & 7) == 0);\ 2011 put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ 2012 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\ 2013}\ 2014\ 2015static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 2016 DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\ 2017 uint8_t * const halfHV= temp;\ 2018 int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ 2019 assert(((int)temp & 7) == 0);\ 2020 put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ 2021 OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\ 2022}\ 2023\ 2024static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 2025 DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\ 2026 uint8_t * const halfHV= temp;\ 2027 int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ 2028 assert(((int)temp & 7) == 0);\ 2029 put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ 2030 OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\ 2031}\ 2032 2033#define H264_MC_4816(MMX)\ 2034H264_MC(put_, 4, MMX, 8)\ 2035H264_MC(put_, 8, MMX, 8)\ 2036H264_MC(put_, 16,MMX, 8)\ 2037H264_MC(avg_, 4, MMX, 8)\ 2038H264_MC(avg_, 8, MMX, 8)\ 2039H264_MC(avg_, 16,MMX, 8)\ 2040 2041#define H264_MC_816(QPEL, XMM)\ 2042QPEL(put_, 8, XMM, 16)\ 2043QPEL(put_, 16,XMM, 16)\ 2044QPEL(avg_, 8, XMM, 16)\ 2045QPEL(avg_, 16,XMM, 16)\ 2046 2047 2048#define AVG_3DNOW_OP(a,b,temp, size) \ 2049"mov" #size " " #b ", " #temp " \n\t"\ 2050"pavgusb " #temp ", " #a " \n\t"\ 2051"mov" #size " " #a ", " #b " \n\t" 2052#define AVG_MMX2_OP(a,b,temp, size) \ 2053"mov" #size " " #b ", " #temp " \n\t"\ 2054"pavgb " #temp ", " #a " \n\t"\ 2055"mov" #size " " #a ", " #b " \n\t" 2056 2057#define PAVGB "pavgusb" 2058QPEL_H264(put_, PUT_OP, 3dnow) 2059QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow) 2060#undef PAVGB 2061#define PAVGB "pavgb" 2062QPEL_H264(put_, PUT_OP, mmx2) 2063QPEL_H264(avg_, AVG_MMX2_OP, mmx2) 2064QPEL_H264_V_XMM(put_, PUT_OP, sse2) 2065QPEL_H264_V_XMM(avg_, AVG_MMX2_OP, sse2) 2066QPEL_H264_HV_XMM(put_, PUT_OP, sse2) 2067QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, sse2) 2068#if HAVE_SSSE3 2069QPEL_H264_H_XMM(put_, PUT_OP, ssse3) 2070QPEL_H264_H_XMM(avg_, AVG_MMX2_OP, ssse3) 2071QPEL_H264_HV2_XMM(put_, PUT_OP, ssse3) 2072QPEL_H264_HV2_XMM(avg_, AVG_MMX2_OP, ssse3) 2073QPEL_H264_HV_XMM(put_, PUT_OP, ssse3) 2074QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, ssse3) 2075#endif 2076#undef PAVGB 2077 2078H264_MC_4816(3dnow) 2079H264_MC_4816(mmx2) 2080H264_MC_816(H264_MC_V, sse2) 2081H264_MC_816(H264_MC_HV, sse2) 2082#if HAVE_SSSE3 2083H264_MC_816(H264_MC_H, ssse3) 2084H264_MC_816(H264_MC_HV, ssse3) 2085#endif 2086 2087/* rnd interleaved with rnd div 8, use p+1 to access rnd div 8 */ 2088DECLARE_ALIGNED_8(static const uint64_t, h264_rnd_reg[4]) = { 2089 0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL 2090}; 2091 2092#define H264_CHROMA_OP(S,D) 2093#define H264_CHROMA_OP4(S,D,T) 2094#define H264_CHROMA_MC8_TMPL put_h264_chroma_generic_mc8_mmx 2095#define H264_CHROMA_MC4_TMPL put_h264_chroma_generic_mc4_mmx 2096#define H264_CHROMA_MC2_TMPL put_h264_chroma_mc2_mmx2 2097#define H264_CHROMA_MC8_MV0 put_pixels8_mmx 2098#include "dsputil_h264_template_mmx.c" 2099 2100static void put_h264_chroma_mc8_mmx_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) 2101{ 2102 put_h264_chroma_generic_mc8_mmx(dst, src, stride, h, x, y, h264_rnd_reg); 2103} 2104static void put_h264_chroma_mc8_mmx_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) 2105{ 2106 put_h264_chroma_generic_mc8_mmx(dst, src, stride, h, x, y, h264_rnd_reg+2); 2107} 2108static void put_h264_chroma_mc4_mmx(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) 2109{ 2110 put_h264_chroma_generic_mc4_mmx(dst, src, stride, h, x, y, h264_rnd_reg); 2111} 2112 2113#undef H264_CHROMA_OP 2114#undef H264_CHROMA_OP4 2115#undef H264_CHROMA_MC8_TMPL 2116#undef H264_CHROMA_MC4_TMPL 2117#undef H264_CHROMA_MC2_TMPL 2118#undef H264_CHROMA_MC8_MV0 2119 2120#define H264_CHROMA_OP(S,D) "pavgb " #S ", " #D " \n\t" 2121#define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\ 2122 "pavgb " #T ", " #D " \n\t" 2123#define H264_CHROMA_MC8_TMPL avg_h264_chroma_generic_mc8_mmx2 2124#define H264_CHROMA_MC4_TMPL avg_h264_chroma_generic_mc4_mmx2 2125#define H264_CHROMA_MC2_TMPL avg_h264_chroma_mc2_mmx2 2126#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2 2127#include "dsputil_h264_template_mmx.c" 2128static void avg_h264_chroma_mc8_mmx2_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) 2129{ 2130 avg_h264_chroma_generic_mc8_mmx2(dst, src, stride, h, x, y, h264_rnd_reg); 2131} 2132static void avg_h264_chroma_mc4_mmx2(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) 2133{ 2134 avg_h264_chroma_generic_mc4_mmx2(dst, src, stride, h, x, y, h264_rnd_reg); 2135} 2136#undef H264_CHROMA_OP 2137#undef H264_CHROMA_OP4 2138#undef H264_CHROMA_MC8_TMPL 2139#undef H264_CHROMA_MC4_TMPL 2140#undef H264_CHROMA_MC2_TMPL 2141#undef H264_CHROMA_MC8_MV0 2142 2143#define H264_CHROMA_OP(S,D) "pavgusb " #S ", " #D " \n\t" 2144#define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\ 2145 "pavgusb " #T ", " #D " \n\t" 2146#define H264_CHROMA_MC8_TMPL avg_h264_chroma_generic_mc8_3dnow 2147#define H264_CHROMA_MC4_TMPL avg_h264_chroma_generic_mc4_3dnow 2148#define H264_CHROMA_MC8_MV0 avg_pixels8_3dnow 2149#include "dsputil_h264_template_mmx.c" 2150static void avg_h264_chroma_mc8_3dnow_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) 2151{ 2152 avg_h264_chroma_generic_mc8_3dnow(dst, src, stride, h, x, y, h264_rnd_reg); 2153} 2154static void avg_h264_chroma_mc4_3dnow(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) 2155{ 2156 avg_h264_chroma_generic_mc4_3dnow(dst, src, stride, h, x, y, h264_rnd_reg); 2157} 2158#undef H264_CHROMA_OP 2159#undef H264_CHROMA_OP4 2160#undef H264_CHROMA_MC8_TMPL 2161#undef H264_CHROMA_MC4_TMPL 2162#undef H264_CHROMA_MC8_MV0 2163 2164#if HAVE_SSSE3 2165#define AVG_OP(X) 2166#undef H264_CHROMA_MC8_TMPL 2167#undef H264_CHROMA_MC4_TMPL 2168#define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_ssse3 2169#define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_ssse3 2170#define H264_CHROMA_MC8_MV0 put_pixels8_mmx 2171#include "dsputil_h264_template_ssse3.c" 2172static void put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) 2173{ 2174 put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1); 2175} 2176static void put_h264_chroma_mc8_ssse3_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) 2177{ 2178 put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 0); 2179} 2180 2181#undef AVG_OP 2182#undef H264_CHROMA_MC8_TMPL 2183#undef H264_CHROMA_MC4_TMPL 2184#undef H264_CHROMA_MC8_MV0 2185#define AVG_OP(X) X 2186#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_ssse3 2187#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_ssse3 2188#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2 2189#include "dsputil_h264_template_ssse3.c" 2190static void avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) 2191{ 2192 avg_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1); 2193} 2194#undef AVG_OP 2195#undef H264_CHROMA_MC8_TMPL 2196#undef H264_CHROMA_MC4_TMPL 2197#undef H264_CHROMA_MC8_MV0 2198#endif 2199 2200/***********************************/ 2201/* weighted prediction */ 2202 2203static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h) 2204{ 2205 int x, y; 2206 offset <<= log2_denom; 2207 offset += (1 << log2_denom) >> 1; 2208 __asm__ volatile( 2209 "movd %0, %%mm4 \n\t" 2210 "movd %1, %%mm5 \n\t" 2211 "movd %2, %%mm6 \n\t" 2212 "pshufw $0, %%mm4, %%mm4 \n\t" 2213 "pshufw $0, %%mm5, %%mm5 \n\t" 2214 "pxor %%mm7, %%mm7 \n\t" 2215 :: "g"(weight), "g"(offset), "g"(log2_denom) 2216 ); 2217 for(y=0; y<h; y+=2){ 2218 for(x=0; x<w; x+=4){ 2219 __asm__ volatile( 2220 "movd %0, %%mm0 \n\t" 2221 "movd %1, %%mm1 \n\t" 2222 "punpcklbw %%mm7, %%mm0 \n\t" 2223 "punpcklbw %%mm7, %%mm1 \n\t" 2224 "pmullw %%mm4, %%mm0 \n\t" 2225 "pmullw %%mm4, %%mm1 \n\t" 2226 "paddsw %%mm5, %%mm0 \n\t" 2227 "paddsw %%mm5, %%mm1 \n\t" 2228 "psraw %%mm6, %%mm0 \n\t" 2229 "psraw %%mm6, %%mm1 \n\t" 2230 "packuswb %%mm7, %%mm0 \n\t" 2231 "packuswb %%mm7, %%mm1 \n\t" 2232 "movd %%mm0, %0 \n\t" 2233 "movd %%mm1, %1 \n\t" 2234 : "+m"(*(uint32_t*)(dst+x)), 2235 "+m"(*(uint32_t*)(dst+x+stride)) 2236 ); 2237 } 2238 dst += 2*stride; 2239 } 2240} 2241 2242static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h) 2243{ 2244 int x, y; 2245 offset = ((offset + 1) | 1) << log2_denom; 2246 __asm__ volatile( 2247 "movd %0, %%mm3 \n\t" 2248 "movd %1, %%mm4 \n\t" 2249 "movd %2, %%mm5 \n\t" 2250 "movd %3, %%mm6 \n\t" 2251 "pshufw $0, %%mm3, %%mm3 \n\t" 2252 "pshufw $0, %%mm4, %%mm4 \n\t" 2253 "pshufw $0, %%mm5, %%mm5 \n\t" 2254 "pxor %%mm7, %%mm7 \n\t" 2255 :: "g"(weightd), "g"(weights), "g"(offset), "g"(log2_denom+1) 2256 ); 2257 for(y=0; y<h; y++){ 2258 for(x=0; x<w; x+=4){ 2259 __asm__ volatile( 2260 "movd %0, %%mm0 \n\t" 2261 "movd %1, %%mm1 \n\t" 2262 "punpcklbw %%mm7, %%mm0 \n\t" 2263 "punpcklbw %%mm7, %%mm1 \n\t" 2264 "pmullw %%mm3, %%mm0 \n\t" 2265 "pmullw %%mm4, %%mm1 \n\t" 2266 "paddsw %%mm1, %%mm0 \n\t" 2267 "paddsw %%mm5, %%mm0 \n\t" 2268 "psraw %%mm6, %%mm0 \n\t" 2269 "packuswb %%mm0, %%mm0 \n\t" 2270 "movd %%mm0, %0 \n\t" 2271 : "+m"(*(uint32_t*)(dst+x)) 2272 : "m"(*(uint32_t*)(src+x)) 2273 ); 2274 } 2275 src += stride; 2276 dst += stride; 2277 } 2278} 2279 2280#define H264_WEIGHT(W,H) \ 2281static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \ 2282 ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \ 2283} \ 2284static void ff_h264_weight_ ## W ## x ## H ## _mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \ 2285 ff_h264_weight_WxH_mmx2(dst, stride, log2_denom, weight, offset, W, H); \ 2286} 2287 2288H264_WEIGHT(16,16) 2289H264_WEIGHT(16, 8) 2290H264_WEIGHT( 8,16) 2291H264_WEIGHT( 8, 8) 2292H264_WEIGHT( 8, 4) 2293H264_WEIGHT( 4, 8) 2294H264_WEIGHT( 4, 4) 2295H264_WEIGHT( 4, 2) 2296 2297