1/* 2 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt 3 * 4 * This file is part of FFmpeg. 5 * 6 * FFmpeg is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * FFmpeg is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with FFmpeg; if not, write to the Free Software 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19 */ 20 21#include "dsputil_mmx.h" 22 23DECLARE_ALIGNED(8, static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL; 24DECLARE_ALIGNED(8, static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL; 25 26/***********************************/ 27/* IDCT */ 28 29#define SUMSUB_BADC( a, b, c, d ) \ 30 "paddw "#b", "#a" \n\t"\ 31 "paddw "#d", "#c" \n\t"\ 32 "paddw "#b", "#b" \n\t"\ 33 "paddw "#d", "#d" \n\t"\ 34 "psubw "#a", "#b" \n\t"\ 35 "psubw "#c", "#d" \n\t" 36 37#define SUMSUBD2_AB( a, b, t ) \ 38 "movq "#b", "#t" \n\t"\ 39 "psraw $1 , "#b" \n\t"\ 40 "paddw "#a", "#b" \n\t"\ 41 "psraw $1 , "#a" \n\t"\ 42 "psubw "#t", "#a" \n\t" 43 44#define IDCT4_1D( s02, s13, d02, d13, t ) \ 45 SUMSUB_BA ( s02, d02 )\ 46 SUMSUBD2_AB( s13, d13, t )\ 47 SUMSUB_BADC( d13, s02, s13, d02 ) 48 49#define STORE_DIFF_4P( p, t, z ) \ 50 "psraw $6, "#p" \n\t"\ 51 "movd (%0), "#t" \n\t"\ 52 "punpcklbw "#z", "#t" \n\t"\ 53 "paddsw "#t", "#p" \n\t"\ 54 "packuswb "#z", "#p" \n\t"\ 55 "movd "#p", (%0) \n\t" 56 57static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride) 58{ 59 /* Load dct coeffs */ 60 __asm__ volatile( 61 "movq (%0), %%mm0 \n\t" 62 "movq 8(%0), %%mm1 \n\t" 63 "movq 16(%0), %%mm2 \n\t" 64 "movq 24(%0), %%mm3 \n\t" 65 :: "r"(block) ); 66 67 __asm__ volatile( 68 /* mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 */ 69 IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 ) 70 71 "movq %0, %%mm6 \n\t" 72 /* in: 1,4,0,2 out: 1,2,3,0 */ 73 TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 ) 74 75 "paddw %%mm6, %%mm3 \n\t" 76 77 /* mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 */ 78 IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 ) 79 80 "pxor %%mm7, %%mm7 \n\t" 81 :: "m"(ff_pw_32)); 82 83 __asm__ volatile( 84 STORE_DIFF_4P( %%mm0, %%mm1, %%mm7) 85 "add %1, %0 \n\t" 86 STORE_DIFF_4P( %%mm2, %%mm1, %%mm7) 87 "add %1, %0 \n\t" 88 STORE_DIFF_4P( %%mm3, %%mm1, %%mm7) 89 "add %1, %0 \n\t" 90 STORE_DIFF_4P( %%mm4, %%mm1, %%mm7) 91 : "+r"(dst) 92 : "r" ((x86_reg)stride) 93 ); 94} 95 96static inline void h264_idct8_1d(int16_t *block) 97{ 98 __asm__ volatile( 99 "movq 112(%0), %%mm7 \n\t" 100 "movq 80(%0), %%mm0 \n\t" 101 "movq 48(%0), %%mm3 \n\t" 102 "movq 16(%0), %%mm5 \n\t" 103 104 "movq %%mm0, %%mm4 \n\t" 105 "movq %%mm5, %%mm1 \n\t" 106 "psraw $1, %%mm4 \n\t" 107 "psraw $1, %%mm1 \n\t" 108 "paddw %%mm0, %%mm4 \n\t" 109 "paddw %%mm5, %%mm1 \n\t" 110 "paddw %%mm7, %%mm4 \n\t" 111 "paddw %%mm0, %%mm1 \n\t" 112 "psubw %%mm5, %%mm4 \n\t" 113 "paddw %%mm3, %%mm1 \n\t" 114 115 "psubw %%mm3, %%mm5 \n\t" 116 "psubw %%mm3, %%mm0 \n\t" 117 "paddw %%mm7, %%mm5 \n\t" 118 "psubw %%mm7, %%mm0 \n\t" 119 "psraw $1, %%mm3 \n\t" 120 "psraw $1, %%mm7 \n\t" 121 "psubw %%mm3, %%mm5 \n\t" 122 "psubw %%mm7, %%mm0 \n\t" 123 124 "movq %%mm4, %%mm3 \n\t" 125 "movq %%mm1, %%mm7 \n\t" 126 "psraw $2, %%mm1 \n\t" 127 "psraw $2, %%mm3 \n\t" 128 "paddw %%mm5, %%mm3 \n\t" 129 "psraw $2, %%mm5 \n\t" 130 "paddw %%mm0, %%mm1 \n\t" 131 "psraw $2, %%mm0 \n\t" 132 "psubw %%mm4, %%mm5 \n\t" 133 "psubw %%mm0, %%mm7 \n\t" 134 135 "movq 32(%0), %%mm2 \n\t" 136 "movq 96(%0), %%mm6 \n\t" 137 "movq %%mm2, %%mm4 \n\t" 138 "movq %%mm6, %%mm0 \n\t" 139 "psraw $1, %%mm4 \n\t" 140 "psraw $1, %%mm6 \n\t" 141 "psubw %%mm0, %%mm4 \n\t" 142 "paddw %%mm2, %%mm6 \n\t" 143 144 "movq (%0), %%mm2 \n\t" 145 "movq 64(%0), %%mm0 \n\t" 146 SUMSUB_BA( %%mm0, %%mm2 ) 147 SUMSUB_BA( %%mm6, %%mm0 ) 148 SUMSUB_BA( %%mm4, %%mm2 ) 149 SUMSUB_BA( %%mm7, %%mm6 ) 150 SUMSUB_BA( %%mm5, %%mm4 ) 151 SUMSUB_BA( %%mm3, %%mm2 ) 152 SUMSUB_BA( %%mm1, %%mm0 ) 153 :: "r"(block) 154 ); 155} 156 157static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride) 158{ 159 int i; 160 DECLARE_ALIGNED(8, int16_t, b2)[64]; 161 162 block[0] += 32; 163 164 for(i=0; i<2; i++){ 165 DECLARE_ALIGNED(8, uint64_t, tmp); 166 167 h264_idct8_1d(block+4*i); 168 169 __asm__ volatile( 170 "movq %%mm7, %0 \n\t" 171 TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 ) 172 "movq %%mm0, 8(%1) \n\t" 173 "movq %%mm6, 24(%1) \n\t" 174 "movq %%mm7, 40(%1) \n\t" 175 "movq %%mm4, 56(%1) \n\t" 176 "movq %0, %%mm7 \n\t" 177 TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 ) 178 "movq %%mm7, (%1) \n\t" 179 "movq %%mm1, 16(%1) \n\t" 180 "movq %%mm0, 32(%1) \n\t" 181 "movq %%mm3, 48(%1) \n\t" 182 : "=m"(tmp) 183 : "r"(b2+32*i) 184 : "memory" 185 ); 186 } 187 188 for(i=0; i<2; i++){ 189 h264_idct8_1d(b2+4*i); 190 191 __asm__ volatile( 192 "psraw $6, %%mm7 \n\t" 193 "psraw $6, %%mm6 \n\t" 194 "psraw $6, %%mm5 \n\t" 195 "psraw $6, %%mm4 \n\t" 196 "psraw $6, %%mm3 \n\t" 197 "psraw $6, %%mm2 \n\t" 198 "psraw $6, %%mm1 \n\t" 199 "psraw $6, %%mm0 \n\t" 200 201 "movq %%mm7, (%0) \n\t" 202 "movq %%mm5, 16(%0) \n\t" 203 "movq %%mm3, 32(%0) \n\t" 204 "movq %%mm1, 48(%0) \n\t" 205 "movq %%mm0, 64(%0) \n\t" 206 "movq %%mm2, 80(%0) \n\t" 207 "movq %%mm4, 96(%0) \n\t" 208 "movq %%mm6, 112(%0) \n\t" 209 :: "r"(b2+4*i) 210 : "memory" 211 ); 212 } 213 214 add_pixels_clamped_mmx(b2, dst, stride); 215} 216 217#define STORE_DIFF_8P( p, d, t, z )\ 218 "movq "#d", "#t" \n"\ 219 "psraw $6, "#p" \n"\ 220 "punpcklbw "#z", "#t" \n"\ 221 "paddsw "#t", "#p" \n"\ 222 "packuswb "#p", "#p" \n"\ 223 "movq "#p", "#d" \n" 224 225#define H264_IDCT8_1D_SSE2(a,b,c,d,e,f,g,h)\ 226 "movdqa "#c", "#a" \n"\ 227 "movdqa "#g", "#e" \n"\ 228 "psraw $1, "#c" \n"\ 229 "psraw $1, "#g" \n"\ 230 "psubw "#e", "#c" \n"\ 231 "paddw "#a", "#g" \n"\ 232 "movdqa "#b", "#e" \n"\ 233 "psraw $1, "#e" \n"\ 234 "paddw "#b", "#e" \n"\ 235 "paddw "#d", "#e" \n"\ 236 "paddw "#f", "#e" \n"\ 237 "movdqa "#f", "#a" \n"\ 238 "psraw $1, "#a" \n"\ 239 "paddw "#f", "#a" \n"\ 240 "paddw "#h", "#a" \n"\ 241 "psubw "#b", "#a" \n"\ 242 "psubw "#d", "#b" \n"\ 243 "psubw "#d", "#f" \n"\ 244 "paddw "#h", "#b" \n"\ 245 "psubw "#h", "#f" \n"\ 246 "psraw $1, "#d" \n"\ 247 "psraw $1, "#h" \n"\ 248 "psubw "#d", "#b" \n"\ 249 "psubw "#h", "#f" \n"\ 250 "movdqa "#e", "#d" \n"\ 251 "movdqa "#a", "#h" \n"\ 252 "psraw $2, "#d" \n"\ 253 "psraw $2, "#h" \n"\ 254 "paddw "#f", "#d" \n"\ 255 "paddw "#b", "#h" \n"\ 256 "psraw $2, "#f" \n"\ 257 "psraw $2, "#b" \n"\ 258 "psubw "#f", "#e" \n"\ 259 "psubw "#a", "#b" \n"\ 260 "movdqa 0x00(%1), "#a" \n"\ 261 "movdqa 0x40(%1), "#f" \n"\ 262 SUMSUB_BA(f, a)\ 263 SUMSUB_BA(g, f)\ 264 SUMSUB_BA(c, a)\ 265 SUMSUB_BA(e, g)\ 266 SUMSUB_BA(b, c)\ 267 SUMSUB_BA(h, a)\ 268 SUMSUB_BA(d, f) 269 270static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride) 271{ 272 __asm__ volatile( 273 "movdqa 0x10(%1), %%xmm1 \n" 274 "movdqa 0x20(%1), %%xmm2 \n" 275 "movdqa 0x30(%1), %%xmm3 \n" 276 "movdqa 0x50(%1), %%xmm5 \n" 277 "movdqa 0x60(%1), %%xmm6 \n" 278 "movdqa 0x70(%1), %%xmm7 \n" 279 H264_IDCT8_1D_SSE2(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7) 280 TRANSPOSE8(%%xmm4, %%xmm1, %%xmm7, %%xmm3, %%xmm5, %%xmm0, %%xmm2, %%xmm6, (%1)) 281 "paddw %4, %%xmm4 \n" 282 "movdqa %%xmm4, 0x00(%1) \n" 283 "movdqa %%xmm2, 0x40(%1) \n" 284 H264_IDCT8_1D_SSE2(%%xmm4, %%xmm0, %%xmm6, %%xmm3, %%xmm2, %%xmm5, %%xmm7, %%xmm1) 285 "movdqa %%xmm6, 0x60(%1) \n" 286 "movdqa %%xmm7, 0x70(%1) \n" 287 "pxor %%xmm7, %%xmm7 \n" 288 STORE_DIFF_8P(%%xmm2, (%0), %%xmm6, %%xmm7) 289 STORE_DIFF_8P(%%xmm0, (%0,%2), %%xmm6, %%xmm7) 290 STORE_DIFF_8P(%%xmm1, (%0,%2,2), %%xmm6, %%xmm7) 291 STORE_DIFF_8P(%%xmm3, (%0,%3), %%xmm6, %%xmm7) 292 "lea (%0,%2,4), %0 \n" 293 STORE_DIFF_8P(%%xmm5, (%0), %%xmm6, %%xmm7) 294 STORE_DIFF_8P(%%xmm4, (%0,%2), %%xmm6, %%xmm7) 295 "movdqa 0x60(%1), %%xmm0 \n" 296 "movdqa 0x70(%1), %%xmm1 \n" 297 STORE_DIFF_8P(%%xmm0, (%0,%2,2), %%xmm6, %%xmm7) 298 STORE_DIFF_8P(%%xmm1, (%0,%3), %%xmm6, %%xmm7) 299 :"+r"(dst) 300 :"r"(block), "r"((x86_reg)stride), "r"((x86_reg)3L*stride), "m"(ff_pw_32) 301 ); 302} 303 304static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) 305{ 306 int dc = (block[0] + 32) >> 6; 307 __asm__ volatile( 308 "movd %0, %%mm0 \n\t" 309 "pshufw $0, %%mm0, %%mm0 \n\t" 310 "pxor %%mm1, %%mm1 \n\t" 311 "psubw %%mm0, %%mm1 \n\t" 312 "packuswb %%mm0, %%mm0 \n\t" 313 "packuswb %%mm1, %%mm1 \n\t" 314 ::"r"(dc) 315 ); 316 __asm__ volatile( 317 "movd %0, %%mm2 \n\t" 318 "movd %1, %%mm3 \n\t" 319 "movd %2, %%mm4 \n\t" 320 "movd %3, %%mm5 \n\t" 321 "paddusb %%mm0, %%mm2 \n\t" 322 "paddusb %%mm0, %%mm3 \n\t" 323 "paddusb %%mm0, %%mm4 \n\t" 324 "paddusb %%mm0, %%mm5 \n\t" 325 "psubusb %%mm1, %%mm2 \n\t" 326 "psubusb %%mm1, %%mm3 \n\t" 327 "psubusb %%mm1, %%mm4 \n\t" 328 "psubusb %%mm1, %%mm5 \n\t" 329 "movd %%mm2, %0 \n\t" 330 "movd %%mm3, %1 \n\t" 331 "movd %%mm4, %2 \n\t" 332 "movd %%mm5, %3 \n\t" 333 :"+m"(*(uint32_t*)(dst+0*stride)), 334 "+m"(*(uint32_t*)(dst+1*stride)), 335 "+m"(*(uint32_t*)(dst+2*stride)), 336 "+m"(*(uint32_t*)(dst+3*stride)) 337 ); 338} 339 340static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) 341{ 342 int dc = (block[0] + 32) >> 6; 343 int y; 344 __asm__ volatile( 345 "movd %0, %%mm0 \n\t" 346 "pshufw $0, %%mm0, %%mm0 \n\t" 347 "pxor %%mm1, %%mm1 \n\t" 348 "psubw %%mm0, %%mm1 \n\t" 349 "packuswb %%mm0, %%mm0 \n\t" 350 "packuswb %%mm1, %%mm1 \n\t" 351 ::"r"(dc) 352 ); 353 for(y=2; y--; dst += 4*stride){ 354 __asm__ volatile( 355 "movq %0, %%mm2 \n\t" 356 "movq %1, %%mm3 \n\t" 357 "movq %2, %%mm4 \n\t" 358 "movq %3, %%mm5 \n\t" 359 "paddusb %%mm0, %%mm2 \n\t" 360 "paddusb %%mm0, %%mm3 \n\t" 361 "paddusb %%mm0, %%mm4 \n\t" 362 "paddusb %%mm0, %%mm5 \n\t" 363 "psubusb %%mm1, %%mm2 \n\t" 364 "psubusb %%mm1, %%mm3 \n\t" 365 "psubusb %%mm1, %%mm4 \n\t" 366 "psubusb %%mm1, %%mm5 \n\t" 367 "movq %%mm2, %0 \n\t" 368 "movq %%mm3, %1 \n\t" 369 "movq %%mm4, %2 \n\t" 370 "movq %%mm5, %3 \n\t" 371 :"+m"(*(uint64_t*)(dst+0*stride)), 372 "+m"(*(uint64_t*)(dst+1*stride)), 373 "+m"(*(uint64_t*)(dst+2*stride)), 374 "+m"(*(uint64_t*)(dst+3*stride)) 375 ); 376 } 377} 378 379//FIXME this table is a duplicate from h264data.h, and will be removed once the tables from, h264 have been split 380static const uint8_t scan8[16 + 2*4]={ 381 4+1*8, 5+1*8, 4+2*8, 5+2*8, 382 6+1*8, 7+1*8, 6+2*8, 7+2*8, 383 4+3*8, 5+3*8, 4+4*8, 5+4*8, 384 6+3*8, 7+3*8, 6+4*8, 7+4*8, 385 1+1*8, 2+1*8, 386 1+2*8, 2+2*8, 387 1+4*8, 2+4*8, 388 1+5*8, 2+5*8, 389}; 390 391static void ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 392 int i; 393 for(i=0; i<16; i++){ 394 if(nnzc[ scan8[i] ]) 395 ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride); 396 } 397} 398 399static void ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 400 int i; 401 for(i=0; i<16; i+=4){ 402 if(nnzc[ scan8[i] ]) 403 ff_h264_idct8_add_mmx(dst + block_offset[i], block + i*16, stride); 404 } 405} 406 407 408static void ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 409 int i; 410 for(i=0; i<16; i++){ 411 int nnz = nnzc[ scan8[i] ]; 412 if(nnz){ 413 if(nnz==1 && block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); 414 else ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride); 415 } 416 } 417} 418 419static void ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 420 int i; 421 for(i=0; i<16; i++){ 422 if(nnzc[ scan8[i] ] || block[i*16]) 423 ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride); 424 } 425} 426 427static void ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 428 int i; 429 for(i=0; i<16; i++){ 430 if(nnzc[ scan8[i] ]) ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride); 431 else if(block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); 432 } 433} 434 435static void ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 436 int i; 437 for(i=0; i<16; i+=4){ 438 int nnz = nnzc[ scan8[i] ]; 439 if(nnz){ 440 if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); 441 else ff_h264_idct8_add_mmx (dst + block_offset[i], block + i*16, stride); 442 } 443 } 444} 445 446static void ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 447 int i; 448 for(i=0; i<16; i+=4){ 449 int nnz = nnzc[ scan8[i] ]; 450 if(nnz){ 451 if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); 452 else ff_h264_idct8_add_sse2 (dst + block_offset[i], block + i*16, stride); 453 } 454 } 455} 456 457static void ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 458 int i; 459 for(i=16; i<16+8; i++){ 460 if(nnzc[ scan8[i] ] || block[i*16]) 461 ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride); 462 } 463} 464 465static void ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 466 int i; 467 for(i=16; i<16+8; i++){ 468 if(nnzc[ scan8[i] ]) 469 ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride); 470 else if(block[i*16]) 471 ff_h264_idct_dc_add_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride); 472 } 473} 474 475#if CONFIG_GPL && HAVE_YASM 476static void ff_h264_idct_dc_add8_mmx2(uint8_t *dst, int16_t *block, int stride) 477{ 478 __asm__ volatile( 479 "movd %0, %%mm0 \n\t" // 0 0 X D 480 "punpcklwd %1, %%mm0 \n\t" // x X d D 481 "paddsw %2, %%mm0 \n\t" 482 "psraw $6, %%mm0 \n\t" 483 "punpcklwd %%mm0, %%mm0 \n\t" // d d D D 484 "pxor %%mm1, %%mm1 \n\t" // 0 0 0 0 485 "psubw %%mm0, %%mm1 \n\t" // -d-d-D-D 486 "packuswb %%mm1, %%mm0 \n\t" // -d-d-D-D d d D D 487 "pshufw $0xFA, %%mm0, %%mm1 \n\t" // -d-d-d-d-D-D-D-D 488 "punpcklwd %%mm0, %%mm0 \n\t" // d d d d D D D D 489 ::"m"(block[ 0]), 490 "m"(block[16]), 491 "m"(ff_pw_32) 492 ); 493 __asm__ volatile( 494 "movq %0, %%mm2 \n\t" 495 "movq %1, %%mm3 \n\t" 496 "movq %2, %%mm4 \n\t" 497 "movq %3, %%mm5 \n\t" 498 "paddusb %%mm0, %%mm2 \n\t" 499 "paddusb %%mm0, %%mm3 \n\t" 500 "paddusb %%mm0, %%mm4 \n\t" 501 "paddusb %%mm0, %%mm5 \n\t" 502 "psubusb %%mm1, %%mm2 \n\t" 503 "psubusb %%mm1, %%mm3 \n\t" 504 "psubusb %%mm1, %%mm4 \n\t" 505 "psubusb %%mm1, %%mm5 \n\t" 506 "movq %%mm2, %0 \n\t" 507 "movq %%mm3, %1 \n\t" 508 "movq %%mm4, %2 \n\t" 509 "movq %%mm5, %3 \n\t" 510 :"+m"(*(uint64_t*)(dst+0*stride)), 511 "+m"(*(uint64_t*)(dst+1*stride)), 512 "+m"(*(uint64_t*)(dst+2*stride)), 513 "+m"(*(uint64_t*)(dst+3*stride)) 514 ); 515} 516 517extern void ff_x264_add8x4_idct_sse2(uint8_t *dst, int16_t *block, int stride); 518 519static void ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 520 int i; 521 for(i=0; i<16; i+=2) 522 if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ]) 523 ff_x264_add8x4_idct_sse2 (dst + block_offset[i], block + i*16, stride); 524} 525 526static void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 527 int i; 528 for(i=0; i<16; i+=2){ 529 if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ]) 530 ff_x264_add8x4_idct_sse2 (dst + block_offset[i], block + i*16, stride); 531 else if(block[i*16]|block[i*16+16]) 532 ff_h264_idct_dc_add8_mmx2(dst + block_offset[i], block + i*16, stride); 533 } 534} 535 536static void ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 537 int i; 538 for(i=16; i<16+8; i+=2){ 539 if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ]) 540 ff_x264_add8x4_idct_sse2 (dest[(i&4)>>2] + block_offset[i], block + i*16, stride); 541 else if(block[i*16]|block[i*16+16]) 542 ff_h264_idct_dc_add8_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride); 543 } 544} 545#endif 546 547/***********************************/ 548/* deblocking */ 549 550// out: o = |x-y|>a 551// clobbers: t 552#define DIFF_GT_MMX(x,y,a,o,t)\ 553 "movq "#y", "#t" \n\t"\ 554 "movq "#x", "#o" \n\t"\ 555 "psubusb "#x", "#t" \n\t"\ 556 "psubusb "#y", "#o" \n\t"\ 557 "por "#t", "#o" \n\t"\ 558 "psubusb "#a", "#o" \n\t" 559 560// out: o = |x-y|>a 561// clobbers: t 562#define DIFF_GT2_MMX(x,y,a,o,t)\ 563 "movq "#y", "#t" \n\t"\ 564 "movq "#x", "#o" \n\t"\ 565 "psubusb "#x", "#t" \n\t"\ 566 "psubusb "#y", "#o" \n\t"\ 567 "psubusb "#a", "#t" \n\t"\ 568 "psubusb "#a", "#o" \n\t"\ 569 "pcmpeqb "#t", "#o" \n\t"\ 570 571// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 572// out: mm5=beta-1, mm7=mask 573// clobbers: mm4,mm6 574#define H264_DEBLOCK_MASK(alpha1, beta1) \ 575 "pshufw $0, "#alpha1", %%mm4 \n\t"\ 576 "pshufw $0, "#beta1 ", %%mm5 \n\t"\ 577 "packuswb %%mm4, %%mm4 \n\t"\ 578 "packuswb %%mm5, %%mm5 \n\t"\ 579 DIFF_GT_MMX(%%mm1, %%mm2, %%mm4, %%mm7, %%mm6) /* |p0-q0| > alpha-1 */\ 580 DIFF_GT_MMX(%%mm0, %%mm1, %%mm5, %%mm4, %%mm6) /* |p1-p0| > beta-1 */\ 581 "por %%mm4, %%mm7 \n\t"\ 582 DIFF_GT_MMX(%%mm3, %%mm2, %%mm5, %%mm4, %%mm6) /* |q1-q0| > beta-1 */\ 583 "por %%mm4, %%mm7 \n\t"\ 584 "pxor %%mm6, %%mm6 \n\t"\ 585 "pcmpeqb %%mm6, %%mm7 \n\t" 586 587// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) 588// out: mm1=p0' mm2=q0' 589// clobbers: mm0,3-6 590#define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\ 591 "movq %%mm1 , %%mm5 \n\t"\ 592 "pxor %%mm2 , %%mm5 \n\t" /* p0^q0*/\ 593 "pand "#pb_01" , %%mm5 \n\t" /* (p0^q0)&1*/\ 594 "pcmpeqb %%mm4 , %%mm4 \n\t"\ 595 "pxor %%mm4 , %%mm3 \n\t"\ 596 "pavgb %%mm0 , %%mm3 \n\t" /* (p1 - q1 + 256)>>1*/\ 597 "pavgb "MANGLE(ff_pb_3)" , %%mm3 \n\t" /*(((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2*/\ 598 "pxor %%mm1 , %%mm4 \n\t"\ 599 "pavgb %%mm2 , %%mm4 \n\t" /* (q0 - p0 + 256)>>1*/\ 600 "pavgb %%mm5 , %%mm3 \n\t"\ 601 "paddusb %%mm4 , %%mm3 \n\t" /* d+128+33*/\ 602 "movq "MANGLE(ff_pb_A1)" , %%mm6 \n\t"\ 603 "psubusb %%mm3 , %%mm6 \n\t"\ 604 "psubusb "MANGLE(ff_pb_A1)" , %%mm3 \n\t"\ 605 "pminub %%mm7 , %%mm6 \n\t"\ 606 "pminub %%mm7 , %%mm3 \n\t"\ 607 "psubusb %%mm6 , %%mm1 \n\t"\ 608 "psubusb %%mm3 , %%mm2 \n\t"\ 609 "paddusb %%mm3 , %%mm1 \n\t"\ 610 "paddusb %%mm6 , %%mm2 \n\t" 611 612// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) %8=ff_bone 613// out: (q1addr) = av_clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) 614// clobbers: q2, tmp, tc0 615#define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\ 616 "movq %%mm1, "#tmp" \n\t"\ 617 "pavgb %%mm2, "#tmp" \n\t"\ 618 "pavgb "#tmp", "#q2" \n\t" /* avg(p2,avg(p0,q0)) */\ 619 "pxor "q2addr", "#tmp" \n\t"\ 620 "pand %9, "#tmp" \n\t" /* (p2^avg(p0,q0))&1 */\ 621 "psubusb "#tmp", "#q2" \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\ 622 "movq "#p1", "#tmp" \n\t"\ 623 "psubusb "#tc0", "#tmp" \n\t"\ 624 "paddusb "#p1", "#tc0" \n\t"\ 625 "pmaxub "#tmp", "#q2" \n\t"\ 626 "pminub "#tc0", "#q2" \n\t"\ 627 "movq "#q2", "q1addr" \n\t" 628 629static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) 630{ 631 DECLARE_ALIGNED(8, uint64_t, tmp0)[2]; 632 633 __asm__ volatile( 634 "movq (%2,%4), %%mm0 \n\t" //p1 635 "movq (%2,%4,2), %%mm1 \n\t" //p0 636 "movq (%3), %%mm2 \n\t" //q0 637 "movq (%3,%4), %%mm3 \n\t" //q1 638 H264_DEBLOCK_MASK(%7, %8) 639 640 "movd %6, %%mm4 \n\t" 641 "punpcklbw %%mm4, %%mm4 \n\t" 642 "punpcklwd %%mm4, %%mm4 \n\t" 643 "pcmpeqb %%mm3, %%mm3 \n\t" 644 "movq %%mm4, %%mm6 \n\t" 645 "pcmpgtb %%mm3, %%mm4 \n\t" 646 "movq %%mm6, %1 \n\t" 647 "pand %%mm4, %%mm7 \n\t" 648 "movq %%mm7, %0 \n\t" 649 650 /* filter p1 */ 651 "movq (%2), %%mm3 \n\t" //p2 652 DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1 653 "pand %%mm7, %%mm6 \n\t" // mask & |p2-p0|<beta 654 "pand %1, %%mm7 \n\t" // mask & tc0 655 "movq %%mm7, %%mm4 \n\t" 656 "psubb %%mm6, %%mm7 \n\t" 657 "pand %%mm4, %%mm6 \n\t" // mask & |p2-p0|<beta & tc0 658 H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%2)", "(%2,%4)", %%mm6, %%mm4) 659 660 /* filter q1 */ 661 "movq (%3,%4,2), %%mm4 \n\t" //q2 662 DIFF_GT2_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1 663 "pand %0, %%mm6 \n\t" 664 "movq %1, %%mm5 \n\t" // can be merged with the and below but is slower then 665 "pand %%mm6, %%mm5 \n\t" 666 "psubb %%mm6, %%mm7 \n\t" 667 "movq (%3,%4), %%mm3 \n\t" 668 H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%3,%4,2)", "(%3,%4)", %%mm5, %%mm6) 669 670 /* filter p0, q0 */ 671 H264_DEBLOCK_P0_Q0(%9, unused) 672 "movq %%mm1, (%2,%4,2) \n\t" 673 "movq %%mm2, (%3) \n\t" 674 675 : "=m"(tmp0[0]), "=m"(tmp0[1]) 676 : "r"(pix-3*stride), "r"(pix), "r"((x86_reg)stride), 677 "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1), 678 "m"(ff_bone) 679 ); 680} 681 682static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) 683{ 684 if((tc0[0] & tc0[1]) >= 0) 685 h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0); 686 if((tc0[2] & tc0[3]) >= 0) 687 h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2); 688} 689static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) 690{ 691 //FIXME: could cut some load/stores by merging transpose with filter 692 // also, it only needs to transpose 6x8 693 DECLARE_ALIGNED(8, uint8_t, trans)[8*8]; 694 int i; 695 for(i=0; i<2; i++, pix+=8*stride, tc0+=2) { 696 if((tc0[0] & tc0[1]) < 0) 697 continue; 698 transpose4x4(trans, pix-4, 8, stride); 699 transpose4x4(trans +4*8, pix, 8, stride); 700 transpose4x4(trans+4, pix-4+4*stride, 8, stride); 701 transpose4x4(trans+4+4*8, pix +4*stride, 8, stride); 702 h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0); 703 transpose4x4(pix-2, trans +2*8, stride, 8); 704 transpose4x4(pix-2+4*stride, trans+4+2*8, stride, 8); 705 } 706} 707 708static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) 709{ 710 __asm__ volatile( 711 "movq (%0), %%mm0 \n\t" //p1 712 "movq (%0,%2), %%mm1 \n\t" //p0 713 "movq (%1), %%mm2 \n\t" //q0 714 "movq (%1,%2), %%mm3 \n\t" //q1 715 H264_DEBLOCK_MASK(%4, %5) 716 "movd %3, %%mm6 \n\t" 717 "punpcklbw %%mm6, %%mm6 \n\t" 718 "pand %%mm6, %%mm7 \n\t" // mm7 = tc&mask 719 H264_DEBLOCK_P0_Q0(%6, %7) 720 "movq %%mm1, (%0,%2) \n\t" 721 "movq %%mm2, (%1) \n\t" 722 723 :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride), 724 "r"(*(uint32_t*)tc0), 725 "m"(alpha1), "m"(beta1), "m"(ff_bone), "m"(ff_pb_3F) 726 ); 727} 728 729static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) 730{ 731 h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0); 732} 733 734static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) 735{ 736 //FIXME: could cut some load/stores by merging transpose with filter 737 DECLARE_ALIGNED(8, uint8_t, trans)[8*4]; 738 transpose4x4(trans, pix-2, 8, stride); 739 transpose4x4(trans+4, pix-2+4*stride, 8, stride); 740 h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0); 741 transpose4x4(pix-2, trans, stride, 8); 742 transpose4x4(pix-2+4*stride, trans+4, stride, 8); 743} 744 745// p0 = (p0 + q1 + 2*p1 + 2) >> 2 746#define H264_FILTER_CHROMA4(p0, p1, q1, one) \ 747 "movq "#p0", %%mm4 \n\t"\ 748 "pxor "#q1", %%mm4 \n\t"\ 749 "pand "#one", %%mm4 \n\t" /* mm4 = (p0^q1)&1 */\ 750 "pavgb "#q1", "#p0" \n\t"\ 751 "psubusb %%mm4, "#p0" \n\t"\ 752 "pavgb "#p1", "#p0" \n\t" /* dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) */\ 753 754static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1) 755{ 756 __asm__ volatile( 757 "movq (%0), %%mm0 \n\t" 758 "movq (%0,%2), %%mm1 \n\t" 759 "movq (%1), %%mm2 \n\t" 760 "movq (%1,%2), %%mm3 \n\t" 761 H264_DEBLOCK_MASK(%3, %4) 762 "movq %%mm1, %%mm5 \n\t" 763 "movq %%mm2, %%mm6 \n\t" 764 H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5) //p0' 765 H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5) //q0' 766 "psubb %%mm5, %%mm1 \n\t" 767 "psubb %%mm6, %%mm2 \n\t" 768 "pand %%mm7, %%mm1 \n\t" 769 "pand %%mm7, %%mm2 \n\t" 770 "paddb %%mm5, %%mm1 \n\t" 771 "paddb %%mm6, %%mm2 \n\t" 772 "movq %%mm1, (%0,%2) \n\t" 773 "movq %%mm2, (%1) \n\t" 774 :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride), 775 "m"(alpha1), "m"(beta1), "m"(ff_bone) 776 ); 777} 778 779static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta) 780{ 781 h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1); 782} 783 784static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta) 785{ 786 //FIXME: could cut some load/stores by merging transpose with filter 787 DECLARE_ALIGNED(8, uint8_t, trans)[8*4]; 788 transpose4x4(trans, pix-2, 8, stride); 789 transpose4x4(trans+4, pix-2+4*stride, 8, stride); 790 h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1); 791 transpose4x4(pix-2, trans, stride, 8); 792 transpose4x4(pix-2+4*stride, trans+4, stride, 8); 793} 794 795static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2], 796 int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) { 797 int dir; 798 __asm__ volatile( 799 "movq %0, %%mm7 \n" 800 "movq %1, %%mm6 \n" 801 ::"m"(ff_pb_1), "m"(ff_pb_3) 802 ); 803 if(field) 804 __asm__ volatile( 805 "movq %0, %%mm6 \n" 806 ::"m"(ff_pb_3_1) 807 ); 808 __asm__ volatile( 809 "movq %%mm6, %%mm5 \n" 810 "paddb %%mm5, %%mm5 \n" 811 :); 812 813 // could do a special case for dir==0 && edges==1, but it only reduces the 814 // average filter time by 1.2% 815 for( dir=1; dir>=0; dir-- ) { 816 const x86_reg d_idx = dir ? -8 : -1; 817 const int mask_mv = dir ? mask_mv1 : mask_mv0; 818 DECLARE_ALIGNED(8, const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL; 819 int b_idx, edge; 820 for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) { 821 __asm__ volatile( 822 "pand %0, %%mm0 \n\t" 823 ::"m"(mask_dir) 824 ); 825 if(!(mask_mv & edge)) { 826 if(bidir) { 827 __asm__ volatile( 828 "movd (%1,%0), %%mm2 \n" 829 "punpckldq 40(%1,%0), %%mm2 \n" // { ref0[bn], ref1[bn] } 830 "pshufw $0x44, (%1), %%mm0 \n" // { ref0[b], ref0[b] } 831 "pshufw $0x44, 40(%1), %%mm1 \n" // { ref1[b], ref1[b] } 832 "pshufw $0x4E, %%mm2, %%mm3 \n" 833 "psubb %%mm2, %%mm0 \n" // { ref0[b]!=ref0[bn], ref0[b]!=ref1[bn] } 834 "psubb %%mm3, %%mm1 \n" // { ref1[b]!=ref1[bn], ref1[b]!=ref0[bn] } 835 "1: \n" 836 "por %%mm1, %%mm0 \n" 837 "movq (%2,%0,4), %%mm1 \n" 838 "movq 8(%2,%0,4), %%mm2 \n" 839 "movq %%mm1, %%mm3 \n" 840 "movq %%mm2, %%mm4 \n" 841 "psubw (%2), %%mm1 \n" 842 "psubw 8(%2), %%mm2 \n" 843 "psubw 160(%2), %%mm3 \n" 844 "psubw 168(%2), %%mm4 \n" 845 "packsswb %%mm2, %%mm1 \n" 846 "packsswb %%mm4, %%mm3 \n" 847 "paddb %%mm6, %%mm1 \n" 848 "paddb %%mm6, %%mm3 \n" 849 "psubusb %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit 850 "psubusb %%mm5, %%mm3 \n" 851 "packsswb %%mm3, %%mm1 \n" 852 "add $40, %0 \n" 853 "cmp $40, %0 \n" 854 "jl 1b \n" 855 "sub $80, %0 \n" 856 "pshufw $0x4E, %%mm1, %%mm1 \n" 857 "por %%mm1, %%mm0 \n" 858 "pshufw $0x4E, %%mm0, %%mm1 \n" 859 "pminub %%mm1, %%mm0 \n" 860 ::"r"(d_idx), 861 "r"(ref[0]+b_idx), 862 "r"(mv[0]+b_idx) 863 ); 864 } else { 865 __asm__ volatile( 866 "movd (%1), %%mm0 \n" 867 "psubb (%1,%0), %%mm0 \n" // ref[b] != ref[bn] 868 "movq (%2), %%mm1 \n" 869 "movq 8(%2), %%mm2 \n" 870 "psubw (%2,%0,4), %%mm1 \n" 871 "psubw 8(%2,%0,4), %%mm2 \n" 872 "packsswb %%mm2, %%mm1 \n" 873 "paddb %%mm6, %%mm1 \n" 874 "psubusb %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit 875 "packsswb %%mm1, %%mm1 \n" 876 "por %%mm1, %%mm0 \n" 877 ::"r"(d_idx), 878 "r"(ref[0]+b_idx), 879 "r"(mv[0]+b_idx) 880 ); 881 } 882 } 883 __asm__ volatile( 884 "movd %0, %%mm1 \n" 885 "por %1, %%mm1 \n" // nnz[b] || nnz[bn] 886 ::"m"(nnz[b_idx]), 887 "m"(nnz[b_idx+d_idx]) 888 ); 889 __asm__ volatile( 890 "pminub %%mm7, %%mm1 \n" 891 "pminub %%mm7, %%mm0 \n" 892 "psllw $1, %%mm1 \n" 893 "pxor %%mm2, %%mm2 \n" 894 "pmaxub %%mm0, %%mm1 \n" 895 "punpcklbw %%mm2, %%mm1 \n" 896 "movq %%mm1, %0 \n" 897 :"=m"(*bS[dir][edge]) 898 ::"memory" 899 ); 900 } 901 edges = 4; 902 step = 1; 903 } 904 __asm__ volatile( 905 "movq (%0), %%mm0 \n\t" 906 "movq 8(%0), %%mm1 \n\t" 907 "movq 16(%0), %%mm2 \n\t" 908 "movq 24(%0), %%mm3 \n\t" 909 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4) 910 "movq %%mm0, (%0) \n\t" 911 "movq %%mm3, 8(%0) \n\t" 912 "movq %%mm4, 16(%0) \n\t" 913 "movq %%mm2, 24(%0) \n\t" 914 ::"r"(bS[0]) 915 :"memory" 916 ); 917} 918 919/***********************************/ 920/* motion compensation */ 921 922#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\ 923 "mov"#q" "#C", "#T" \n\t"\ 924 "mov"#d" (%0), "#F" \n\t"\ 925 "paddw "#D", "#T" \n\t"\ 926 "psllw $2, "#T" \n\t"\ 927 "psubw "#B", "#T" \n\t"\ 928 "psubw "#E", "#T" \n\t"\ 929 "punpcklbw "#Z", "#F" \n\t"\ 930 "pmullw %4, "#T" \n\t"\ 931 "paddw %5, "#A" \n\t"\ 932 "add %2, %0 \n\t"\ 933 "paddw "#F", "#A" \n\t"\ 934 "paddw "#A", "#T" \n\t"\ 935 "psraw $5, "#T" \n\t"\ 936 "packuswb "#T", "#T" \n\t"\ 937 OP(T, (%1), A, d)\ 938 "add %3, %1 \n\t" 939 940#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\ 941 "mov"#q" "#C", "#T" \n\t"\ 942 "mov"#d" (%0), "#F" \n\t"\ 943 "paddw "#D", "#T" \n\t"\ 944 "psllw $2, "#T" \n\t"\ 945 "paddw %4, "#A" \n\t"\ 946 "psubw "#B", "#T" \n\t"\ 947 "psubw "#E", "#T" \n\t"\ 948 "punpcklbw "#Z", "#F" \n\t"\ 949 "pmullw %3, "#T" \n\t"\ 950 "paddw "#F", "#A" \n\t"\ 951 "add %2, %0 \n\t"\ 952 "paddw "#A", "#T" \n\t"\ 953 "mov"#q" "#T", "#OF"(%1) \n\t" 954 955#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q) 956#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q) 957#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa) 958#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa) 959 960 961#define QPEL_H264(OPNAME, OP, MMX)\ 962static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 963 int h=4;\ 964\ 965 __asm__ volatile(\ 966 "pxor %%mm7, %%mm7 \n\t"\ 967 "movq "MANGLE(ff_pw_5) ", %%mm4\n\t"\ 968 "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\ 969 "1: \n\t"\ 970 "movd -1(%0), %%mm1 \n\t"\ 971 "movd (%0), %%mm2 \n\t"\ 972 "movd 1(%0), %%mm3 \n\t"\ 973 "movd 2(%0), %%mm0 \n\t"\ 974 "punpcklbw %%mm7, %%mm1 \n\t"\ 975 "punpcklbw %%mm7, %%mm2 \n\t"\ 976 "punpcklbw %%mm7, %%mm3 \n\t"\ 977 "punpcklbw %%mm7, %%mm0 \n\t"\ 978 "paddw %%mm0, %%mm1 \n\t"\ 979 "paddw %%mm3, %%mm2 \n\t"\ 980 "movd -2(%0), %%mm0 \n\t"\ 981 "movd 3(%0), %%mm3 \n\t"\ 982 "punpcklbw %%mm7, %%mm0 \n\t"\ 983 "punpcklbw %%mm7, %%mm3 \n\t"\ 984 "paddw %%mm3, %%mm0 \n\t"\ 985 "psllw $2, %%mm2 \n\t"\ 986 "psubw %%mm1, %%mm2 \n\t"\ 987 "pmullw %%mm4, %%mm2 \n\t"\ 988 "paddw %%mm5, %%mm0 \n\t"\ 989 "paddw %%mm2, %%mm0 \n\t"\ 990 "psraw $5, %%mm0 \n\t"\ 991 "packuswb %%mm0, %%mm0 \n\t"\ 992 OP(%%mm0, (%1),%%mm6, d)\ 993 "add %3, %0 \n\t"\ 994 "add %4, %1 \n\t"\ 995 "decl %2 \n\t"\ 996 " jnz 1b \n\t"\ 997 : "+a"(src), "+c"(dst), "+g"(h)\ 998 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\ 999 : "memory"\ 1000 );\ 1001}\ 1002static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ 1003 int h=4;\ 1004 __asm__ volatile(\ 1005 "pxor %%mm7, %%mm7 \n\t"\ 1006 "movq %0, %%mm4 \n\t"\ 1007 "movq %1, %%mm5 \n\t"\ 1008 :: "m"(ff_pw_5), "m"(ff_pw_16)\ 1009 );\ 1010 do{\ 1011 __asm__ volatile(\ 1012 "movd -1(%0), %%mm1 \n\t"\ 1013 "movd (%0), %%mm2 \n\t"\ 1014 "movd 1(%0), %%mm3 \n\t"\ 1015 "movd 2(%0), %%mm0 \n\t"\ 1016 "punpcklbw %%mm7, %%mm1 \n\t"\ 1017 "punpcklbw %%mm7, %%mm2 \n\t"\ 1018 "punpcklbw %%mm7, %%mm3 \n\t"\ 1019 "punpcklbw %%mm7, %%mm0 \n\t"\ 1020 "paddw %%mm0, %%mm1 \n\t"\ 1021 "paddw %%mm3, %%mm2 \n\t"\ 1022 "movd -2(%0), %%mm0 \n\t"\ 1023 "movd 3(%0), %%mm3 \n\t"\ 1024 "punpcklbw %%mm7, %%mm0 \n\t"\ 1025 "punpcklbw %%mm7, %%mm3 \n\t"\ 1026 "paddw %%mm3, %%mm0 \n\t"\ 1027 "psllw $2, %%mm2 \n\t"\ 1028 "psubw %%mm1, %%mm2 \n\t"\ 1029 "pmullw %%mm4, %%mm2 \n\t"\ 1030 "paddw %%mm5, %%mm0 \n\t"\ 1031 "paddw %%mm2, %%mm0 \n\t"\ 1032 "movd (%2), %%mm3 \n\t"\ 1033 "psraw $5, %%mm0 \n\t"\ 1034 "packuswb %%mm0, %%mm0 \n\t"\ 1035 PAVGB" %%mm3, %%mm0 \n\t"\ 1036 OP(%%mm0, (%1),%%mm6, d)\ 1037 "add %4, %0 \n\t"\ 1038 "add %4, %1 \n\t"\ 1039 "add %3, %2 \n\t"\ 1040 : "+a"(src), "+c"(dst), "+d"(src2)\ 1041 : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\ 1042 : "memory"\ 1043 );\ 1044 }while(--h);\ 1045}\ 1046static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 1047 src -= 2*srcStride;\ 1048 __asm__ volatile(\ 1049 "pxor %%mm7, %%mm7 \n\t"\ 1050 "movd (%0), %%mm0 \n\t"\ 1051 "add %2, %0 \n\t"\ 1052 "movd (%0), %%mm1 \n\t"\ 1053 "add %2, %0 \n\t"\ 1054 "movd (%0), %%mm2 \n\t"\ 1055 "add %2, %0 \n\t"\ 1056 "movd (%0), %%mm3 \n\t"\ 1057 "add %2, %0 \n\t"\ 1058 "movd (%0), %%mm4 \n\t"\ 1059 "add %2, %0 \n\t"\ 1060 "punpcklbw %%mm7, %%mm0 \n\t"\ 1061 "punpcklbw %%mm7, %%mm1 \n\t"\ 1062 "punpcklbw %%mm7, %%mm2 \n\t"\ 1063 "punpcklbw %%mm7, %%mm3 \n\t"\ 1064 "punpcklbw %%mm7, %%mm4 \n\t"\ 1065 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ 1066 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ 1067 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ 1068 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ 1069 \ 1070 : "+a"(src), "+c"(dst)\ 1071 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ 1072 : "memory"\ 1073 );\ 1074}\ 1075static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ 1076 int h=4;\ 1077 int w=3;\ 1078 src -= 2*srcStride+2;\ 1079 while(w--){\ 1080 __asm__ volatile(\ 1081 "pxor %%mm7, %%mm7 \n\t"\ 1082 "movd (%0), %%mm0 \n\t"\ 1083 "add %2, %0 \n\t"\ 1084 "movd (%0), %%mm1 \n\t"\ 1085 "add %2, %0 \n\t"\ 1086 "movd (%0), %%mm2 \n\t"\ 1087 "add %2, %0 \n\t"\ 1088 "movd (%0), %%mm3 \n\t"\ 1089 "add %2, %0 \n\t"\ 1090 "movd (%0), %%mm4 \n\t"\ 1091 "add %2, %0 \n\t"\ 1092 "punpcklbw %%mm7, %%mm0 \n\t"\ 1093 "punpcklbw %%mm7, %%mm1 \n\t"\ 1094 "punpcklbw %%mm7, %%mm2 \n\t"\ 1095 "punpcklbw %%mm7, %%mm3 \n\t"\ 1096 "punpcklbw %%mm7, %%mm4 \n\t"\ 1097 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\ 1098 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\ 1099 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\ 1100 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\ 1101 \ 1102 : "+a"(src)\ 1103 : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\ 1104 : "memory"\ 1105 );\ 1106 tmp += 4;\ 1107 src += 4 - 9*srcStride;\ 1108 }\ 1109 tmp -= 3*4;\ 1110 __asm__ volatile(\ 1111 "1: \n\t"\ 1112 "movq (%0), %%mm0 \n\t"\ 1113 "paddw 10(%0), %%mm0 \n\t"\ 1114 "movq 2(%0), %%mm1 \n\t"\ 1115 "paddw 8(%0), %%mm1 \n\t"\ 1116 "movq 4(%0), %%mm2 \n\t"\ 1117 "paddw 6(%0), %%mm2 \n\t"\ 1118 "psubw %%mm1, %%mm0 \n\t"/*a-b (abccba)*/\ 1119 "psraw $2, %%mm0 \n\t"/*(a-b)/4 */\ 1120 "psubw %%mm1, %%mm0 \n\t"/*(a-b)/4-b */\ 1121 "paddsw %%mm2, %%mm0 \n\t"\ 1122 "psraw $2, %%mm0 \n\t"/*((a-b)/4-b+c)/4 */\ 1123 "paddw %%mm2, %%mm0 \n\t"/*(a-5*b+20*c)/16 */\ 1124 "psraw $6, %%mm0 \n\t"\ 1125 "packuswb %%mm0, %%mm0 \n\t"\ 1126 OP(%%mm0, (%1),%%mm7, d)\ 1127 "add $24, %0 \n\t"\ 1128 "add %3, %1 \n\t"\ 1129 "decl %2 \n\t"\ 1130 " jnz 1b \n\t"\ 1131 : "+a"(tmp), "+c"(dst), "+g"(h)\ 1132 : "S"((x86_reg)dstStride)\ 1133 : "memory"\ 1134 );\ 1135}\ 1136\ 1137static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 1138 int h=8;\ 1139 __asm__ volatile(\ 1140 "pxor %%mm7, %%mm7 \n\t"\ 1141 "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\ 1142 "1: \n\t"\ 1143 "movq (%0), %%mm0 \n\t"\ 1144 "movq 1(%0), %%mm2 \n\t"\ 1145 "movq %%mm0, %%mm1 \n\t"\ 1146 "movq %%mm2, %%mm3 \n\t"\ 1147 "punpcklbw %%mm7, %%mm0 \n\t"\ 1148 "punpckhbw %%mm7, %%mm1 \n\t"\ 1149 "punpcklbw %%mm7, %%mm2 \n\t"\ 1150 "punpckhbw %%mm7, %%mm3 \n\t"\ 1151 "paddw %%mm2, %%mm0 \n\t"\ 1152 "paddw %%mm3, %%mm1 \n\t"\ 1153 "psllw $2, %%mm0 \n\t"\ 1154 "psllw $2, %%mm1 \n\t"\ 1155 "movq -1(%0), %%mm2 \n\t"\ 1156 "movq 2(%0), %%mm4 \n\t"\ 1157 "movq %%mm2, %%mm3 \n\t"\ 1158 "movq %%mm4, %%mm5 \n\t"\ 1159 "punpcklbw %%mm7, %%mm2 \n\t"\ 1160 "punpckhbw %%mm7, %%mm3 \n\t"\ 1161 "punpcklbw %%mm7, %%mm4 \n\t"\ 1162 "punpckhbw %%mm7, %%mm5 \n\t"\ 1163 "paddw %%mm4, %%mm2 \n\t"\ 1164 "paddw %%mm3, %%mm5 \n\t"\ 1165 "psubw %%mm2, %%mm0 \n\t"\ 1166 "psubw %%mm5, %%mm1 \n\t"\ 1167 "pmullw %%mm6, %%mm0 \n\t"\ 1168 "pmullw %%mm6, %%mm1 \n\t"\ 1169 "movd -2(%0), %%mm2 \n\t"\ 1170 "movd 7(%0), %%mm5 \n\t"\ 1171 "punpcklbw %%mm7, %%mm2 \n\t"\ 1172 "punpcklbw %%mm7, %%mm5 \n\t"\ 1173 "paddw %%mm3, %%mm2 \n\t"\ 1174 "paddw %%mm5, %%mm4 \n\t"\ 1175 "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\ 1176 "paddw %%mm5, %%mm2 \n\t"\ 1177 "paddw %%mm5, %%mm4 \n\t"\ 1178 "paddw %%mm2, %%mm0 \n\t"\ 1179 "paddw %%mm4, %%mm1 \n\t"\ 1180 "psraw $5, %%mm0 \n\t"\ 1181 "psraw $5, %%mm1 \n\t"\ 1182 "packuswb %%mm1, %%mm0 \n\t"\ 1183 OP(%%mm0, (%1),%%mm5, q)\ 1184 "add %3, %0 \n\t"\ 1185 "add %4, %1 \n\t"\ 1186 "decl %2 \n\t"\ 1187 " jnz 1b \n\t"\ 1188 : "+a"(src), "+c"(dst), "+g"(h)\ 1189 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\ 1190 : "memory"\ 1191 );\ 1192}\ 1193\ 1194static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ 1195 int h=8;\ 1196 __asm__ volatile(\ 1197 "pxor %%mm7, %%mm7 \n\t"\ 1198 "movq %0, %%mm6 \n\t"\ 1199 :: "m"(ff_pw_5)\ 1200 );\ 1201 do{\ 1202 __asm__ volatile(\ 1203 "movq (%0), %%mm0 \n\t"\ 1204 "movq 1(%0), %%mm2 \n\t"\ 1205 "movq %%mm0, %%mm1 \n\t"\ 1206 "movq %%mm2, %%mm3 \n\t"\ 1207 "punpcklbw %%mm7, %%mm0 \n\t"\ 1208 "punpckhbw %%mm7, %%mm1 \n\t"\ 1209 "punpcklbw %%mm7, %%mm2 \n\t"\ 1210 "punpckhbw %%mm7, %%mm3 \n\t"\ 1211 "paddw %%mm2, %%mm0 \n\t"\ 1212 "paddw %%mm3, %%mm1 \n\t"\ 1213 "psllw $2, %%mm0 \n\t"\ 1214 "psllw $2, %%mm1 \n\t"\ 1215 "movq -1(%0), %%mm2 \n\t"\ 1216 "movq 2(%0), %%mm4 \n\t"\ 1217 "movq %%mm2, %%mm3 \n\t"\ 1218 "movq %%mm4, %%mm5 \n\t"\ 1219 "punpcklbw %%mm7, %%mm2 \n\t"\ 1220 "punpckhbw %%mm7, %%mm3 \n\t"\ 1221 "punpcklbw %%mm7, %%mm4 \n\t"\ 1222 "punpckhbw %%mm7, %%mm5 \n\t"\ 1223 "paddw %%mm4, %%mm2 \n\t"\ 1224 "paddw %%mm3, %%mm5 \n\t"\ 1225 "psubw %%mm2, %%mm0 \n\t"\ 1226 "psubw %%mm5, %%mm1 \n\t"\ 1227 "pmullw %%mm6, %%mm0 \n\t"\ 1228 "pmullw %%mm6, %%mm1 \n\t"\ 1229 "movd -2(%0), %%mm2 \n\t"\ 1230 "movd 7(%0), %%mm5 \n\t"\ 1231 "punpcklbw %%mm7, %%mm2 \n\t"\ 1232 "punpcklbw %%mm7, %%mm5 \n\t"\ 1233 "paddw %%mm3, %%mm2 \n\t"\ 1234 "paddw %%mm5, %%mm4 \n\t"\ 1235 "movq %5, %%mm5 \n\t"\ 1236 "paddw %%mm5, %%mm2 \n\t"\ 1237 "paddw %%mm5, %%mm4 \n\t"\ 1238 "paddw %%mm2, %%mm0 \n\t"\ 1239 "paddw %%mm4, %%mm1 \n\t"\ 1240 "psraw $5, %%mm0 \n\t"\ 1241 "psraw $5, %%mm1 \n\t"\ 1242 "movq (%2), %%mm4 \n\t"\ 1243 "packuswb %%mm1, %%mm0 \n\t"\ 1244 PAVGB" %%mm4, %%mm0 \n\t"\ 1245 OP(%%mm0, (%1),%%mm5, q)\ 1246 "add %4, %0 \n\t"\ 1247 "add %4, %1 \n\t"\ 1248 "add %3, %2 \n\t"\ 1249 : "+a"(src), "+c"(dst), "+d"(src2)\ 1250 : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ 1251 "m"(ff_pw_16)\ 1252 : "memory"\ 1253 );\ 1254 }while(--h);\ 1255}\ 1256\ 1257static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ 1258 int w= 2;\ 1259 src -= 2*srcStride;\ 1260 \ 1261 while(w--){\ 1262 __asm__ volatile(\ 1263 "pxor %%mm7, %%mm7 \n\t"\ 1264 "movd (%0), %%mm0 \n\t"\ 1265 "add %2, %0 \n\t"\ 1266 "movd (%0), %%mm1 \n\t"\ 1267 "add %2, %0 \n\t"\ 1268 "movd (%0), %%mm2 \n\t"\ 1269 "add %2, %0 \n\t"\ 1270 "movd (%0), %%mm3 \n\t"\ 1271 "add %2, %0 \n\t"\ 1272 "movd (%0), %%mm4 \n\t"\ 1273 "add %2, %0 \n\t"\ 1274 "punpcklbw %%mm7, %%mm0 \n\t"\ 1275 "punpcklbw %%mm7, %%mm1 \n\t"\ 1276 "punpcklbw %%mm7, %%mm2 \n\t"\ 1277 "punpcklbw %%mm7, %%mm3 \n\t"\ 1278 "punpcklbw %%mm7, %%mm4 \n\t"\ 1279 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ 1280 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ 1281 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ 1282 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ 1283 QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\ 1284 QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ 1285 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ 1286 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ 1287 \ 1288 : "+a"(src), "+c"(dst)\ 1289 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ 1290 : "memory"\ 1291 );\ 1292 if(h==16){\ 1293 __asm__ volatile(\ 1294 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ 1295 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ 1296 QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\ 1297 QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ 1298 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ 1299 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ 1300 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ 1301 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ 1302 \ 1303 : "+a"(src), "+c"(dst)\ 1304 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ 1305 : "memory"\ 1306 );\ 1307 }\ 1308 src += 4-(h+5)*srcStride;\ 1309 dst += 4-h*dstStride;\ 1310 }\ 1311}\ 1312static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\ 1313 int w = (size+8)>>2;\ 1314 src -= 2*srcStride+2;\ 1315 while(w--){\ 1316 __asm__ volatile(\ 1317 "pxor %%mm7, %%mm7 \n\t"\ 1318 "movd (%0), %%mm0 \n\t"\ 1319 "add %2, %0 \n\t"\ 1320 "movd (%0), %%mm1 \n\t"\ 1321 "add %2, %0 \n\t"\ 1322 "movd (%0), %%mm2 \n\t"\ 1323 "add %2, %0 \n\t"\ 1324 "movd (%0), %%mm3 \n\t"\ 1325 "add %2, %0 \n\t"\ 1326 "movd (%0), %%mm4 \n\t"\ 1327 "add %2, %0 \n\t"\ 1328 "punpcklbw %%mm7, %%mm0 \n\t"\ 1329 "punpcklbw %%mm7, %%mm1 \n\t"\ 1330 "punpcklbw %%mm7, %%mm2 \n\t"\ 1331 "punpcklbw %%mm7, %%mm3 \n\t"\ 1332 "punpcklbw %%mm7, %%mm4 \n\t"\ 1333 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\ 1334 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\ 1335 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\ 1336 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\ 1337 QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\ 1338 QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\ 1339 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\ 1340 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\ 1341 : "+a"(src)\ 1342 : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\ 1343 : "memory"\ 1344 );\ 1345 if(size==16){\ 1346 __asm__ volatile(\ 1347 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\ 1348 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\ 1349 QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\ 1350 QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\ 1351 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\ 1352 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\ 1353 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\ 1354 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\ 1355 : "+a"(src)\ 1356 : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\ 1357 : "memory"\ 1358 );\ 1359 }\ 1360 tmp += 4;\ 1361 src += 4 - (size+5)*srcStride;\ 1362 }\ 1363}\ 1364static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ 1365 int w = size>>4;\ 1366 do{\ 1367 int h = size;\ 1368 __asm__ volatile(\ 1369 "1: \n\t"\ 1370 "movq (%0), %%mm0 \n\t"\ 1371 "movq 8(%0), %%mm3 \n\t"\ 1372 "movq 2(%0), %%mm1 \n\t"\ 1373 "movq 10(%0), %%mm4 \n\t"\ 1374 "paddw %%mm4, %%mm0 \n\t"\ 1375 "paddw %%mm3, %%mm1 \n\t"\ 1376 "paddw 18(%0), %%mm3 \n\t"\ 1377 "paddw 16(%0), %%mm4 \n\t"\ 1378 "movq 4(%0), %%mm2 \n\t"\ 1379 "movq 12(%0), %%mm5 \n\t"\ 1380 "paddw 6(%0), %%mm2 \n\t"\ 1381 "paddw 14(%0), %%mm5 \n\t"\ 1382 "psubw %%mm1, %%mm0 \n\t"\ 1383 "psubw %%mm4, %%mm3 \n\t"\ 1384 "psraw $2, %%mm0 \n\t"\ 1385 "psraw $2, %%mm3 \n\t"\ 1386 "psubw %%mm1, %%mm0 \n\t"\ 1387 "psubw %%mm4, %%mm3 \n\t"\ 1388 "paddsw %%mm2, %%mm0 \n\t"\ 1389 "paddsw %%mm5, %%mm3 \n\t"\ 1390 "psraw $2, %%mm0 \n\t"\ 1391 "psraw $2, %%mm3 \n\t"\ 1392 "paddw %%mm2, %%mm0 \n\t"\ 1393 "paddw %%mm5, %%mm3 \n\t"\ 1394 "psraw $6, %%mm0 \n\t"\ 1395 "psraw $6, %%mm3 \n\t"\ 1396 "packuswb %%mm3, %%mm0 \n\t"\ 1397 OP(%%mm0, (%1),%%mm7, q)\ 1398 "add $48, %0 \n\t"\ 1399 "add %3, %1 \n\t"\ 1400 "decl %2 \n\t"\ 1401 " jnz 1b \n\t"\ 1402 : "+a"(tmp), "+c"(dst), "+g"(h)\ 1403 : "S"((x86_reg)dstStride)\ 1404 : "memory"\ 1405 );\ 1406 tmp += 8 - size*24;\ 1407 dst += 8 - size*dstStride;\ 1408 }while(w--);\ 1409}\ 1410\ 1411static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 1412 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\ 1413}\ 1414static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 1415 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ 1416 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ 1417}\ 1418\ 1419static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 1420 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ 1421 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ 1422 src += 8*srcStride;\ 1423 dst += 8*dstStride;\ 1424 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ 1425 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ 1426}\ 1427\ 1428static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ 1429 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ 1430 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ 1431 src += 8*dstStride;\ 1432 dst += 8*dstStride;\ 1433 src2 += 8*src2Stride;\ 1434 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ 1435 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ 1436}\ 1437\ 1438static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ 1439 put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\ 1440 OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ 1441}\ 1442static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ 1443 OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\ 1444}\ 1445\ 1446static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ 1447 OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\ 1448}\ 1449\ 1450static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ 1451{\ 1452 __asm__ volatile(\ 1453 "movq (%1), %%mm0 \n\t"\ 1454 "movq 24(%1), %%mm1 \n\t"\ 1455 "psraw $5, %%mm0 \n\t"\ 1456 "psraw $5, %%mm1 \n\t"\ 1457 "packuswb %%mm0, %%mm0 \n\t"\ 1458 "packuswb %%mm1, %%mm1 \n\t"\ 1459 PAVGB" (%0), %%mm0 \n\t"\ 1460 PAVGB" (%0,%3), %%mm1 \n\t"\ 1461 OP(%%mm0, (%2), %%mm4, d)\ 1462 OP(%%mm1, (%2,%4), %%mm5, d)\ 1463 "lea (%0,%3,2), %0 \n\t"\ 1464 "lea (%2,%4,2), %2 \n\t"\ 1465 "movq 48(%1), %%mm0 \n\t"\ 1466 "movq 72(%1), %%mm1 \n\t"\ 1467 "psraw $5, %%mm0 \n\t"\ 1468 "psraw $5, %%mm1 \n\t"\ 1469 "packuswb %%mm0, %%mm0 \n\t"\ 1470 "packuswb %%mm1, %%mm1 \n\t"\ 1471 PAVGB" (%0), %%mm0 \n\t"\ 1472 PAVGB" (%0,%3), %%mm1 \n\t"\ 1473 OP(%%mm0, (%2), %%mm4, d)\ 1474 OP(%%mm1, (%2,%4), %%mm5, d)\ 1475 :"+a"(src8), "+c"(src16), "+d"(dst)\ 1476 :"S"((x86_reg)src8Stride), "D"((x86_reg)dstStride)\ 1477 :"memory");\ 1478}\ 1479static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ 1480{\ 1481 do{\ 1482 __asm__ volatile(\ 1483 "movq (%1), %%mm0 \n\t"\ 1484 "movq 8(%1), %%mm1 \n\t"\ 1485 "movq 48(%1), %%mm2 \n\t"\ 1486 "movq 8+48(%1), %%mm3 \n\t"\ 1487 "psraw $5, %%mm0 \n\t"\ 1488 "psraw $5, %%mm1 \n\t"\ 1489 "psraw $5, %%mm2 \n\t"\ 1490 "psraw $5, %%mm3 \n\t"\ 1491 "packuswb %%mm1, %%mm0 \n\t"\ 1492 "packuswb %%mm3, %%mm2 \n\t"\ 1493 PAVGB" (%0), %%mm0 \n\t"\ 1494 PAVGB" (%0,%3), %%mm2 \n\t"\ 1495 OP(%%mm0, (%2), %%mm5, q)\ 1496 OP(%%mm2, (%2,%4), %%mm5, q)\ 1497 ::"a"(src8), "c"(src16), "d"(dst),\ 1498 "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\ 1499 :"memory");\ 1500 src8 += 2L*src8Stride;\ 1501 src16 += 48;\ 1502 dst += 2L*dstStride;\ 1503 }while(h-=2);\ 1504}\ 1505static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ 1506{\ 1507 OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\ 1508 OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\ 1509}\ 1510 1511 1512#if ARCH_X86_64 1513#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ 1514static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ 1515 int h=16;\ 1516 __asm__ volatile(\ 1517 "pxor %%xmm15, %%xmm15 \n\t"\ 1518 "movdqa %6, %%xmm14 \n\t"\ 1519 "movdqa %7, %%xmm13 \n\t"\ 1520 "1: \n\t"\ 1521 "lddqu 6(%0), %%xmm1 \n\t"\ 1522 "lddqu -2(%0), %%xmm7 \n\t"\ 1523 "movdqa %%xmm1, %%xmm0 \n\t"\ 1524 "punpckhbw %%xmm15, %%xmm1 \n\t"\ 1525 "punpcklbw %%xmm15, %%xmm0 \n\t"\ 1526 "punpcklbw %%xmm15, %%xmm7 \n\t"\ 1527 "movdqa %%xmm1, %%xmm2 \n\t"\ 1528 "movdqa %%xmm0, %%xmm6 \n\t"\ 1529 "movdqa %%xmm1, %%xmm3 \n\t"\ 1530 "movdqa %%xmm0, %%xmm8 \n\t"\ 1531 "movdqa %%xmm1, %%xmm4 \n\t"\ 1532 "movdqa %%xmm0, %%xmm9 \n\t"\ 1533 "movdqa %%xmm0, %%xmm12 \n\t"\ 1534 "movdqa %%xmm1, %%xmm11 \n\t"\ 1535 "palignr $10,%%xmm0, %%xmm11\n\t"\ 1536 "palignr $10,%%xmm7, %%xmm12\n\t"\ 1537 "palignr $2, %%xmm0, %%xmm4 \n\t"\ 1538 "palignr $2, %%xmm7, %%xmm9 \n\t"\ 1539 "palignr $4, %%xmm0, %%xmm3 \n\t"\ 1540 "palignr $4, %%xmm7, %%xmm8 \n\t"\ 1541 "palignr $6, %%xmm0, %%xmm2 \n\t"\ 1542 "palignr $6, %%xmm7, %%xmm6 \n\t"\ 1543 "paddw %%xmm0 ,%%xmm11 \n\t"\ 1544 "palignr $8, %%xmm0, %%xmm1 \n\t"\ 1545 "palignr $8, %%xmm7, %%xmm0 \n\t"\ 1546 "paddw %%xmm12,%%xmm7 \n\t"\ 1547 "paddw %%xmm3, %%xmm2 \n\t"\ 1548 "paddw %%xmm8, %%xmm6 \n\t"\ 1549 "paddw %%xmm4, %%xmm1 \n\t"\ 1550 "paddw %%xmm9, %%xmm0 \n\t"\ 1551 "psllw $2, %%xmm2 \n\t"\ 1552 "psllw $2, %%xmm6 \n\t"\ 1553 "psubw %%xmm1, %%xmm2 \n\t"\ 1554 "psubw %%xmm0, %%xmm6 \n\t"\ 1555 "paddw %%xmm13,%%xmm11 \n\t"\ 1556 "paddw %%xmm13,%%xmm7 \n\t"\ 1557 "pmullw %%xmm14,%%xmm2 \n\t"\ 1558 "pmullw %%xmm14,%%xmm6 \n\t"\ 1559 "lddqu (%2), %%xmm3 \n\t"\ 1560 "paddw %%xmm11,%%xmm2 \n\t"\ 1561 "paddw %%xmm7, %%xmm6 \n\t"\ 1562 "psraw $5, %%xmm2 \n\t"\ 1563 "psraw $5, %%xmm6 \n\t"\ 1564 "packuswb %%xmm2,%%xmm6 \n\t"\ 1565 "pavgb %%xmm3, %%xmm6 \n\t"\ 1566 OP(%%xmm6, (%1), %%xmm4, dqa)\ 1567 "add %5, %0 \n\t"\ 1568 "add %5, %1 \n\t"\ 1569 "add %4, %2 \n\t"\ 1570 "decl %3 \n\t"\ 1571 "jg 1b \n\t"\ 1572 : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\ 1573 : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ 1574 "m"(ff_pw_5), "m"(ff_pw_16)\ 1575 : "memory"\ 1576 );\ 1577} 1578#else // ARCH_X86_64 1579#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ 1580static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ 1581 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ 1582 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ 1583 src += 8*dstStride;\ 1584 dst += 8*dstStride;\ 1585 src2 += 8*src2Stride;\ 1586 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ 1587 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ 1588} 1589#endif // ARCH_X86_64 1590 1591#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\ 1592static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ 1593 int h=8;\ 1594 __asm__ volatile(\ 1595 "pxor %%xmm7, %%xmm7 \n\t"\ 1596 "movdqa %0, %%xmm6 \n\t"\ 1597 :: "m"(ff_pw_5)\ 1598 );\ 1599 do{\ 1600 __asm__ volatile(\ 1601 "lddqu -2(%0), %%xmm1 \n\t"\ 1602 "movdqa %%xmm1, %%xmm0 \n\t"\ 1603 "punpckhbw %%xmm7, %%xmm1 \n\t"\ 1604 "punpcklbw %%xmm7, %%xmm0 \n\t"\ 1605 "movdqa %%xmm1, %%xmm2 \n\t"\ 1606 "movdqa %%xmm1, %%xmm3 \n\t"\ 1607 "movdqa %%xmm1, %%xmm4 \n\t"\ 1608 "movdqa %%xmm1, %%xmm5 \n\t"\ 1609 "palignr $2, %%xmm0, %%xmm4 \n\t"\ 1610 "palignr $4, %%xmm0, %%xmm3 \n\t"\ 1611 "palignr $6, %%xmm0, %%xmm2 \n\t"\ 1612 "palignr $8, %%xmm0, %%xmm1 \n\t"\ 1613 "palignr $10,%%xmm0, %%xmm5 \n\t"\ 1614 "paddw %%xmm5, %%xmm0 \n\t"\ 1615 "paddw %%xmm3, %%xmm2 \n\t"\ 1616 "paddw %%xmm4, %%xmm1 \n\t"\ 1617 "psllw $2, %%xmm2 \n\t"\ 1618 "movq (%2), %%xmm3 \n\t"\ 1619 "psubw %%xmm1, %%xmm2 \n\t"\ 1620 "paddw %5, %%xmm0 \n\t"\ 1621 "pmullw %%xmm6, %%xmm2 \n\t"\ 1622 "paddw %%xmm0, %%xmm2 \n\t"\ 1623 "psraw $5, %%xmm2 \n\t"\ 1624 "packuswb %%xmm2, %%xmm2 \n\t"\ 1625 "pavgb %%xmm3, %%xmm2 \n\t"\ 1626 OP(%%xmm2, (%1), %%xmm4, q)\ 1627 "add %4, %0 \n\t"\ 1628 "add %4, %1 \n\t"\ 1629 "add %3, %2 \n\t"\ 1630 : "+a"(src), "+c"(dst), "+d"(src2)\ 1631 : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ 1632 "m"(ff_pw_16)\ 1633 : "memory"\ 1634 );\ 1635 }while(--h);\ 1636}\ 1637QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ 1638\ 1639static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 1640 int h=8;\ 1641 __asm__ volatile(\ 1642 "pxor %%xmm7, %%xmm7 \n\t"\ 1643 "movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\ 1644 "1: \n\t"\ 1645 "lddqu -2(%0), %%xmm1 \n\t"\ 1646 "movdqa %%xmm1, %%xmm0 \n\t"\ 1647 "punpckhbw %%xmm7, %%xmm1 \n\t"\ 1648 "punpcklbw %%xmm7, %%xmm0 \n\t"\ 1649 "movdqa %%xmm1, %%xmm2 \n\t"\ 1650 "movdqa %%xmm1, %%xmm3 \n\t"\ 1651 "movdqa %%xmm1, %%xmm4 \n\t"\ 1652 "movdqa %%xmm1, %%xmm5 \n\t"\ 1653 "palignr $2, %%xmm0, %%xmm4 \n\t"\ 1654 "palignr $4, %%xmm0, %%xmm3 \n\t"\ 1655 "palignr $6, %%xmm0, %%xmm2 \n\t"\ 1656 "palignr $8, %%xmm0, %%xmm1 \n\t"\ 1657 "palignr $10,%%xmm0, %%xmm5 \n\t"\ 1658 "paddw %%xmm5, %%xmm0 \n\t"\ 1659 "paddw %%xmm3, %%xmm2 \n\t"\ 1660 "paddw %%xmm4, %%xmm1 \n\t"\ 1661 "psllw $2, %%xmm2 \n\t"\ 1662 "psubw %%xmm1, %%xmm2 \n\t"\ 1663 "paddw "MANGLE(ff_pw_16)", %%xmm0\n\t"\ 1664 "pmullw %%xmm6, %%xmm2 \n\t"\ 1665 "paddw %%xmm0, %%xmm2 \n\t"\ 1666 "psraw $5, %%xmm2 \n\t"\ 1667 "packuswb %%xmm2, %%xmm2 \n\t"\ 1668 OP(%%xmm2, (%1), %%xmm4, q)\ 1669 "add %3, %0 \n\t"\ 1670 "add %4, %1 \n\t"\ 1671 "decl %2 \n\t"\ 1672 " jnz 1b \n\t"\ 1673 : "+a"(src), "+c"(dst), "+g"(h)\ 1674 : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride)\ 1675 : "memory"\ 1676 );\ 1677}\ 1678static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 1679 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ 1680 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ 1681 src += 8*srcStride;\ 1682 dst += 8*dstStride;\ 1683 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ 1684 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ 1685}\ 1686 1687#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\ 1688static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ 1689 src -= 2*srcStride;\ 1690 \ 1691 __asm__ volatile(\ 1692 "pxor %%xmm7, %%xmm7 \n\t"\ 1693 "movq (%0), %%xmm0 \n\t"\ 1694 "add %2, %0 \n\t"\ 1695 "movq (%0), %%xmm1 \n\t"\ 1696 "add %2, %0 \n\t"\ 1697 "movq (%0), %%xmm2 \n\t"\ 1698 "add %2, %0 \n\t"\ 1699 "movq (%0), %%xmm3 \n\t"\ 1700 "add %2, %0 \n\t"\ 1701 "movq (%0), %%xmm4 \n\t"\ 1702 "add %2, %0 \n\t"\ 1703 "punpcklbw %%xmm7, %%xmm0 \n\t"\ 1704 "punpcklbw %%xmm7, %%xmm1 \n\t"\ 1705 "punpcklbw %%xmm7, %%xmm2 \n\t"\ 1706 "punpcklbw %%xmm7, %%xmm3 \n\t"\ 1707 "punpcklbw %%xmm7, %%xmm4 \n\t"\ 1708 QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ 1709 QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ 1710 QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ 1711 QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ 1712 QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\ 1713 QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\ 1714 QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ 1715 QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ 1716 \ 1717 : "+a"(src), "+c"(dst)\ 1718 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ 1719 : "memory"\ 1720 );\ 1721 if(h==16){\ 1722 __asm__ volatile(\ 1723 QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ 1724 QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ 1725 QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\ 1726 QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\ 1727 QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ 1728 QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ 1729 QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ 1730 QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ 1731 \ 1732 : "+a"(src), "+c"(dst)\ 1733 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ 1734 : "memory"\ 1735 );\ 1736 }\ 1737}\ 1738static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 1739 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\ 1740}\ 1741static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 1742 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ 1743 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ 1744} 1745 1746static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){ 1747 int w = (size+8)>>3; 1748 src -= 2*srcStride+2; 1749 while(w--){ 1750 __asm__ volatile( 1751 "pxor %%xmm7, %%xmm7 \n\t" 1752 "movq (%0), %%xmm0 \n\t" 1753 "add %2, %0 \n\t" 1754 "movq (%0), %%xmm1 \n\t" 1755 "add %2, %0 \n\t" 1756 "movq (%0), %%xmm2 \n\t" 1757 "add %2, %0 \n\t" 1758 "movq (%0), %%xmm3 \n\t" 1759 "add %2, %0 \n\t" 1760 "movq (%0), %%xmm4 \n\t" 1761 "add %2, %0 \n\t" 1762 "punpcklbw %%xmm7, %%xmm0 \n\t" 1763 "punpcklbw %%xmm7, %%xmm1 \n\t" 1764 "punpcklbw %%xmm7, %%xmm2 \n\t" 1765 "punpcklbw %%xmm7, %%xmm3 \n\t" 1766 "punpcklbw %%xmm7, %%xmm4 \n\t" 1767 QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48) 1768 QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48) 1769 QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48) 1770 QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48) 1771 QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48) 1772 QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48) 1773 QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48) 1774 QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48) 1775 : "+a"(src) 1776 : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16) 1777 : "memory" 1778 ); 1779 if(size==16){ 1780 __asm__ volatile( 1781 QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 8*48) 1782 QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 9*48) 1783 QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48) 1784 QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48) 1785 QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48) 1786 QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48) 1787 QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48) 1788 QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48) 1789 : "+a"(src) 1790 : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16) 1791 : "memory" 1792 ); 1793 } 1794 tmp += 8; 1795 src += 8 - (size+5)*srcStride; 1796 } 1797} 1798 1799#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\ 1800static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ 1801 int h = size;\ 1802 if(size == 16){\ 1803 __asm__ volatile(\ 1804 "1: \n\t"\ 1805 "movdqa 32(%0), %%xmm4 \n\t"\ 1806 "movdqa 16(%0), %%xmm5 \n\t"\ 1807 "movdqa (%0), %%xmm7 \n\t"\ 1808 "movdqa %%xmm4, %%xmm3 \n\t"\ 1809 "movdqa %%xmm4, %%xmm2 \n\t"\ 1810 "movdqa %%xmm4, %%xmm1 \n\t"\ 1811 "movdqa %%xmm4, %%xmm0 \n\t"\ 1812 "palignr $10, %%xmm5, %%xmm0 \n\t"\ 1813 "palignr $8, %%xmm5, %%xmm1 \n\t"\ 1814 "palignr $6, %%xmm5, %%xmm2 \n\t"\ 1815 "palignr $4, %%xmm5, %%xmm3 \n\t"\ 1816 "palignr $2, %%xmm5, %%xmm4 \n\t"\ 1817 "paddw %%xmm5, %%xmm0 \n\t"\ 1818 "paddw %%xmm4, %%xmm1 \n\t"\ 1819 "paddw %%xmm3, %%xmm2 \n\t"\ 1820 "movdqa %%xmm5, %%xmm6 \n\t"\ 1821 "movdqa %%xmm5, %%xmm4 \n\t"\ 1822 "movdqa %%xmm5, %%xmm3 \n\t"\ 1823 "palignr $8, %%xmm7, %%xmm4 \n\t"\ 1824 "palignr $2, %%xmm7, %%xmm6 \n\t"\ 1825 "palignr $10, %%xmm7, %%xmm3 \n\t"\ 1826 "paddw %%xmm6, %%xmm4 \n\t"\ 1827 "movdqa %%xmm5, %%xmm6 \n\t"\ 1828 "palignr $6, %%xmm7, %%xmm5 \n\t"\ 1829 "palignr $4, %%xmm7, %%xmm6 \n\t"\ 1830 "paddw %%xmm7, %%xmm3 \n\t"\ 1831 "paddw %%xmm6, %%xmm5 \n\t"\ 1832 \ 1833 "psubw %%xmm1, %%xmm0 \n\t"\ 1834 "psubw %%xmm4, %%xmm3 \n\t"\ 1835 "psraw $2, %%xmm0 \n\t"\ 1836 "psraw $2, %%xmm3 \n\t"\ 1837 "psubw %%xmm1, %%xmm0 \n\t"\ 1838 "psubw %%xmm4, %%xmm3 \n\t"\ 1839 "paddw %%xmm2, %%xmm0 \n\t"\ 1840 "paddw %%xmm5, %%xmm3 \n\t"\ 1841 "psraw $2, %%xmm0 \n\t"\ 1842 "psraw $2, %%xmm3 \n\t"\ 1843 "paddw %%xmm2, %%xmm0 \n\t"\ 1844 "paddw %%xmm5, %%xmm3 \n\t"\ 1845 "psraw $6, %%xmm0 \n\t"\ 1846 "psraw $6, %%xmm3 \n\t"\ 1847 "packuswb %%xmm0, %%xmm3 \n\t"\ 1848 OP(%%xmm3, (%1), %%xmm7, dqa)\ 1849 "add $48, %0 \n\t"\ 1850 "add %3, %1 \n\t"\ 1851 "decl %2 \n\t"\ 1852 " jnz 1b \n\t"\ 1853 : "+a"(tmp), "+c"(dst), "+g"(h)\ 1854 : "S"((x86_reg)dstStride)\ 1855 : "memory"\ 1856 );\ 1857 }else{\ 1858 __asm__ volatile(\ 1859 "1: \n\t"\ 1860 "movdqa 16(%0), %%xmm1 \n\t"\ 1861 "movdqa (%0), %%xmm0 \n\t"\ 1862 "movdqa %%xmm1, %%xmm2 \n\t"\ 1863 "movdqa %%xmm1, %%xmm3 \n\t"\ 1864 "movdqa %%xmm1, %%xmm4 \n\t"\ 1865 "movdqa %%xmm1, %%xmm5 \n\t"\ 1866 "palignr $10, %%xmm0, %%xmm5 \n\t"\ 1867 "palignr $8, %%xmm0, %%xmm4 \n\t"\ 1868 "palignr $6, %%xmm0, %%xmm3 \n\t"\ 1869 "palignr $4, %%xmm0, %%xmm2 \n\t"\ 1870 "palignr $2, %%xmm0, %%xmm1 \n\t"\ 1871 "paddw %%xmm5, %%xmm0 \n\t"\ 1872 "paddw %%xmm4, %%xmm1 \n\t"\ 1873 "paddw %%xmm3, %%xmm2 \n\t"\ 1874 "psubw %%xmm1, %%xmm0 \n\t"\ 1875 "psraw $2, %%xmm0 \n\t"\ 1876 "psubw %%xmm1, %%xmm0 \n\t"\ 1877 "paddw %%xmm2, %%xmm0 \n\t"\ 1878 "psraw $2, %%xmm0 \n\t"\ 1879 "paddw %%xmm2, %%xmm0 \n\t"\ 1880 "psraw $6, %%xmm0 \n\t"\ 1881 "packuswb %%xmm0, %%xmm0 \n\t"\ 1882 OP(%%xmm0, (%1), %%xmm7, q)\ 1883 "add $48, %0 \n\t"\ 1884 "add %3, %1 \n\t"\ 1885 "decl %2 \n\t"\ 1886 " jnz 1b \n\t"\ 1887 : "+a"(tmp), "+c"(dst), "+g"(h)\ 1888 : "S"((x86_reg)dstStride)\ 1889 : "memory"\ 1890 );\ 1891 }\ 1892} 1893 1894#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\ 1895static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ 1896 put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\ 1897 OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ 1898}\ 1899static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ 1900 OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\ 1901}\ 1902static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ 1903 OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\ 1904}\ 1905 1906#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2 1907#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2 1908#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2 1909#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2 1910#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2 1911#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2 1912#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2 1913#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2 1914 1915#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2 1916#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2 1917#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2 1918#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2 1919#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2 1920#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2 1921#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2 1922#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2 1923 1924#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2 1925#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2 1926#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2 1927#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2 1928 1929#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2 1930#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2 1931#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2 1932#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2 1933 1934#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2 1935#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2 1936 1937#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \ 1938H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\ 1939H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\ 1940H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\ 1941H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\ 1942 1943static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){ 1944 put_pixels16_sse2(dst, src, stride, 16); 1945} 1946static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){ 1947 avg_pixels16_sse2(dst, src, stride, 16); 1948} 1949#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2 1950#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2 1951 1952#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \ 1953static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ 1954 OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\ 1955}\ 1956 1957#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \ 1958static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1959 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\ 1960}\ 1961\ 1962static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1963 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\ 1964}\ 1965\ 1966static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1967 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\ 1968}\ 1969 1970#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \ 1971static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1972 DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ 1973 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ 1974 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\ 1975}\ 1976\ 1977static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1978 OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\ 1979}\ 1980\ 1981static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1982 DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ 1983 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ 1984 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\ 1985}\ 1986 1987#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \ 1988static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1989 DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ 1990 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ 1991 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ 1992}\ 1993\ 1994static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 1995 DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ 1996 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ 1997 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ 1998}\ 1999\ 2000static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 2001 DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ 2002 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ 2003 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ 2004}\ 2005\ 2006static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 2007 DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ 2008 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ 2009 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ 2010}\ 2011\ 2012static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 2013 DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\ 2014 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\ 2015}\ 2016\ 2017static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 2018 DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ 2019 uint8_t * const halfHV= temp;\ 2020 int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ 2021 assert(((int)temp & 7) == 0);\ 2022 put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ 2023 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\ 2024}\ 2025\ 2026static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 2027 DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ 2028 uint8_t * const halfHV= temp;\ 2029 int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ 2030 assert(((int)temp & 7) == 0);\ 2031 put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ 2032 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\ 2033}\ 2034\ 2035static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 2036 DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ 2037 uint8_t * const halfHV= temp;\ 2038 int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ 2039 assert(((int)temp & 7) == 0);\ 2040 put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ 2041 OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\ 2042}\ 2043\ 2044static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 2045 DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ 2046 uint8_t * const halfHV= temp;\ 2047 int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ 2048 assert(((int)temp & 7) == 0);\ 2049 put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ 2050 OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\ 2051}\ 2052 2053#define H264_MC_4816(MMX)\ 2054H264_MC(put_, 4, MMX, 8)\ 2055H264_MC(put_, 8, MMX, 8)\ 2056H264_MC(put_, 16,MMX, 8)\ 2057H264_MC(avg_, 4, MMX, 8)\ 2058H264_MC(avg_, 8, MMX, 8)\ 2059H264_MC(avg_, 16,MMX, 8)\ 2060 2061#define H264_MC_816(QPEL, XMM)\ 2062QPEL(put_, 8, XMM, 16)\ 2063QPEL(put_, 16,XMM, 16)\ 2064QPEL(avg_, 8, XMM, 16)\ 2065QPEL(avg_, 16,XMM, 16)\ 2066 2067 2068#define AVG_3DNOW_OP(a,b,temp, size) \ 2069"mov" #size " " #b ", " #temp " \n\t"\ 2070"pavgusb " #temp ", " #a " \n\t"\ 2071"mov" #size " " #a ", " #b " \n\t" 2072#define AVG_MMX2_OP(a,b,temp, size) \ 2073"mov" #size " " #b ", " #temp " \n\t"\ 2074"pavgb " #temp ", " #a " \n\t"\ 2075"mov" #size " " #a ", " #b " \n\t" 2076 2077#define PAVGB "pavgusb" 2078QPEL_H264(put_, PUT_OP, 3dnow) 2079QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow) 2080#undef PAVGB 2081#define PAVGB "pavgb" 2082QPEL_H264(put_, PUT_OP, mmx2) 2083QPEL_H264(avg_, AVG_MMX2_OP, mmx2) 2084QPEL_H264_V_XMM(put_, PUT_OP, sse2) 2085QPEL_H264_V_XMM(avg_, AVG_MMX2_OP, sse2) 2086QPEL_H264_HV_XMM(put_, PUT_OP, sse2) 2087QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, sse2) 2088#if HAVE_SSSE3 2089QPEL_H264_H_XMM(put_, PUT_OP, ssse3) 2090QPEL_H264_H_XMM(avg_, AVG_MMX2_OP, ssse3) 2091QPEL_H264_HV2_XMM(put_, PUT_OP, ssse3) 2092QPEL_H264_HV2_XMM(avg_, AVG_MMX2_OP, ssse3) 2093QPEL_H264_HV_XMM(put_, PUT_OP, ssse3) 2094QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, ssse3) 2095#endif 2096#undef PAVGB 2097 2098H264_MC_4816(3dnow) 2099H264_MC_4816(mmx2) 2100H264_MC_816(H264_MC_V, sse2) 2101H264_MC_816(H264_MC_HV, sse2) 2102#if HAVE_SSSE3 2103H264_MC_816(H264_MC_H, ssse3) 2104H264_MC_816(H264_MC_HV, ssse3) 2105#endif 2106 2107/* rnd interleaved with rnd div 8, use p+1 to access rnd div 8 */ 2108DECLARE_ALIGNED(8, static const uint64_t, h264_rnd_reg)[4] = { 2109 0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL 2110}; 2111 2112#define H264_CHROMA_OP(S,D) 2113#define H264_CHROMA_OP4(S,D,T) 2114#define H264_CHROMA_MC8_TMPL put_h264_chroma_generic_mc8_mmx 2115#define H264_CHROMA_MC4_TMPL put_h264_chroma_generic_mc4_mmx 2116#define H264_CHROMA_MC2_TMPL put_h264_chroma_mc2_mmx2 2117#define H264_CHROMA_MC8_MV0 put_pixels8_mmx 2118#include "dsputil_h264_template_mmx.c" 2119 2120static void put_h264_chroma_mc8_mmx_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) 2121{ 2122 put_h264_chroma_generic_mc8_mmx(dst, src, stride, h, x, y, h264_rnd_reg); 2123} 2124static void put_vc1_chroma_mc8_mmx_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) 2125{ 2126 put_h264_chroma_generic_mc8_mmx(dst, src, stride, h, x, y, h264_rnd_reg+2); 2127} 2128static void put_h264_chroma_mc4_mmx(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) 2129{ 2130 put_h264_chroma_generic_mc4_mmx(dst, src, stride, h, x, y, h264_rnd_reg); 2131} 2132 2133#undef H264_CHROMA_OP 2134#undef H264_CHROMA_OP4 2135#undef H264_CHROMA_MC8_TMPL 2136#undef H264_CHROMA_MC4_TMPL 2137#undef H264_CHROMA_MC2_TMPL 2138#undef H264_CHROMA_MC8_MV0 2139 2140#define H264_CHROMA_OP(S,D) "pavgb " #S ", " #D " \n\t" 2141#define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\ 2142 "pavgb " #T ", " #D " \n\t" 2143#define H264_CHROMA_MC8_TMPL avg_h264_chroma_generic_mc8_mmx2 2144#define H264_CHROMA_MC4_TMPL avg_h264_chroma_generic_mc4_mmx2 2145#define H264_CHROMA_MC2_TMPL avg_h264_chroma_mc2_mmx2 2146#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2 2147#include "dsputil_h264_template_mmx.c" 2148static void avg_h264_chroma_mc8_mmx2_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) 2149{ 2150 avg_h264_chroma_generic_mc8_mmx2(dst, src, stride, h, x, y, h264_rnd_reg); 2151} 2152static void avg_vc1_chroma_mc8_mmx2_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) 2153{ 2154 avg_h264_chroma_generic_mc8_mmx2(dst, src, stride, h, x, y, h264_rnd_reg+2); 2155} 2156static void avg_h264_chroma_mc4_mmx2(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) 2157{ 2158 avg_h264_chroma_generic_mc4_mmx2(dst, src, stride, h, x, y, h264_rnd_reg); 2159} 2160#undef H264_CHROMA_OP 2161#undef H264_CHROMA_OP4 2162#undef H264_CHROMA_MC8_TMPL 2163#undef H264_CHROMA_MC4_TMPL 2164#undef H264_CHROMA_MC2_TMPL 2165#undef H264_CHROMA_MC8_MV0 2166 2167#define H264_CHROMA_OP(S,D) "pavgusb " #S ", " #D " \n\t" 2168#define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\ 2169 "pavgusb " #T ", " #D " \n\t" 2170#define H264_CHROMA_MC8_TMPL avg_h264_chroma_generic_mc8_3dnow 2171#define H264_CHROMA_MC4_TMPL avg_h264_chroma_generic_mc4_3dnow 2172#define H264_CHROMA_MC8_MV0 avg_pixels8_3dnow 2173#include "dsputil_h264_template_mmx.c" 2174static void avg_h264_chroma_mc8_3dnow_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) 2175{ 2176 avg_h264_chroma_generic_mc8_3dnow(dst, src, stride, h, x, y, h264_rnd_reg); 2177} 2178static void avg_h264_chroma_mc4_3dnow(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) 2179{ 2180 avg_h264_chroma_generic_mc4_3dnow(dst, src, stride, h, x, y, h264_rnd_reg); 2181} 2182#undef H264_CHROMA_OP 2183#undef H264_CHROMA_OP4 2184#undef H264_CHROMA_MC8_TMPL 2185#undef H264_CHROMA_MC4_TMPL 2186#undef H264_CHROMA_MC8_MV0 2187 2188#if HAVE_SSSE3 2189#define AVG_OP(X) 2190#undef H264_CHROMA_MC8_TMPL 2191#undef H264_CHROMA_MC4_TMPL 2192#define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_ssse3 2193#define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_ssse3 2194#define H264_CHROMA_MC8_MV0 put_pixels8_mmx 2195#include "dsputil_h264_template_ssse3.c" 2196static void put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) 2197{ 2198 put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1); 2199} 2200static void put_vc1_chroma_mc8_ssse3_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) 2201{ 2202 put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 0); 2203} 2204 2205#undef AVG_OP 2206#undef H264_CHROMA_MC8_TMPL 2207#undef H264_CHROMA_MC4_TMPL 2208#undef H264_CHROMA_MC8_MV0 2209#define AVG_OP(X) X 2210#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_ssse3 2211#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_ssse3 2212#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2 2213#include "dsputil_h264_template_ssse3.c" 2214static void avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) 2215{ 2216 avg_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1); 2217} 2218static void avg_vc1_chroma_mc8_ssse3_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) 2219{ 2220 avg_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 0); 2221} 2222#undef AVG_OP 2223#undef H264_CHROMA_MC8_TMPL 2224#undef H264_CHROMA_MC4_TMPL 2225#undef H264_CHROMA_MC8_MV0 2226#endif 2227 2228/***********************************/ 2229/* weighted prediction */ 2230 2231static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h) 2232{ 2233 int x, y; 2234 offset <<= log2_denom; 2235 offset += (1 << log2_denom) >> 1; 2236 __asm__ volatile( 2237 "movd %0, %%mm4 \n\t" 2238 "movd %1, %%mm5 \n\t" 2239 "movd %2, %%mm6 \n\t" 2240 "pshufw $0, %%mm4, %%mm4 \n\t" 2241 "pshufw $0, %%mm5, %%mm5 \n\t" 2242 "pxor %%mm7, %%mm7 \n\t" 2243 :: "g"(weight), "g"(offset), "g"(log2_denom) 2244 ); 2245 for(y=0; y<h; y+=2){ 2246 for(x=0; x<w; x+=4){ 2247 __asm__ volatile( 2248 "movd %0, %%mm0 \n\t" 2249 "movd %1, %%mm1 \n\t" 2250 "punpcklbw %%mm7, %%mm0 \n\t" 2251 "punpcklbw %%mm7, %%mm1 \n\t" 2252 "pmullw %%mm4, %%mm0 \n\t" 2253 "pmullw %%mm4, %%mm1 \n\t" 2254 "paddsw %%mm5, %%mm0 \n\t" 2255 "paddsw %%mm5, %%mm1 \n\t" 2256 "psraw %%mm6, %%mm0 \n\t" 2257 "psraw %%mm6, %%mm1 \n\t" 2258 "packuswb %%mm7, %%mm0 \n\t" 2259 "packuswb %%mm7, %%mm1 \n\t" 2260 "movd %%mm0, %0 \n\t" 2261 "movd %%mm1, %1 \n\t" 2262 : "+m"(*(uint32_t*)(dst+x)), 2263 "+m"(*(uint32_t*)(dst+x+stride)) 2264 ); 2265 } 2266 dst += 2*stride; 2267 } 2268} 2269 2270static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h) 2271{ 2272 int x, y; 2273 offset = ((offset + 1) | 1) << log2_denom; 2274 __asm__ volatile( 2275 "movd %0, %%mm3 \n\t" 2276 "movd %1, %%mm4 \n\t" 2277 "movd %2, %%mm5 \n\t" 2278 "movd %3, %%mm6 \n\t" 2279 "pshufw $0, %%mm3, %%mm3 \n\t" 2280 "pshufw $0, %%mm4, %%mm4 \n\t" 2281 "pshufw $0, %%mm5, %%mm5 \n\t" 2282 "pxor %%mm7, %%mm7 \n\t" 2283 :: "g"(weightd), "g"(weights), "g"(offset), "g"(log2_denom+1) 2284 ); 2285 for(y=0; y<h; y++){ 2286 for(x=0; x<w; x+=4){ 2287 __asm__ volatile( 2288 "movd %0, %%mm0 \n\t" 2289 "movd %1, %%mm1 \n\t" 2290 "punpcklbw %%mm7, %%mm0 \n\t" 2291 "punpcklbw %%mm7, %%mm1 \n\t" 2292 "pmullw %%mm3, %%mm0 \n\t" 2293 "pmullw %%mm4, %%mm1 \n\t" 2294 "paddsw %%mm1, %%mm0 \n\t" 2295 "paddsw %%mm5, %%mm0 \n\t" 2296 "psraw %%mm6, %%mm0 \n\t" 2297 "packuswb %%mm0, %%mm0 \n\t" 2298 "movd %%mm0, %0 \n\t" 2299 : "+m"(*(uint32_t*)(dst+x)) 2300 : "m"(*(uint32_t*)(src+x)) 2301 ); 2302 } 2303 src += stride; 2304 dst += stride; 2305 } 2306} 2307 2308#define H264_WEIGHT(W,H) \ 2309static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \ 2310 ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \ 2311} \ 2312static void ff_h264_weight_ ## W ## x ## H ## _mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \ 2313 ff_h264_weight_WxH_mmx2(dst, stride, log2_denom, weight, offset, W, H); \ 2314} 2315 2316H264_WEIGHT(16,16) 2317H264_WEIGHT(16, 8) 2318H264_WEIGHT( 8,16) 2319H264_WEIGHT( 8, 8) 2320H264_WEIGHT( 8, 4) 2321H264_WEIGHT( 4, 8) 2322H264_WEIGHT( 4, 4) 2323H264_WEIGHT( 4, 2) 2324 2325