1/* 2 * Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru> 3 * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at> 4 * 5 * This file is part of FFmpeg. 6 * 7 * FFmpeg is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU Lesser General Public 9 * License as published by the Free Software Foundation; either 10 * version 2.1 of the License, or (at your option) any later version. 11 * 12 * FFmpeg is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * Lesser General Public License for more details. 16 * 17 * You should have received a copy of the GNU Lesser General Public 18 * License along with FFmpeg; if not, write to the Free Software 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20 */ 21 22#include "libavutil/attributes.h" 23#include "libavutil/cpu.h" 24#include "libavutil/x86/asm.h" 25#include "libavutil/x86/cpu.h" 26#include "libavcodec/avcodec.h" 27#include "libavcodec/mpegvideo.h" 28 29#if HAVE_MMX_INLINE 30 31static void dct_unquantize_h263_intra_mmx(MpegEncContext *s, 32 int16_t *block, int n, int qscale) 33{ 34 x86_reg level, qmul, qadd, nCoeffs; 35 36 qmul = qscale << 1; 37 38 av_assert2(s->block_last_index[n]>=0 || s->h263_aic); 39 40 if (!s->h263_aic) { 41 if (n < 4) 42 level = block[0] * s->y_dc_scale; 43 else 44 level = block[0] * s->c_dc_scale; 45 qadd = (qscale - 1) | 1; 46 }else{ 47 qadd = 0; 48 level= block[0]; 49 } 50 if(s->ac_pred) 51 nCoeffs=63; 52 else 53 nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; 54 55__asm__ volatile( 56 "movd %1, %%mm6 \n\t" //qmul 57 "packssdw %%mm6, %%mm6 \n\t" 58 "packssdw %%mm6, %%mm6 \n\t" 59 "movd %2, %%mm5 \n\t" //qadd 60 "pxor %%mm7, %%mm7 \n\t" 61 "packssdw %%mm5, %%mm5 \n\t" 62 "packssdw %%mm5, %%mm5 \n\t" 63 "psubw %%mm5, %%mm7 \n\t" 64 "pxor %%mm4, %%mm4 \n\t" 65 ".p2align 4 \n\t" 66 "1: \n\t" 67 "movq (%0, %3), %%mm0 \n\t" 68 "movq 8(%0, %3), %%mm1 \n\t" 69 70 "pmullw %%mm6, %%mm0 \n\t" 71 "pmullw %%mm6, %%mm1 \n\t" 72 73 "movq (%0, %3), %%mm2 \n\t" 74 "movq 8(%0, %3), %%mm3 \n\t" 75 76 "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 77 "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 78 79 "pxor %%mm2, %%mm0 \n\t" 80 "pxor %%mm3, %%mm1 \n\t" 81 82 "paddw %%mm7, %%mm0 \n\t" 83 "paddw %%mm7, %%mm1 \n\t" 84 85 "pxor %%mm0, %%mm2 \n\t" 86 "pxor %%mm1, %%mm3 \n\t" 87 88 "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 89 "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 90 91 "pandn %%mm2, %%mm0 \n\t" 92 "pandn %%mm3, %%mm1 \n\t" 93 94 "movq %%mm0, (%0, %3) \n\t" 95 "movq %%mm1, 8(%0, %3) \n\t" 96 97 "add $16, %3 \n\t" 98 "jng 1b \n\t" 99 ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs)) 100 : "memory" 101 ); 102 block[0]= level; 103} 104 105 106static void dct_unquantize_h263_inter_mmx(MpegEncContext *s, 107 int16_t *block, int n, int qscale) 108{ 109 x86_reg qmul, qadd, nCoeffs; 110 111 qmul = qscale << 1; 112 qadd = (qscale - 1) | 1; 113 114 av_assert2(s->block_last_index[n]>=0 || s->h263_aic); 115 116 nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; 117 118__asm__ volatile( 119 "movd %1, %%mm6 \n\t" //qmul 120 "packssdw %%mm6, %%mm6 \n\t" 121 "packssdw %%mm6, %%mm6 \n\t" 122 "movd %2, %%mm5 \n\t" //qadd 123 "pxor %%mm7, %%mm7 \n\t" 124 "packssdw %%mm5, %%mm5 \n\t" 125 "packssdw %%mm5, %%mm5 \n\t" 126 "psubw %%mm5, %%mm7 \n\t" 127 "pxor %%mm4, %%mm4 \n\t" 128 ".p2align 4 \n\t" 129 "1: \n\t" 130 "movq (%0, %3), %%mm0 \n\t" 131 "movq 8(%0, %3), %%mm1 \n\t" 132 133 "pmullw %%mm6, %%mm0 \n\t" 134 "pmullw %%mm6, %%mm1 \n\t" 135 136 "movq (%0, %3), %%mm2 \n\t" 137 "movq 8(%0, %3), %%mm3 \n\t" 138 139 "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 140 "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 141 142 "pxor %%mm2, %%mm0 \n\t" 143 "pxor %%mm3, %%mm1 \n\t" 144 145 "paddw %%mm7, %%mm0 \n\t" 146 "paddw %%mm7, %%mm1 \n\t" 147 148 "pxor %%mm0, %%mm2 \n\t" 149 "pxor %%mm1, %%mm3 \n\t" 150 151 "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 152 "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 153 154 "pandn %%mm2, %%mm0 \n\t" 155 "pandn %%mm3, %%mm1 \n\t" 156 157 "movq %%mm0, (%0, %3) \n\t" 158 "movq %%mm1, 8(%0, %3) \n\t" 159 160 "add $16, %3 \n\t" 161 "jng 1b \n\t" 162 ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs)) 163 : "memory" 164 ); 165} 166 167static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s, 168 int16_t *block, int n, int qscale) 169{ 170 x86_reg nCoeffs; 171 const uint16_t *quant_matrix; 172 int block0; 173 174 av_assert2(s->block_last_index[n]>=0); 175 176 nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; 177 178 if (n < 4) 179 block0 = block[0] * s->y_dc_scale; 180 else 181 block0 = block[0] * s->c_dc_scale; 182 /* XXX: only mpeg1 */ 183 quant_matrix = s->intra_matrix; 184__asm__ volatile( 185 "pcmpeqw %%mm7, %%mm7 \n\t" 186 "psrlw $15, %%mm7 \n\t" 187 "movd %2, %%mm6 \n\t" 188 "packssdw %%mm6, %%mm6 \n\t" 189 "packssdw %%mm6, %%mm6 \n\t" 190 "mov %3, %%"REG_a" \n\t" 191 ".p2align 4 \n\t" 192 "1: \n\t" 193 "movq (%0, %%"REG_a"), %%mm0 \n\t" 194 "movq 8(%0, %%"REG_a"), %%mm1 \n\t" 195 "movq (%1, %%"REG_a"), %%mm4 \n\t" 196 "movq 8(%1, %%"REG_a"), %%mm5 \n\t" 197 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 198 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 199 "pxor %%mm2, %%mm2 \n\t" 200 "pxor %%mm3, %%mm3 \n\t" 201 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 202 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 203 "pxor %%mm2, %%mm0 \n\t" 204 "pxor %%mm3, %%mm1 \n\t" 205 "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) 206 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 207 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q 208 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q 209 "pxor %%mm4, %%mm4 \n\t" 210 "pxor %%mm5, %%mm5 \n\t" // FIXME slow 211 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 212 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 213 "psraw $3, %%mm0 \n\t" 214 "psraw $3, %%mm1 \n\t" 215 "psubw %%mm7, %%mm0 \n\t" 216 "psubw %%mm7, %%mm1 \n\t" 217 "por %%mm7, %%mm0 \n\t" 218 "por %%mm7, %%mm1 \n\t" 219 "pxor %%mm2, %%mm0 \n\t" 220 "pxor %%mm3, %%mm1 \n\t" 221 "psubw %%mm2, %%mm0 \n\t" 222 "psubw %%mm3, %%mm1 \n\t" 223 "pandn %%mm0, %%mm4 \n\t" 224 "pandn %%mm1, %%mm5 \n\t" 225 "movq %%mm4, (%0, %%"REG_a") \n\t" 226 "movq %%mm5, 8(%0, %%"REG_a") \n\t" 227 228 "add $16, %%"REG_a" \n\t" 229 "js 1b \n\t" 230 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) 231 : "%"REG_a, "memory" 232 ); 233 block[0]= block0; 234} 235 236static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s, 237 int16_t *block, int n, int qscale) 238{ 239 x86_reg nCoeffs; 240 const uint16_t *quant_matrix; 241 242 av_assert2(s->block_last_index[n]>=0); 243 244 nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; 245 246 quant_matrix = s->inter_matrix; 247__asm__ volatile( 248 "pcmpeqw %%mm7, %%mm7 \n\t" 249 "psrlw $15, %%mm7 \n\t" 250 "movd %2, %%mm6 \n\t" 251 "packssdw %%mm6, %%mm6 \n\t" 252 "packssdw %%mm6, %%mm6 \n\t" 253 "mov %3, %%"REG_a" \n\t" 254 ".p2align 4 \n\t" 255 "1: \n\t" 256 "movq (%0, %%"REG_a"), %%mm0 \n\t" 257 "movq 8(%0, %%"REG_a"), %%mm1 \n\t" 258 "movq (%1, %%"REG_a"), %%mm4 \n\t" 259 "movq 8(%1, %%"REG_a"), %%mm5 \n\t" 260 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 261 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 262 "pxor %%mm2, %%mm2 \n\t" 263 "pxor %%mm3, %%mm3 \n\t" 264 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 265 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 266 "pxor %%mm2, %%mm0 \n\t" 267 "pxor %%mm3, %%mm1 \n\t" 268 "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) 269 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 270 "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 271 "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 272 "paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1 273 "paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1 274 "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q 275 "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q 276 "pxor %%mm4, %%mm4 \n\t" 277 "pxor %%mm5, %%mm5 \n\t" // FIXME slow 278 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 279 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 280 "psraw $4, %%mm0 \n\t" 281 "psraw $4, %%mm1 \n\t" 282 "psubw %%mm7, %%mm0 \n\t" 283 "psubw %%mm7, %%mm1 \n\t" 284 "por %%mm7, %%mm0 \n\t" 285 "por %%mm7, %%mm1 \n\t" 286 "pxor %%mm2, %%mm0 \n\t" 287 "pxor %%mm3, %%mm1 \n\t" 288 "psubw %%mm2, %%mm0 \n\t" 289 "psubw %%mm3, %%mm1 \n\t" 290 "pandn %%mm0, %%mm4 \n\t" 291 "pandn %%mm1, %%mm5 \n\t" 292 "movq %%mm4, (%0, %%"REG_a") \n\t" 293 "movq %%mm5, 8(%0, %%"REG_a") \n\t" 294 295 "add $16, %%"REG_a" \n\t" 296 "js 1b \n\t" 297 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) 298 : "%"REG_a, "memory" 299 ); 300} 301 302static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s, 303 int16_t *block, int n, int qscale) 304{ 305 x86_reg nCoeffs; 306 const uint16_t *quant_matrix; 307 int block0; 308 309 av_assert2(s->block_last_index[n]>=0); 310 311 if(s->alternate_scan) nCoeffs= 63; //FIXME 312 else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; 313 314 if (n < 4) 315 block0 = block[0] * s->y_dc_scale; 316 else 317 block0 = block[0] * s->c_dc_scale; 318 quant_matrix = s->intra_matrix; 319__asm__ volatile( 320 "pcmpeqw %%mm7, %%mm7 \n\t" 321 "psrlw $15, %%mm7 \n\t" 322 "movd %2, %%mm6 \n\t" 323 "packssdw %%mm6, %%mm6 \n\t" 324 "packssdw %%mm6, %%mm6 \n\t" 325 "mov %3, %%"REG_a" \n\t" 326 ".p2align 4 \n\t" 327 "1: \n\t" 328 "movq (%0, %%"REG_a"), %%mm0 \n\t" 329 "movq 8(%0, %%"REG_a"), %%mm1 \n\t" 330 "movq (%1, %%"REG_a"), %%mm4 \n\t" 331 "movq 8(%1, %%"REG_a"), %%mm5 \n\t" 332 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 333 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 334 "pxor %%mm2, %%mm2 \n\t" 335 "pxor %%mm3, %%mm3 \n\t" 336 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 337 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 338 "pxor %%mm2, %%mm0 \n\t" 339 "pxor %%mm3, %%mm1 \n\t" 340 "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) 341 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 342 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q 343 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q 344 "pxor %%mm4, %%mm4 \n\t" 345 "pxor %%mm5, %%mm5 \n\t" // FIXME slow 346 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 347 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 348 "psraw $3, %%mm0 \n\t" 349 "psraw $3, %%mm1 \n\t" 350 "pxor %%mm2, %%mm0 \n\t" 351 "pxor %%mm3, %%mm1 \n\t" 352 "psubw %%mm2, %%mm0 \n\t" 353 "psubw %%mm3, %%mm1 \n\t" 354 "pandn %%mm0, %%mm4 \n\t" 355 "pandn %%mm1, %%mm5 \n\t" 356 "movq %%mm4, (%0, %%"REG_a") \n\t" 357 "movq %%mm5, 8(%0, %%"REG_a") \n\t" 358 359 "add $16, %%"REG_a" \n\t" 360 "jng 1b \n\t" 361 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) 362 : "%"REG_a, "memory" 363 ); 364 block[0]= block0; 365 //Note, we do not do mismatch control for intra as errors cannot accumulate 366} 367 368static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s, 369 int16_t *block, int n, int qscale) 370{ 371 x86_reg nCoeffs; 372 const uint16_t *quant_matrix; 373 374 av_assert2(s->block_last_index[n]>=0); 375 376 if(s->alternate_scan) nCoeffs= 63; //FIXME 377 else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; 378 379 quant_matrix = s->inter_matrix; 380__asm__ volatile( 381 "pcmpeqw %%mm7, %%mm7 \n\t" 382 "psrlq $48, %%mm7 \n\t" 383 "movd %2, %%mm6 \n\t" 384 "packssdw %%mm6, %%mm6 \n\t" 385 "packssdw %%mm6, %%mm6 \n\t" 386 "mov %3, %%"REG_a" \n\t" 387 ".p2align 4 \n\t" 388 "1: \n\t" 389 "movq (%0, %%"REG_a"), %%mm0 \n\t" 390 "movq 8(%0, %%"REG_a"), %%mm1 \n\t" 391 "movq (%1, %%"REG_a"), %%mm4 \n\t" 392 "movq 8(%1, %%"REG_a"), %%mm5 \n\t" 393 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 394 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 395 "pxor %%mm2, %%mm2 \n\t" 396 "pxor %%mm3, %%mm3 \n\t" 397 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 398 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 399 "pxor %%mm2, %%mm0 \n\t" 400 "pxor %%mm3, %%mm1 \n\t" 401 "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) 402 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 403 "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 404 "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 405 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q 406 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q 407 "paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q 408 "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q 409 "pxor %%mm4, %%mm4 \n\t" 410 "pxor %%mm5, %%mm5 \n\t" // FIXME slow 411 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 412 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 413 "psrlw $4, %%mm0 \n\t" 414 "psrlw $4, %%mm1 \n\t" 415 "pxor %%mm2, %%mm0 \n\t" 416 "pxor %%mm3, %%mm1 \n\t" 417 "psubw %%mm2, %%mm0 \n\t" 418 "psubw %%mm3, %%mm1 \n\t" 419 "pandn %%mm0, %%mm4 \n\t" 420 "pandn %%mm1, %%mm5 \n\t" 421 "pxor %%mm4, %%mm7 \n\t" 422 "pxor %%mm5, %%mm7 \n\t" 423 "movq %%mm4, (%0, %%"REG_a") \n\t" 424 "movq %%mm5, 8(%0, %%"REG_a") \n\t" 425 426 "add $16, %%"REG_a" \n\t" 427 "jng 1b \n\t" 428 "movd 124(%0, %3), %%mm0 \n\t" 429 "movq %%mm7, %%mm6 \n\t" 430 "psrlq $32, %%mm7 \n\t" 431 "pxor %%mm6, %%mm7 \n\t" 432 "movq %%mm7, %%mm6 \n\t" 433 "psrlq $16, %%mm7 \n\t" 434 "pxor %%mm6, %%mm7 \n\t" 435 "pslld $31, %%mm7 \n\t" 436 "psrlq $15, %%mm7 \n\t" 437 "pxor %%mm7, %%mm0 \n\t" 438 "movd %%mm0, 124(%0, %3) \n\t" 439 440 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "r" (-2*nCoeffs) 441 : "%"REG_a, "memory" 442 ); 443} 444 445#endif /* HAVE_MMX_INLINE */ 446 447av_cold void ff_MPV_common_init_x86(MpegEncContext *s) 448{ 449#if HAVE_MMX_INLINE 450 int cpu_flags = av_get_cpu_flags(); 451 452 if (INLINE_MMX(cpu_flags)) { 453 s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx; 454 s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx; 455 s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx; 456 s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx; 457 if(!(s->flags & CODEC_FLAG_BITEXACT)) 458 s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx; 459 s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx; 460 } 461#endif /* HAVE_MMX_INLINE */ 462} 463