1/* 2 * MMX optimized forward DCT 3 * The gcc porting is Copyright (c) 2001 Fabrice Bellard. 4 * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> 5 * SSE2 optimization is Copyright (c) 2004 Denes Balatoni. 6 * 7 * from fdctam32.c - AP922 MMX(3D-Now) forward-DCT 8 * 9 * Intel Application Note AP-922 - fast, precise implementation of DCT 10 * http://developer.intel.com/vtune/cbts/appnotes.htm 11 * 12 * Also of inspiration: 13 * a page about fdct at http://www.geocities.com/ssavekar/dct.htm 14 * Skal's fdct at http://skal.planet-d.net/coding/dct.html 15 * 16 * This file is part of FFmpeg. 17 * 18 * FFmpeg is free software; you can redistribute it and/or 19 * modify it under the terms of the GNU Lesser General Public 20 * License as published by the Free Software Foundation; either 21 * version 2.1 of the License, or (at your option) any later version. 22 * 23 * FFmpeg is distributed in the hope that it will be useful, 24 * but WITHOUT ANY WARRANTY; without even the implied warranty of 25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 26 * Lesser General Public License for more details. 27 * 28 * You should have received a copy of the GNU Lesser General Public 29 * License along with FFmpeg; if not, write to the Free Software 30 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 31 */ 32 33#include "libavutil/common.h" 34#include "libavcodec/dsputil.h" 35 36////////////////////////////////////////////////////////////////////// 37// 38// constants for the forward DCT 39// ----------------------------- 40// 41// Be sure to check that your compiler is aligning all constants to QWORD 42// (8-byte) memory boundaries! Otherwise the unaligned memory access will 43// severely stall MMX execution. 44// 45////////////////////////////////////////////////////////////////////// 46 47#define BITS_FRW_ACC 3 //; 2 or 3 for accuracy 48#define SHIFT_FRW_COL BITS_FRW_ACC 49#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3) 50#define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1)) 51//#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1)) 52 53#define X8(x) x,x,x,x,x,x,x,x 54 55//concatenated table, for forward DCT transformation 56DECLARE_ALIGNED(16, static const int16_t, fdct_tg_all_16)[24] = { 57 X8(13036), // tg * (2<<16) + 0.5 58 X8(27146), // tg * (2<<16) + 0.5 59 X8(-21746) // tg * (2<<16) + 0.5 60}; 61 62DECLARE_ALIGNED(16, static const int16_t, ocos_4_16)[8] = { 63 X8(23170) //cos * (2<<15) + 0.5 64}; 65 66DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) }; 67 68DECLARE_ALIGNED(8, static const int32_t, fdct_r_row)[2] = {RND_FRW_ROW, RND_FRW_ROW }; 69 70static struct 71{ 72 DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4]; 73} fdct_r_row_sse2 = 74{{ 75 RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW 76}}; 77//DECLARE_ALIGNED(16, static const long, fdct_r_row_sse2)[4] = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW}; 78 79DECLARE_ALIGNED(8, static const int16_t, tab_frw_01234567)[] = { // forward_dct coeff table 80 16384, 16384, 22725, 19266, 81 16384, 16384, 12873, 4520, 82 21407, 8867, 19266, -4520, 83 -8867, -21407, -22725, -12873, 84 16384, -16384, 12873, -22725, 85 -16384, 16384, 4520, 19266, 86 8867, -21407, 4520, -12873, 87 21407, -8867, 19266, -22725, 88 89 22725, 22725, 31521, 26722, 90 22725, 22725, 17855, 6270, 91 29692, 12299, 26722, -6270, 92 -12299, -29692, -31521, -17855, 93 22725, -22725, 17855, -31521, 94 -22725, 22725, 6270, 26722, 95 12299, -29692, 6270, -17855, 96 29692, -12299, 26722, -31521, 97 98 21407, 21407, 29692, 25172, 99 21407, 21407, 16819, 5906, 100 27969, 11585, 25172, -5906, 101 -11585, -27969, -29692, -16819, 102 21407, -21407, 16819, -29692, 103 -21407, 21407, 5906, 25172, 104 11585, -27969, 5906, -16819, 105 27969, -11585, 25172, -29692, 106 107 19266, 19266, 26722, 22654, 108 19266, 19266, 15137, 5315, 109 25172, 10426, 22654, -5315, 110 -10426, -25172, -26722, -15137, 111 19266, -19266, 15137, -26722, 112 -19266, 19266, 5315, 22654, 113 10426, -25172, 5315, -15137, 114 25172, -10426, 22654, -26722, 115 116 16384, 16384, 22725, 19266, 117 16384, 16384, 12873, 4520, 118 21407, 8867, 19266, -4520, 119 -8867, -21407, -22725, -12873, 120 16384, -16384, 12873, -22725, 121 -16384, 16384, 4520, 19266, 122 8867, -21407, 4520, -12873, 123 21407, -8867, 19266, -22725, 124 125 19266, 19266, 26722, 22654, 126 19266, 19266, 15137, 5315, 127 25172, 10426, 22654, -5315, 128 -10426, -25172, -26722, -15137, 129 19266, -19266, 15137, -26722, 130 -19266, 19266, 5315, 22654, 131 10426, -25172, 5315, -15137, 132 25172, -10426, 22654, -26722, 133 134 21407, 21407, 29692, 25172, 135 21407, 21407, 16819, 5906, 136 27969, 11585, 25172, -5906, 137 -11585, -27969, -29692, -16819, 138 21407, -21407, 16819, -29692, 139 -21407, 21407, 5906, 25172, 140 11585, -27969, 5906, -16819, 141 27969, -11585, 25172, -29692, 142 143 22725, 22725, 31521, 26722, 144 22725, 22725, 17855, 6270, 145 29692, 12299, 26722, -6270, 146 -12299, -29692, -31521, -17855, 147 22725, -22725, 17855, -31521, 148 -22725, 22725, 6270, 26722, 149 12299, -29692, 6270, -17855, 150 29692, -12299, 26722, -31521, 151}; 152 153static struct 154{ 155 DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256]; 156} tab_frw_01234567_sse2 = 157{{ 158//DECLARE_ALIGNED(16, static const int16_t, tab_frw_01234567_sse2)[] = { // forward_dct coeff table 159#define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \ 160 C4, C4, C5, C7, C2, C6, C3, -C7, \ 161 -C4, C4, C7, C3, C6, -C2, C7, -C5, \ 162 C4, -C4, C5, -C1, C2, -C6, C3, -C1, 163// c1..c7 * cos(pi/4) * 2^15 164#define C1 22725 165#define C2 21407 166#define C3 19266 167#define C4 16384 168#define C5 12873 169#define C6 8867 170#define C7 4520 171TABLE_SSE2 172 173#undef C1 174#undef C2 175#undef C3 176#undef C4 177#undef C5 178#undef C6 179#undef C7 180#define C1 31521 181#define C2 29692 182#define C3 26722 183#define C4 22725 184#define C5 17855 185#define C6 12299 186#define C7 6270 187TABLE_SSE2 188 189#undef C1 190#undef C2 191#undef C3 192#undef C4 193#undef C5 194#undef C6 195#undef C7 196#define C1 29692 197#define C2 27969 198#define C3 25172 199#define C4 21407 200#define C5 16819 201#define C6 11585 202#define C7 5906 203TABLE_SSE2 204 205#undef C1 206#undef C2 207#undef C3 208#undef C4 209#undef C5 210#undef C6 211#undef C7 212#define C1 26722 213#define C2 25172 214#define C3 22654 215#define C4 19266 216#define C5 15137 217#define C6 10426 218#define C7 5315 219TABLE_SSE2 220 221#undef C1 222#undef C2 223#undef C3 224#undef C4 225#undef C5 226#undef C6 227#undef C7 228#define C1 22725 229#define C2 21407 230#define C3 19266 231#define C4 16384 232#define C5 12873 233#define C6 8867 234#define C7 4520 235TABLE_SSE2 236 237#undef C1 238#undef C2 239#undef C3 240#undef C4 241#undef C5 242#undef C6 243#undef C7 244#define C1 26722 245#define C2 25172 246#define C3 22654 247#define C4 19266 248#define C5 15137 249#define C6 10426 250#define C7 5315 251TABLE_SSE2 252 253#undef C1 254#undef C2 255#undef C3 256#undef C4 257#undef C5 258#undef C6 259#undef C7 260#define C1 29692 261#define C2 27969 262#define C3 25172 263#define C4 21407 264#define C5 16819 265#define C6 11585 266#define C7 5906 267TABLE_SSE2 268 269#undef C1 270#undef C2 271#undef C3 272#undef C4 273#undef C5 274#undef C6 275#undef C7 276#define C1 31521 277#define C2 29692 278#define C3 26722 279#define C4 22725 280#define C5 17855 281#define C6 12299 282#define C7 6270 283TABLE_SSE2 284}}; 285 286#define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long 287 288#define FDCT_COL(cpu, mm, mov)\ 289static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\ 290{\ 291 __asm__ volatile (\ 292 #mov" 16(%0), %%"#mm"0 \n\t" \ 293 #mov" 96(%0), %%"#mm"1 \n\t" \ 294 #mov" %%"#mm"0, %%"#mm"2 \n\t" \ 295 #mov" 32(%0), %%"#mm"3 \n\t" \ 296 "paddsw %%"#mm"1, %%"#mm"0 \n\t" \ 297 #mov" 80(%0), %%"#mm"4 \n\t" \ 298 "psllw $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \ 299 #mov" (%0), %%"#mm"5 \n\t" \ 300 "paddsw %%"#mm"3, %%"#mm"4 \n\t" \ 301 "paddsw 112(%0), %%"#mm"5 \n\t" \ 302 "psllw $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \ 303 #mov" %%"#mm"0, %%"#mm"6 \n\t" \ 304 "psubsw %%"#mm"1, %%"#mm"2 \n\t" \ 305 #mov" 16(%1), %%"#mm"1 \n\t" \ 306 "psubsw %%"#mm"4, %%"#mm"0 \n\t" \ 307 #mov" 48(%0), %%"#mm"7 \n\t" \ 308 "pmulhw %%"#mm"0, %%"#mm"1 \n\t" \ 309 "paddsw 64(%0), %%"#mm"7 \n\t" \ 310 "psllw $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \ 311 "paddsw %%"#mm"4, %%"#mm"6 \n\t" \ 312 "psllw $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \ 313 #mov" %%"#mm"5, %%"#mm"4 \n\t" \ 314 "psubsw %%"#mm"7, %%"#mm"5 \n\t" \ 315 "paddsw %%"#mm"5, %%"#mm"1 \n\t" \ 316 "paddsw %%"#mm"7, %%"#mm"4 \n\t" \ 317 "por (%2), %%"#mm"1 \n\t" \ 318 "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \ 319 "pmulhw 16(%1), %%"#mm"5 \n\t" \ 320 #mov" %%"#mm"4, %%"#mm"7 \n\t" \ 321 "psubsw 80(%0), %%"#mm"3 \n\t" \ 322 "psubsw %%"#mm"6, %%"#mm"4 \n\t" \ 323 #mov" %%"#mm"1, 32(%3) \n\t" \ 324 "paddsw %%"#mm"6, %%"#mm"7 \n\t" \ 325 #mov" 48(%0), %%"#mm"1 \n\t" \ 326 "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \ 327 "psubsw 64(%0), %%"#mm"1 \n\t" \ 328 #mov" %%"#mm"2, %%"#mm"6 \n\t" \ 329 #mov" %%"#mm"4, 64(%3) \n\t" \ 330 "paddsw %%"#mm"3, %%"#mm"2 \n\t" \ 331 "pmulhw (%4), %%"#mm"2 \n\t" \ 332 "psubsw %%"#mm"3, %%"#mm"6 \n\t" \ 333 "pmulhw (%4), %%"#mm"6 \n\t" \ 334 "psubsw %%"#mm"0, %%"#mm"5 \n\t" \ 335 "por (%2), %%"#mm"5 \n\t" \ 336 "psllw $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \ 337 "por (%2), %%"#mm"2 \n\t" \ 338 #mov" %%"#mm"1, %%"#mm"4 \n\t" \ 339 #mov" (%0), %%"#mm"3 \n\t" \ 340 "paddsw %%"#mm"6, %%"#mm"1 \n\t" \ 341 "psubsw 112(%0), %%"#mm"3 \n\t" \ 342 "psubsw %%"#mm"6, %%"#mm"4 \n\t" \ 343 #mov" (%1), %%"#mm"0 \n\t" \ 344 "psllw $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \ 345 #mov" 32(%1), %%"#mm"6 \n\t" \ 346 "pmulhw %%"#mm"1, %%"#mm"0 \n\t" \ 347 #mov" %%"#mm"7, (%3) \n\t" \ 348 "pmulhw %%"#mm"4, %%"#mm"6 \n\t" \ 349 #mov" %%"#mm"5, 96(%3) \n\t" \ 350 #mov" %%"#mm"3, %%"#mm"7 \n\t" \ 351 #mov" 32(%1), %%"#mm"5 \n\t" \ 352 "psubsw %%"#mm"2, %%"#mm"7 \n\t" \ 353 "paddsw %%"#mm"2, %%"#mm"3 \n\t" \ 354 "pmulhw %%"#mm"7, %%"#mm"5 \n\t" \ 355 "paddsw %%"#mm"3, %%"#mm"0 \n\t" \ 356 "paddsw %%"#mm"4, %%"#mm"6 \n\t" \ 357 "pmulhw (%1), %%"#mm"3 \n\t" \ 358 "por (%2), %%"#mm"0 \n\t" \ 359 "paddsw %%"#mm"7, %%"#mm"5 \n\t" \ 360 "psubsw %%"#mm"6, %%"#mm"7 \n\t" \ 361 #mov" %%"#mm"0, 16(%3) \n\t" \ 362 "paddsw %%"#mm"4, %%"#mm"5 \n\t" \ 363 #mov" %%"#mm"7, 48(%3) \n\t" \ 364 "psubsw %%"#mm"1, %%"#mm"3 \n\t" \ 365 #mov" %%"#mm"5, 80(%3) \n\t" \ 366 #mov" %%"#mm"3, 112(%3) \n\t" \ 367 : \ 368 : "r" (in + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \ 369 "r" (out + offset), "r" (ocos_4_16)); \ 370} 371 372FDCT_COL(mmx, mm, movq) 373FDCT_COL(sse2, xmm, movdqa) 374 375static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out) 376{ 377 __asm__ volatile( 378#define FDCT_ROW_SSE2_H1(i,t) \ 379 "movq " #i "(%0), %%xmm2 \n\t" \ 380 "movq " #i "+8(%0), %%xmm0 \n\t" \ 381 "movdqa " #t "+32(%1), %%xmm3 \n\t" \ 382 "movdqa " #t "+48(%1), %%xmm7 \n\t" \ 383 "movdqa " #t "(%1), %%xmm4 \n\t" \ 384 "movdqa " #t "+16(%1), %%xmm5 \n\t" 385 386#define FDCT_ROW_SSE2_H2(i,t) \ 387 "movq " #i "(%0), %%xmm2 \n\t" \ 388 "movq " #i "+8(%0), %%xmm0 \n\t" \ 389 "movdqa " #t "+32(%1), %%xmm3 \n\t" \ 390 "movdqa " #t "+48(%1), %%xmm7 \n\t" 391 392#define FDCT_ROW_SSE2(i) \ 393 "movq %%xmm2, %%xmm1 \n\t" \ 394 "pshuflw $27, %%xmm0, %%xmm0 \n\t" \ 395 "paddsw %%xmm0, %%xmm1 \n\t" \ 396 "psubsw %%xmm0, %%xmm2 \n\t" \ 397 "punpckldq %%xmm2, %%xmm1 \n\t" \ 398 "pshufd $78, %%xmm1, %%xmm2 \n\t" \ 399 "pmaddwd %%xmm2, %%xmm3 \n\t" \ 400 "pmaddwd %%xmm1, %%xmm7 \n\t" \ 401 "pmaddwd %%xmm5, %%xmm2 \n\t" \ 402 "pmaddwd %%xmm4, %%xmm1 \n\t" \ 403 "paddd %%xmm7, %%xmm3 \n\t" \ 404 "paddd %%xmm2, %%xmm1 \n\t" \ 405 "paddd %%xmm6, %%xmm3 \n\t" \ 406 "paddd %%xmm6, %%xmm1 \n\t" \ 407 "psrad %3, %%xmm3 \n\t" \ 408 "psrad %3, %%xmm1 \n\t" \ 409 "packssdw %%xmm3, %%xmm1 \n\t" \ 410 "movdqa %%xmm1, " #i "(%4) \n\t" 411 412 "movdqa (%2), %%xmm6 \n\t" 413 FDCT_ROW_SSE2_H1(0,0) 414 FDCT_ROW_SSE2(0) 415 FDCT_ROW_SSE2_H2(64,0) 416 FDCT_ROW_SSE2(64) 417 418 FDCT_ROW_SSE2_H1(16,64) 419 FDCT_ROW_SSE2(16) 420 FDCT_ROW_SSE2_H2(112,64) 421 FDCT_ROW_SSE2(112) 422 423 FDCT_ROW_SSE2_H1(32,128) 424 FDCT_ROW_SSE2(32) 425 FDCT_ROW_SSE2_H2(96,128) 426 FDCT_ROW_SSE2(96) 427 428 FDCT_ROW_SSE2_H1(48,192) 429 FDCT_ROW_SSE2(48) 430 FDCT_ROW_SSE2_H2(80,192) 431 FDCT_ROW_SSE2(80) 432 : 433 : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2), "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out) 434 ); 435} 436 437static av_always_inline void fdct_row_mmx2(const int16_t *in, int16_t *out, const int16_t *table) 438{ 439 __asm__ volatile ( 440 "pshufw $0x1B, 8(%0), %%mm5 \n\t" 441 "movq (%0), %%mm0 \n\t" 442 "movq %%mm0, %%mm1 \n\t" 443 "paddsw %%mm5, %%mm0 \n\t" 444 "psubsw %%mm5, %%mm1 \n\t" 445 "movq %%mm0, %%mm2 \n\t" 446 "punpckldq %%mm1, %%mm0 \n\t" 447 "punpckhdq %%mm1, %%mm2 \n\t" 448 "movq (%1), %%mm1 \n\t" 449 "movq 8(%1), %%mm3 \n\t" 450 "movq 16(%1), %%mm4 \n\t" 451 "movq 24(%1), %%mm5 \n\t" 452 "movq 32(%1), %%mm6 \n\t" 453 "movq 40(%1), %%mm7 \n\t" 454 "pmaddwd %%mm0, %%mm1 \n\t" 455 "pmaddwd %%mm2, %%mm3 \n\t" 456 "pmaddwd %%mm0, %%mm4 \n\t" 457 "pmaddwd %%mm2, %%mm5 \n\t" 458 "pmaddwd %%mm0, %%mm6 \n\t" 459 "pmaddwd %%mm2, %%mm7 \n\t" 460 "pmaddwd 48(%1), %%mm0 \n\t" 461 "pmaddwd 56(%1), %%mm2 \n\t" 462 "paddd %%mm1, %%mm3 \n\t" 463 "paddd %%mm4, %%mm5 \n\t" 464 "paddd %%mm6, %%mm7 \n\t" 465 "paddd %%mm0, %%mm2 \n\t" 466 "movq (%2), %%mm0 \n\t" 467 "paddd %%mm0, %%mm3 \n\t" 468 "paddd %%mm0, %%mm5 \n\t" 469 "paddd %%mm0, %%mm7 \n\t" 470 "paddd %%mm0, %%mm2 \n\t" 471 "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t" 472 "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t" 473 "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t" 474 "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t" 475 "packssdw %%mm5, %%mm3 \n\t" 476 "packssdw %%mm2, %%mm7 \n\t" 477 "movq %%mm3, (%3) \n\t" 478 "movq %%mm7, 8(%3) \n\t" 479 : 480 : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out)); 481} 482 483static av_always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table) 484{ 485 //FIXME reorder (I do not have an old MMX-only CPU here to benchmark ...) 486 __asm__ volatile( 487 "movd 12(%0), %%mm1 \n\t" 488 "punpcklwd 8(%0), %%mm1 \n\t" 489 "movq %%mm1, %%mm2 \n\t" 490 "psrlq $0x20, %%mm1 \n\t" 491 "movq 0(%0), %%mm0 \n\t" 492 "punpcklwd %%mm2, %%mm1 \n\t" 493 "movq %%mm0, %%mm5 \n\t" 494 "paddsw %%mm1, %%mm0 \n\t" 495 "psubsw %%mm1, %%mm5 \n\t" 496 "movq %%mm0, %%mm2 \n\t" 497 "punpckldq %%mm5, %%mm0 \n\t" 498 "punpckhdq %%mm5, %%mm2 \n\t" 499 "movq 0(%1), %%mm1 \n\t" 500 "movq 8(%1), %%mm3 \n\t" 501 "movq 16(%1), %%mm4 \n\t" 502 "movq 24(%1), %%mm5 \n\t" 503 "movq 32(%1), %%mm6 \n\t" 504 "movq 40(%1), %%mm7 \n\t" 505 "pmaddwd %%mm0, %%mm1 \n\t" 506 "pmaddwd %%mm2, %%mm3 \n\t" 507 "pmaddwd %%mm0, %%mm4 \n\t" 508 "pmaddwd %%mm2, %%mm5 \n\t" 509 "pmaddwd %%mm0, %%mm6 \n\t" 510 "pmaddwd %%mm2, %%mm7 \n\t" 511 "pmaddwd 48(%1), %%mm0 \n\t" 512 "pmaddwd 56(%1), %%mm2 \n\t" 513 "paddd %%mm1, %%mm3 \n\t" 514 "paddd %%mm4, %%mm5 \n\t" 515 "paddd %%mm6, %%mm7 \n\t" 516 "paddd %%mm0, %%mm2 \n\t" 517 "movq (%2), %%mm0 \n\t" 518 "paddd %%mm0, %%mm3 \n\t" 519 "paddd %%mm0, %%mm5 \n\t" 520 "paddd %%mm0, %%mm7 \n\t" 521 "paddd %%mm0, %%mm2 \n\t" 522 "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t" 523 "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t" 524 "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t" 525 "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t" 526 "packssdw %%mm5, %%mm3 \n\t" 527 "packssdw %%mm2, %%mm7 \n\t" 528 "movq %%mm3, 0(%3) \n\t" 529 "movq %%mm7, 8(%3) \n\t" 530 : 531 : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out)); 532} 533 534void ff_fdct_mmx(int16_t *block) 535{ 536 DECLARE_ALIGNED(8, int64_t, align_tmp)[16]; 537 int16_t * block1= (int16_t*)align_tmp; 538 const int16_t *table= tab_frw_01234567; 539 int i; 540 541 fdct_col_mmx(block, block1, 0); 542 fdct_col_mmx(block, block1, 4); 543 544 for(i=8;i>0;i--) { 545 fdct_row_mmx(block1, block, table); 546 block1 += 8; 547 table += 32; 548 block += 8; 549 } 550} 551 552void ff_fdct_mmx2(int16_t *block) 553{ 554 DECLARE_ALIGNED(8, int64_t, align_tmp)[16]; 555 int16_t *block1= (int16_t*)align_tmp; 556 const int16_t *table= tab_frw_01234567; 557 int i; 558 559 fdct_col_mmx(block, block1, 0); 560 fdct_col_mmx(block, block1, 4); 561 562 for(i=8;i>0;i--) { 563 fdct_row_mmx2(block1, block, table); 564 block1 += 8; 565 table += 32; 566 block += 8; 567 } 568} 569 570void ff_fdct_sse2(int16_t *block) 571{ 572 DECLARE_ALIGNED(16, int64_t, align_tmp)[16]; 573 int16_t * const block1= (int16_t*)align_tmp; 574 575 fdct_col_sse2(block, block1, 0); 576 fdct_row_sse2(block1, block); 577} 578 579