1/* 2 * Copyright (C) 2004 the ffmpeg project 3 * 4 * This file is part of FFmpeg. 5 * 6 * FFmpeg is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * FFmpeg is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with FFmpeg; if not, write to the Free Software 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19 */ 20 21/** 22 * @file libavcodec/x86/vp3dsp_mmx.c 23 * MMX-optimized functions cribbed from the original VP3 source code. 24 */ 25 26#include "libavutil/x86_cpu.h" 27#include "libavcodec/dsputil.h" 28#include "dsputil_mmx.h" 29 30extern const uint16_t ff_vp3_idct_data[]; 31 32// this is off by one or two for some cases when filter_limit is greater than 63 33// in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1 34// out: p1 in mm4, p2 in mm3 35#define VP3_LOOP_FILTER(flim) \ 36 "movq %%mm6, %%mm7 \n\t" \ 37 "pand "MANGLE(ff_pb_7 )", %%mm6 \n\t" /* p0&7 */ \ 38 "psrlw $3, %%mm7 \n\t" \ 39 "pand "MANGLE(ff_pb_1F)", %%mm7 \n\t" /* p0>>3 */ \ 40 "movq %%mm2, %%mm3 \n\t" /* mm3 = p2 */ \ 41 "pxor %%mm4, %%mm2 \n\t" \ 42 "pand "MANGLE(ff_pb_1 )", %%mm2 \n\t" /* (p2^p1)&1 */ \ 43 "movq %%mm2, %%mm5 \n\t" \ 44 "paddb %%mm2, %%mm2 \n\t" \ 45 "paddb %%mm5, %%mm2 \n\t" /* 3*(p2^p1)&1 */ \ 46 "paddb %%mm6, %%mm2 \n\t" /* extra bits lost in shifts */ \ 47 "pcmpeqb %%mm0, %%mm0 \n\t" \ 48 "pxor %%mm0, %%mm1 \n\t" /* 255 - p3 */ \ 49 "pavgb %%mm2, %%mm1 \n\t" /* (256 - p3 + extrabits) >> 1 */ \ 50 "pxor %%mm4, %%mm0 \n\t" /* 255 - p1 */ \ 51 "pavgb %%mm3, %%mm0 \n\t" /* (256 + p2-p1) >> 1 */ \ 52 "paddb "MANGLE(ff_pb_3 )", %%mm1 \n\t" \ 53 "pavgb %%mm0, %%mm1 \n\t" /* 128+2+( p2-p1 - p3) >> 2 */ \ 54 "pavgb %%mm0, %%mm1 \n\t" /* 128+1+(3*(p2-p1) - p3) >> 3 */ \ 55 "paddusb %%mm1, %%mm7 \n\t" /* d+128+1 */ \ 56 "movq "MANGLE(ff_pb_81)", %%mm6 \n\t" \ 57 "psubusb %%mm7, %%mm6 \n\t" \ 58 "psubusb "MANGLE(ff_pb_81)", %%mm7 \n\t" \ 59\ 60 "movq "#flim", %%mm5 \n\t" \ 61 "pminub %%mm5, %%mm6 \n\t" \ 62 "pminub %%mm5, %%mm7 \n\t" \ 63 "movq %%mm6, %%mm0 \n\t" \ 64 "movq %%mm7, %%mm1 \n\t" \ 65 "paddb %%mm6, %%mm6 \n\t" \ 66 "paddb %%mm7, %%mm7 \n\t" \ 67 "pminub %%mm5, %%mm6 \n\t" \ 68 "pminub %%mm5, %%mm7 \n\t" \ 69 "psubb %%mm0, %%mm6 \n\t" \ 70 "psubb %%mm1, %%mm7 \n\t" \ 71 "paddusb %%mm7, %%mm4 \n\t" \ 72 "psubusb %%mm6, %%mm4 \n\t" \ 73 "psubusb %%mm7, %%mm3 \n\t" \ 74 "paddusb %%mm6, %%mm3 \n\t" 75 76#define STORE_4_WORDS(dst0, dst1, dst2, dst3, mm) \ 77 "movd "#mm", %0 \n\t" \ 78 "movw %w0, -1"#dst0" \n\t" \ 79 "psrlq $32, "#mm" \n\t" \ 80 "shr $16, %0 \n\t" \ 81 "movw %w0, -1"#dst1" \n\t" \ 82 "movd "#mm", %0 \n\t" \ 83 "movw %w0, -1"#dst2" \n\t" \ 84 "shr $16, %0 \n\t" \ 85 "movw %w0, -1"#dst3" \n\t" 86 87void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values) 88{ 89 __asm__ volatile( 90 "movq %0, %%mm6 \n\t" 91 "movq %1, %%mm4 \n\t" 92 "movq %2, %%mm2 \n\t" 93 "movq %3, %%mm1 \n\t" 94 95 VP3_LOOP_FILTER(%4) 96 97 "movq %%mm4, %1 \n\t" 98 "movq %%mm3, %2 \n\t" 99 100 : "+m" (*(uint64_t*)(src - 2*stride)), 101 "+m" (*(uint64_t*)(src - 1*stride)), 102 "+m" (*(uint64_t*)(src + 0*stride)), 103 "+m" (*(uint64_t*)(src + 1*stride)) 104 : "m"(*(uint64_t*)(bounding_values+129)) 105 ); 106} 107 108void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values) 109{ 110 x86_reg tmp; 111 112 __asm__ volatile( 113 "movd -2(%1), %%mm6 \n\t" 114 "movd -2(%1,%3), %%mm0 \n\t" 115 "movd -2(%1,%3,2), %%mm1 \n\t" 116 "movd -2(%1,%4), %%mm4 \n\t" 117 118 TRANSPOSE8x4(%%mm6, %%mm0, %%mm1, %%mm4, -2(%2), -2(%2,%3), -2(%2,%3,2), -2(%2,%4), %%mm2) 119 VP3_LOOP_FILTER(%5) 120 SBUTTERFLY(%%mm4, %%mm3, %%mm5, bw, q) 121 122 STORE_4_WORDS((%1), (%1,%3), (%1,%3,2), (%1,%4), %%mm4) 123 STORE_4_WORDS((%2), (%2,%3), (%2,%3,2), (%2,%4), %%mm5) 124 125 : "=&r"(tmp) 126 : "r"(src), "r"(src+4*stride), "r"((x86_reg)stride), "r"((x86_reg)3*stride), 127 "m"(*(uint64_t*)(bounding_values+129)) 128 : "memory" 129 ); 130} 131 132/* from original comments: The Macro does IDct on 4 1-D Dcts */ 133#define BeginIDCT() \ 134 "movq "I(3)", %%mm2 \n\t" \ 135 "movq "C(3)", %%mm6 \n\t" \ 136 "movq %%mm2, %%mm4 \n\t" \ 137 "movq "J(5)", %%mm7 \n\t" \ 138 "pmulhw %%mm6, %%mm4 \n\t" /* r4 = c3*i3 - i3 */ \ 139 "movq "C(5)", %%mm1 \n\t" \ 140 "pmulhw %%mm7, %%mm6 \n\t" /* r6 = c3*i5 - i5 */ \ 141 "movq %%mm1, %%mm5 \n\t" \ 142 "pmulhw %%mm2, %%mm1 \n\t" /* r1 = c5*i3 - i3 */ \ 143 "movq "I(1)", %%mm3 \n\t" \ 144 "pmulhw %%mm7, %%mm5 \n\t" /* r5 = c5*i5 - i5 */ \ 145 "movq "C(1)", %%mm0 \n\t" \ 146 "paddw %%mm2, %%mm4 \n\t" /* r4 = c3*i3 */ \ 147 "paddw %%mm7, %%mm6 \n\t" /* r6 = c3*i5 */ \ 148 "paddw %%mm1, %%mm2 \n\t" /* r2 = c5*i3 */ \ 149 "movq "J(7)", %%mm1 \n\t" \ 150 "paddw %%mm5, %%mm7 \n\t" /* r7 = c5*i5 */ \ 151 "movq %%mm0, %%mm5 \n\t" /* r5 = c1 */ \ 152 "pmulhw %%mm3, %%mm0 \n\t" /* r0 = c1*i1 - i1 */ \ 153 "paddsw %%mm7, %%mm4 \n\t" /* r4 = C = c3*i3 + c5*i5 */ \ 154 "pmulhw %%mm1, %%mm5 \n\t" /* r5 = c1*i7 - i7 */ \ 155 "movq "C(7)", %%mm7 \n\t" \ 156 "psubsw %%mm2, %%mm6 \n\t" /* r6 = D = c3*i5 - c5*i3 */ \ 157 "paddw %%mm3, %%mm0 \n\t" /* r0 = c1*i1 */ \ 158 "pmulhw %%mm7, %%mm3 \n\t" /* r3 = c7*i1 */ \ 159 "movq "I(2)", %%mm2 \n\t" \ 160 "pmulhw %%mm1, %%mm7 \n\t" /* r7 = c7*i7 */ \ 161 "paddw %%mm1, %%mm5 \n\t" /* r5 = c1*i7 */ \ 162 "movq %%mm2, %%mm1 \n\t" /* r1 = i2 */ \ 163 "pmulhw "C(2)", %%mm2 \n\t" /* r2 = c2*i2 - i2 */ \ 164 "psubsw %%mm5, %%mm3 \n\t" /* r3 = B = c7*i1 - c1*i7 */ \ 165 "movq "J(6)", %%mm5 \n\t" \ 166 "paddsw %%mm7, %%mm0 \n\t" /* r0 = A = c1*i1 + c7*i7 */ \ 167 "movq %%mm5, %%mm7 \n\t" /* r7 = i6 */ \ 168 "psubsw %%mm4, %%mm0 \n\t" /* r0 = A - C */ \ 169 "pmulhw "C(2)", %%mm5 \n\t" /* r5 = c2*i6 - i6 */ \ 170 "paddw %%mm1, %%mm2 \n\t" /* r2 = c2*i2 */ \ 171 "pmulhw "C(6)", %%mm1 \n\t" /* r1 = c6*i2 */ \ 172 "paddsw %%mm4, %%mm4 \n\t" /* r4 = C + C */ \ 173 "paddsw %%mm0, %%mm4 \n\t" /* r4 = C. = A + C */ \ 174 "psubsw %%mm6, %%mm3 \n\t" /* r3 = B - D */ \ 175 "paddw %%mm7, %%mm5 \n\t" /* r5 = c2*i6 */ \ 176 "paddsw %%mm6, %%mm6 \n\t" /* r6 = D + D */ \ 177 "pmulhw "C(6)", %%mm7 \n\t" /* r7 = c6*i6 */ \ 178 "paddsw %%mm3, %%mm6 \n\t" /* r6 = D. = B + D */ \ 179 "movq %%mm4, "I(1)"\n\t" /* save C. at I(1) */ \ 180 "psubsw %%mm5, %%mm1 \n\t" /* r1 = H = c6*i2 - c2*i6 */ \ 181 "movq "C(4)", %%mm4 \n\t" \ 182 "movq %%mm3, %%mm5 \n\t" /* r5 = B - D */ \ 183 "pmulhw %%mm4, %%mm3 \n\t" /* r3 = (c4 - 1) * (B - D) */ \ 184 "paddsw %%mm2, %%mm7 \n\t" /* r3 = (c4 - 1) * (B - D) */ \ 185 "movq %%mm6, "I(2)"\n\t" /* save D. at I(2) */ \ 186 "movq %%mm0, %%mm2 \n\t" /* r2 = A - C */ \ 187 "movq "I(0)", %%mm6 \n\t" \ 188 "pmulhw %%mm4, %%mm0 \n\t" /* r0 = (c4 - 1) * (A - C) */ \ 189 "paddw %%mm3, %%mm5 \n\t" /* r5 = B. = c4 * (B - D) */ \ 190 "movq "J(4)", %%mm3 \n\t" \ 191 "psubsw %%mm1, %%mm5 \n\t" /* r5 = B.. = B. - H */ \ 192 "paddw %%mm0, %%mm2 \n\t" /* r0 = A. = c4 * (A - C) */ \ 193 "psubsw %%mm3, %%mm6 \n\t" /* r6 = i0 - i4 */ \ 194 "movq %%mm6, %%mm0 \n\t" \ 195 "pmulhw %%mm4, %%mm6 \n\t" /* r6 = (c4 - 1) * (i0 - i4) */ \ 196 "paddsw %%mm3, %%mm3 \n\t" /* r3 = i4 + i4 */ \ 197 "paddsw %%mm1, %%mm1 \n\t" /* r1 = H + H */ \ 198 "paddsw %%mm0, %%mm3 \n\t" /* r3 = i0 + i4 */ \ 199 "paddsw %%mm5, %%mm1 \n\t" /* r1 = H. = B + H */ \ 200 "pmulhw %%mm3, %%mm4 \n\t" /* r4 = (c4 - 1) * (i0 + i4) */ \ 201 "paddsw %%mm0, %%mm6 \n\t" /* r6 = F = c4 * (i0 - i4) */ \ 202 "psubsw %%mm2, %%mm6 \n\t" /* r6 = F. = F - A. */ \ 203 "paddsw %%mm2, %%mm2 \n\t" /* r2 = A. + A. */ \ 204 "movq "I(1)", %%mm0 \n\t" /* r0 = C. */ \ 205 "paddsw %%mm6, %%mm2 \n\t" /* r2 = A.. = F + A. */ \ 206 "paddw %%mm3, %%mm4 \n\t" /* r4 = E = c4 * (i0 + i4) */ \ 207 "psubsw %%mm1, %%mm2 \n\t" /* r2 = R2 = A.. - H. */ 208 209/* RowIDCT gets ready to transpose */ 210#define RowIDCT() \ 211 BeginIDCT() \ 212 "movq "I(2)", %%mm3 \n\t" /* r3 = D. */ \ 213 "psubsw %%mm7, %%mm4 \n\t" /* r4 = E. = E - G */ \ 214 "paddsw %%mm1, %%mm1 \n\t" /* r1 = H. + H. */ \ 215 "paddsw %%mm7, %%mm7 \n\t" /* r7 = G + G */ \ 216 "paddsw %%mm2, %%mm1 \n\t" /* r1 = R1 = A.. + H. */ \ 217 "paddsw %%mm4, %%mm7 \n\t" /* r1 = R1 = A.. + H. */ \ 218 "psubsw %%mm3, %%mm4 \n\t" /* r4 = R4 = E. - D. */ \ 219 "paddsw %%mm3, %%mm3 \n\t" \ 220 "psubsw %%mm5, %%mm6 \n\t" /* r6 = R6 = F. - B.. */ \ 221 "paddsw %%mm5, %%mm5 \n\t" \ 222 "paddsw %%mm4, %%mm3 \n\t" /* r3 = R3 = E. + D. */ \ 223 "paddsw %%mm6, %%mm5 \n\t" /* r5 = R5 = F. + B.. */ \ 224 "psubsw %%mm0, %%mm7 \n\t" /* r7 = R7 = G. - C. */ \ 225 "paddsw %%mm0, %%mm0 \n\t" \ 226 "movq %%mm1, "I(1)"\n\t" /* save R1 */ \ 227 "paddsw %%mm7, %%mm0 \n\t" /* r0 = R0 = G. + C. */ 228 229/* Column IDCT normalizes and stores final results */ 230#define ColumnIDCT() \ 231 BeginIDCT() \ 232 "paddsw "OC_8", %%mm2 \n\t" /* adjust R2 (and R1) for shift */ \ 233 "paddsw %%mm1, %%mm1 \n\t" /* r1 = H. + H. */ \ 234 "paddsw %%mm2, %%mm1 \n\t" /* r1 = R1 = A.. + H. */ \ 235 "psraw $4, %%mm2 \n\t" /* r2 = NR2 */ \ 236 "psubsw %%mm7, %%mm4 \n\t" /* r4 = E. = E - G */ \ 237 "psraw $4, %%mm1 \n\t" /* r1 = NR1 */ \ 238 "movq "I(2)", %%mm3 \n\t" /* r3 = D. */ \ 239 "paddsw %%mm7, %%mm7 \n\t" /* r7 = G + G */ \ 240 "movq %%mm2, "I(2)"\n\t" /* store NR2 at I2 */ \ 241 "paddsw %%mm4, %%mm7 \n\t" /* r7 = G. = E + G */ \ 242 "movq %%mm1, "I(1)"\n\t" /* store NR1 at I1 */ \ 243 "psubsw %%mm3, %%mm4 \n\t" /* r4 = R4 = E. - D. */ \ 244 "paddsw "OC_8", %%mm4 \n\t" /* adjust R4 (and R3) for shift */ \ 245 "paddsw %%mm3, %%mm3 \n\t" /* r3 = D. + D. */ \ 246 "paddsw %%mm4, %%mm3 \n\t" /* r3 = R3 = E. + D. */ \ 247 "psraw $4, %%mm4 \n\t" /* r4 = NR4 */ \ 248 "psubsw %%mm5, %%mm6 \n\t" /* r6 = R6 = F. - B.. */ \ 249 "psraw $4, %%mm3 \n\t" /* r3 = NR3 */ \ 250 "paddsw "OC_8", %%mm6 \n\t" /* adjust R6 (and R5) for shift */ \ 251 "paddsw %%mm5, %%mm5 \n\t" /* r5 = B.. + B.. */ \ 252 "paddsw %%mm6, %%mm5 \n\t" /* r5 = R5 = F. + B.. */ \ 253 "psraw $4, %%mm6 \n\t" /* r6 = NR6 */ \ 254 "movq %%mm4, "J(4)"\n\t" /* store NR4 at J4 */ \ 255 "psraw $4, %%mm5 \n\t" /* r5 = NR5 */ \ 256 "movq %%mm3, "I(3)"\n\t" /* store NR3 at I3 */ \ 257 "psubsw %%mm0, %%mm7 \n\t" /* r7 = R7 = G. - C. */ \ 258 "paddsw "OC_8", %%mm7 \n\t" /* adjust R7 (and R0) for shift */ \ 259 "paddsw %%mm0, %%mm0 \n\t" /* r0 = C. + C. */ \ 260 "paddsw %%mm7, %%mm0 \n\t" /* r0 = R0 = G. + C. */ \ 261 "psraw $4, %%mm7 \n\t" /* r7 = NR7 */ \ 262 "movq %%mm6, "J(6)"\n\t" /* store NR6 at J6 */ \ 263 "psraw $4, %%mm0 \n\t" /* r0 = NR0 */ \ 264 "movq %%mm5, "J(5)"\n\t" /* store NR5 at J5 */ \ 265 "movq %%mm7, "J(7)"\n\t" /* store NR7 at J7 */ \ 266 "movq %%mm0, "I(0)"\n\t" /* store NR0 at I0 */ 267 268/* Following macro does two 4x4 transposes in place. 269 270 At entry (we assume): 271 272 r0 = a3 a2 a1 a0 273 I(1) = b3 b2 b1 b0 274 r2 = c3 c2 c1 c0 275 r3 = d3 d2 d1 d0 276 277 r4 = e3 e2 e1 e0 278 r5 = f3 f2 f1 f0 279 r6 = g3 g2 g1 g0 280 r7 = h3 h2 h1 h0 281 282 At exit, we have: 283 284 I(0) = d0 c0 b0 a0 285 I(1) = d1 c1 b1 a1 286 I(2) = d2 c2 b2 a2 287 I(3) = d3 c3 b3 a3 288 289 J(4) = h0 g0 f0 e0 290 J(5) = h1 g1 f1 e1 291 J(6) = h2 g2 f2 e2 292 J(7) = h3 g3 f3 e3 293 294 I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3. 295 J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7. 296 297 Since r1 is free at entry, we calculate the Js first. */ 298#define Transpose() \ 299 "movq %%mm4, %%mm1 \n\t" /* r1 = e3 e2 e1 e0 */ \ 300 "punpcklwd %%mm5, %%mm4 \n\t" /* r4 = f1 e1 f0 e0 */ \ 301 "movq %%mm0, "I(0)"\n\t" /* save a3 a2 a1 a0 */ \ 302 "punpckhwd %%mm5, %%mm1 \n\t" /* r1 = f3 e3 f2 e2 */ \ 303 "movq %%mm6, %%mm0 \n\t" /* r0 = g3 g2 g1 g0 */ \ 304 "punpcklwd %%mm7, %%mm6 \n\t" /* r6 = h1 g1 h0 g0 */ \ 305 "movq %%mm4, %%mm5 \n\t" /* r5 = f1 e1 f0 e0 */ \ 306 "punpckldq %%mm6, %%mm4 \n\t" /* r4 = h0 g0 f0 e0 = R4 */ \ 307 "punpckhdq %%mm6, %%mm5 \n\t" /* r5 = h1 g1 f1 e1 = R5 */ \ 308 "movq %%mm1, %%mm6 \n\t" /* r6 = f3 e3 f2 e2 */ \ 309 "movq %%mm4, "J(4)"\n\t" \ 310 "punpckhwd %%mm7, %%mm0 \n\t" /* r0 = h3 g3 h2 g2 */ \ 311 "movq %%mm5, "J(5)"\n\t" \ 312 "punpckhdq %%mm0, %%mm6 \n\t" /* r6 = h3 g3 f3 e3 = R7 */ \ 313 "movq "I(0)", %%mm4 \n\t" /* r4 = a3 a2 a1 a0 */ \ 314 "punpckldq %%mm0, %%mm1 \n\t" /* r1 = h2 g2 f2 e2 = R6 */ \ 315 "movq "I(1)", %%mm5 \n\t" /* r5 = b3 b2 b1 b0 */ \ 316 "movq %%mm4, %%mm0 \n\t" /* r0 = a3 a2 a1 a0 */ \ 317 "movq %%mm6, "J(7)"\n\t" \ 318 "punpcklwd %%mm5, %%mm0 \n\t" /* r0 = b1 a1 b0 a0 */ \ 319 "movq %%mm1, "J(6)"\n\t" \ 320 "punpckhwd %%mm5, %%mm4 \n\t" /* r4 = b3 a3 b2 a2 */ \ 321 "movq %%mm2, %%mm5 \n\t" /* r5 = c3 c2 c1 c0 */ \ 322 "punpcklwd %%mm3, %%mm2 \n\t" /* r2 = d1 c1 d0 c0 */ \ 323 "movq %%mm0, %%mm1 \n\t" /* r1 = b1 a1 b0 a0 */ \ 324 "punpckldq %%mm2, %%mm0 \n\t" /* r0 = d0 c0 b0 a0 = R0 */ \ 325 "punpckhdq %%mm2, %%mm1 \n\t" /* r1 = d1 c1 b1 a1 = R1 */ \ 326 "movq %%mm4, %%mm2 \n\t" /* r2 = b3 a3 b2 a2 */ \ 327 "movq %%mm0, "I(0)"\n\t" \ 328 "punpckhwd %%mm3, %%mm5 \n\t" /* r5 = d3 c3 d2 c2 */ \ 329 "movq %%mm1, "I(1)"\n\t" \ 330 "punpckhdq %%mm5, %%mm4 \n\t" /* r4 = d3 c3 b3 a3 = R3 */ \ 331 "punpckldq %%mm5, %%mm2 \n\t" /* r2 = d2 c2 b2 a2 = R2 */ \ 332 "movq %%mm4, "I(3)"\n\t" \ 333 "movq %%mm2, "I(2)"\n\t" 334 335void ff_vp3_idct_mmx(int16_t *output_data) 336{ 337 /* eax = quantized input 338 * ebx = dequantizer matrix 339 * ecx = IDCT constants 340 * M(I) = ecx + MaskOffset(0) + I * 8 341 * C(I) = ecx + CosineOffset(32) + (I-1) * 8 342 * edx = output 343 * r0..r7 = mm0..mm7 344 */ 345 346#define C(x) AV_STRINGIFY(16*(x-1))"(%1)" 347#define OC_8 "%2" 348 349 /* at this point, function has completed dequantization + dezigzag + 350 * partial transposition; now do the idct itself */ 351#define I(x) AV_STRINGIFY(16* x )"(%0)" 352#define J(x) AV_STRINGIFY(16*(x-4) + 8)"(%0)" 353 354 __asm__ volatile ( 355 RowIDCT() 356 Transpose() 357 358#undef I 359#undef J 360#define I(x) AV_STRINGIFY(16* x + 64)"(%0)" 361#define J(x) AV_STRINGIFY(16*(x-4) + 72)"(%0)" 362 363 RowIDCT() 364 Transpose() 365 366#undef I 367#undef J 368#define I(x) AV_STRINGIFY(16*x)"(%0)" 369#define J(x) AV_STRINGIFY(16*x)"(%0)" 370 371 ColumnIDCT() 372 373#undef I 374#undef J 375#define I(x) AV_STRINGIFY(16*x + 8)"(%0)" 376#define J(x) AV_STRINGIFY(16*x + 8)"(%0)" 377 378 ColumnIDCT() 379 :: "r"(output_data), "r"(ff_vp3_idct_data), "m"(ff_pw_8) 380 ); 381#undef I 382#undef J 383 384} 385 386void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block) 387{ 388 ff_vp3_idct_mmx(block); 389 put_signed_pixels_clamped_mmx(block, dest, line_size); 390} 391 392void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block) 393{ 394 ff_vp3_idct_mmx(block); 395 add_pixels_clamped_mmx(block, dest, line_size); 396} 397