/*
 * Copyright (C) 2004 the ffmpeg project
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * MMX-optimized functions cribbed from the original VP3 source code.
 */

#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "dsputil_mmx.h"
#include "vp3dsp_mmx.h"

/* IDCT cosine constants table, defined in the VP3 decoder proper. */
extern const uint16_t ff_vp3_idct_data[];

/* Branchless VP3 deblocking filter across one 8-pixel edge.
 * Computes d = (p0 - 3*(p1 - p2) - p3 + 4) >> 3 in bytes using pavgb
 * averaging tricks, clamps it through the bounding table approximation
 * in flim, then adjusts p1/p2 toward each other by the clamped delta.
 *
 * this is off by one or two for some cases when filter_limit is greater than 63
 * in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1
 * out: p1 in mm4, p2 in mm3 */
#define VP3_LOOP_FILTER(flim) \
    "movq %%mm6, %%mm7 \n\t" \
    "pand "MANGLE(ff_pb_7 )", %%mm6 \n\t" /* p0&7 */ \
    "psrlw $3, %%mm7 \n\t" \
    "pand "MANGLE(ff_pb_1F)", %%mm7 \n\t" /* p0>>3 */ \
    "movq %%mm2, %%mm3 \n\t" /* mm3 = p2 */ \
    "pxor %%mm4, %%mm2 \n\t" \
    "pand "MANGLE(ff_pb_1 )", %%mm2 \n\t" /* (p2^p1)&1 */ \
    "movq %%mm2, %%mm5 \n\t" \
    "paddb %%mm2, %%mm2 \n\t" \
    "paddb %%mm5, %%mm2 \n\t" /* 3*(p2^p1)&1 */ \
    "paddb %%mm6, %%mm2 \n\t" /* extra bits lost in shifts */ \
    "pcmpeqb %%mm0, %%mm0 \n\t" \
    "pxor %%mm0, %%mm1 \n\t" /* 255 - p3 */ \
    "pavgb %%mm2, %%mm1 \n\t" /* (256 - p3 + extrabits) >> 1 */ \
    "pxor %%mm4, %%mm0 \n\t" /* 255 - p1 */ \
    "pavgb %%mm3, %%mm0 \n\t" /* (256 + p2-p1) >> 1 */ \
    "paddb "MANGLE(ff_pb_3 )", %%mm1 \n\t" \
    "pavgb %%mm0, %%mm1 \n\t" /* 128+2+(  p2-p1  - p3) >> 2 */ \
    "pavgb %%mm0, %%mm1 \n\t" /* 128+1+(3*(p2-p1) - p3) >> 3 */ \
    "paddusb %%mm1, %%mm7 \n\t" /* d+128+1 */ \
    "movq "MANGLE(ff_pb_81)", %%mm6 \n\t" \
    "psubusb %%mm7, %%mm6 \n\t" /* mm6 = saturated  -d  (magnitude of negative delta) */ \
    "psubusb "MANGLE(ff_pb_81)", %%mm7 \n\t" /* mm7 = saturated  +d  (magnitude of positive delta) */ \
\
    /* Approximate the bounding-values lookup: min(2*min(d,flim)-min(d,flim), flim)
     * folds d into the "roof" shape of the VP3 bounding function. */ \
    "movq "#flim", %%mm5 \n\t" \
    "pminub %%mm5, %%mm6 \n\t" \
    "pminub %%mm5, %%mm7 \n\t" \
    "movq %%mm6, %%mm0 \n\t" \
    "movq %%mm7, %%mm1 \n\t" \
    "paddb %%mm6, %%mm6 \n\t" \
    "paddb %%mm7, %%mm7 \n\t" \
    "pminub %%mm5, %%mm6 \n\t" \
    "pminub %%mm5, %%mm7 \n\t" \
    "psubb %%mm0, %%mm6 \n\t" \
    "psubb %%mm1, %%mm7 \n\t" \
    /* apply signed delta with unsigned saturating ops: p1 += d, p2 -= d */ \
    "paddusb %%mm7, %%mm4 \n\t" \
    "psubusb %%mm6, %%mm4 \n\t" \
    "psubusb %%mm7, %%mm3 \n\t" \
    "paddusb %%mm6, %%mm3 \n\t"

/* Scatter the four 16-bit lanes of MMX register mm to the four byte
 * addresses dst0..dst3, each offset by -1 (the filtered pixel pair
 * straddles the edge).  Uses the GPR tied to asm operand %0 as scratch. */
#define STORE_4_WORDS(dst0, dst1, dst2, dst3, mm) \
    "movd "#mm", %0 \n\t" \
    "movw %w0, -1"#dst0" \n\t" \
    "psrlq $32, "#mm" \n\t" \
    "shr $16, %0 \n\t" \
    "movw %w0, -1"#dst1" \n\t" \
    "movd "#mm", %0 \n\t" \
    "movw %w0, -1"#dst2" \n\t" \
    "shr $16, %0 \n\t" \
    "movw %w0, -1"#dst3" \n\t"

/**
 * Filter a horizontal edge: the four rows src-2*stride .. src+1*stride
 * are loaded directly (8 pixels wide), filtered, and the two middle
 * rows written back in place.
 */
void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values)
{
    __asm__ volatile(
        "movq %0, %%mm6 \n\t"  /* p0 = row above-above the edge */
        "movq %1, %%mm4 \n\t"  /* p1 = row above the edge */
        "movq %2, %%mm2 \n\t"  /* p2 = row below the edge */
        "movq %3, %%mm1 \n\t"  /* p3 = row below-below the edge */

        VP3_LOOP_FILTER(%4)

        "movq %%mm4, %1 \n\t"  /* write back filtered p1 */
        "movq %%mm3, %2 \n\t"  /* write back filtered p2 */

        : "+m" (*(uint64_t*)(src - 2*stride)),
          "+m" (*(uint64_t*)(src - 1*stride)),
          "+m" (*(uint64_t*)(src + 0*stride)),
          "+m" (*(uint64_t*)(src + 1*stride))
        /* bounding_values+129 holds the precomputed filter-limit octet
         * used by the pminub clamp — see the VP3 decoder's table setup */
        : "m"(*(uint64_t*)(bounding_values+129))
    );
}

/**
 * Filter a vertical edge: 8 rows of 4 pixels around the column edge are
 * transposed into registers, run through the same filter core, and the
 * two middle columns are scattered back with STORE_4_WORDS.
 */
void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values)
{
    x86_reg tmp;  /* GPR scratch for STORE_4_WORDS */

    __asm__ volatile(
        "movd -2(%1), %%mm6 \n\t"      /* rows 0..3, 4 bytes left of the edge */
        "movd -2(%1,%3), %%mm0 \n\t"
        "movd -2(%1,%3,2), %%mm1 \n\t"
        "movd -2(%1,%4), %%mm4 \n\t"

        /* rows 4..7 are loaded inside the transpose from the %2 base */
        TRANSPOSE8x4(%%mm6, %%mm0, %%mm1, %%mm4, -2(%2), -2(%2,%3), -2(%2,%3,2), -2(%2,%4), %%mm2)
        VP3_LOOP_FILTER(%5)
        /* re-interleave filtered p1/p2 into per-row word pairs */
        SBUTTERFLY(%%mm4, %%mm3, %%mm5, bw, q)

        STORE_4_WORDS((%1), (%1,%3), (%1,%3,2), (%1,%4), %%mm4)
        STORE_4_WORDS((%2), (%2,%3), (%2,%3,2), (%2,%4), %%mm5)

        : "=&r"(tmp)
        : "r"(src), "r"(src+4*stride), "r"((x86_reg)stride), "r"((x86_reg)3*stride),
          "m"(*(uint64_t*)(bounding_values+129))
        : "memory"
    );
}

/* from original comments: The Macro does IDct on 4 1-D Dcts
 * Shared head of the 1-D IDCT butterfly: on exit C. is parked at I(1),
 * D. at I(2), and mm0..mm7 hold the intermediate terms the Row/Column
 * tails expect.  pmulhw of the 16-bit cosine constants yields c*x - x,
 * hence the paddw fix-ups that restore c*x. */
#define BeginIDCT() \
    "movq "I(3)", %%mm2 \n\t" \
    "movq "C(3)", %%mm6 \n\t" \
    "movq %%mm2, %%mm4 \n\t" \
    "movq "J(5)", %%mm7 \n\t" \
    "pmulhw %%mm6, %%mm4 \n\t" /* r4 = c3*i3 - i3 */ \
    "movq "C(5)", %%mm1 \n\t" \
    "pmulhw %%mm7, %%mm6 \n\t" /* r6 = c3*i5 - i5 */ \
    "movq %%mm1, %%mm5 \n\t" \
    "pmulhw %%mm2, %%mm1 \n\t" /* r1 = c5*i3 - i3 */ \
    "movq "I(1)", %%mm3 \n\t" \
    "pmulhw %%mm7, %%mm5 \n\t" /* r5 = c5*i5 - i5 */ \
    "movq "C(1)", %%mm0 \n\t" \
    "paddw %%mm2, %%mm4 \n\t" /* r4 = c3*i3 */ \
    "paddw %%mm7, %%mm6 \n\t" /* r6 = c3*i5 */ \
    "paddw %%mm1, %%mm2 \n\t" /* r2 = c5*i3 */ \
    "movq "J(7)", %%mm1 \n\t" \
    "paddw %%mm5, %%mm7 \n\t" /* r7 = c5*i5 */ \
    "movq %%mm0, %%mm5 \n\t" /* r5 = c1 */ \
    "pmulhw %%mm3, %%mm0 \n\t" /* r0 = c1*i1 - i1 */ \
    "paddsw %%mm7, %%mm4 \n\t" /* r4 = C = c3*i3 + c5*i5 */ \
    "pmulhw %%mm1, %%mm5 \n\t" /* r5 = c1*i7 - i7 */ \
    "movq "C(7)", %%mm7 \n\t" \
    "psubsw %%mm2, %%mm6 \n\t" /* r6 = D = c3*i5 - c5*i3 */ \
    "paddw %%mm3, %%mm0 \n\t" /* r0 = c1*i1 */ \
    "pmulhw %%mm7, %%mm3 \n\t" /* r3 = c7*i1 */ \
    "movq "I(2)", %%mm2 \n\t" \
    "pmulhw %%mm1, %%mm7 \n\t" /* r7 = c7*i7 */ \
    "paddw %%mm1, %%mm5 \n\t" /* r5 = c1*i7 */ \
    "movq %%mm2, %%mm1 \n\t" /* r1 = i2 */ \
    "pmulhw "C(2)", %%mm2 \n\t" /* r2 = c2*i2 - i2 */ \
    "psubsw %%mm5, %%mm3 \n\t" /* r3 = B = c7*i1 - c1*i7 */ \
    "movq "J(6)", %%mm5 \n\t" \
    "paddsw %%mm7, %%mm0 \n\t" /* r0 = A = c1*i1 + c7*i7 */ \
    "movq %%mm5, %%mm7 \n\t" /* r7 = i6 */ \
    "psubsw %%mm4, %%mm0 \n\t" /* r0 = A - C */ \
    "pmulhw "C(2)", %%mm5 \n\t" /* r5 = c2*i6 - i6 */ \
    "paddw %%mm1, %%mm2 \n\t" /* r2 = c2*i2 */ \
    "pmulhw "C(6)", %%mm1 \n\t" /* r1 = c6*i2 */ \
    "paddsw %%mm4, %%mm4 \n\t" /* r4 = C + C */ \
    "paddsw %%mm0, %%mm4 \n\t" /* r4 = C. = A + C */ \
    "psubsw %%mm6, %%mm3 \n\t" /* r3 = B - D */ \
    "paddw %%mm7, %%mm5 \n\t" /* r5 = c2*i6 */ \
    "paddsw %%mm6, %%mm6 \n\t" /* r6 = D + D */ \
    "pmulhw "C(6)", %%mm7 \n\t" /* r7 = c6*i6 */ \
    "paddsw %%mm3, %%mm6 \n\t" /* r6 = D. = B + D */ \
    "movq %%mm4, "I(1)"\n\t" /* save C. at I(1) */ \
    "psubsw %%mm5, %%mm1 \n\t" /* r1 = H = c6*i2 - c2*i6 */ \
    "movq "C(4)", %%mm4 \n\t" \
    "movq %%mm3, %%mm5 \n\t" /* r5 = B - D */ \
    "pmulhw %%mm4, %%mm3 \n\t" /* r3 = (c4 - 1) * (B - D) */ \
    "paddsw %%mm2, %%mm7 \n\t" /* r7 = G = c2*i2 + c6*i6 */ \
    "movq %%mm6, "I(2)"\n\t" /* save D. at I(2) */ \
    "movq %%mm0, %%mm2 \n\t" /* r2 = A - C */ \
    "movq "I(0)", %%mm6 \n\t" \
    "pmulhw %%mm4, %%mm0 \n\t" /* r0 = (c4 - 1) * (A - C) */ \
    "paddw %%mm3, %%mm5 \n\t" /* r5 = B. = c4 * (B - D) */ \
    "movq "J(4)", %%mm3 \n\t" \
    "psubsw %%mm1, %%mm5 \n\t" /* r5 = B.. = B. - H */ \
    "paddw %%mm0, %%mm2 \n\t" /* r0 = A. = c4 * (A - C) */ \
    "psubsw %%mm3, %%mm6 \n\t" /* r6 = i0 - i4 */ \
    "movq %%mm6, %%mm0 \n\t" \
    "pmulhw %%mm4, %%mm6 \n\t" /* r6 = (c4 - 1) * (i0 - i4) */ \
    "paddsw %%mm3, %%mm3 \n\t" /* r3 = i4 + i4 */ \
    "paddsw %%mm1, %%mm1 \n\t" /* r1 = H + H */ \
    "paddsw %%mm0, %%mm3 \n\t" /* r3 = i0 + i4 */ \
    "paddsw %%mm5, %%mm1 \n\t" /* r1 = H. = B + H */ \
    "pmulhw %%mm3, %%mm4 \n\t" /* r4 = (c4 - 1) * (i0 + i4) */ \
    "paddsw %%mm0, %%mm6 \n\t" /* r6 = F = c4 * (i0 - i4) */ \
    "psubsw %%mm2, %%mm6 \n\t" /* r6 = F. = F - A. */ \
    "paddsw %%mm2, %%mm2 \n\t" /* r2 = A. + A. */ \
    "movq "I(1)", %%mm0 \n\t" /* r0 = C. */ \
    "paddsw %%mm6, %%mm2 \n\t" /* r2 = A.. = F + A. */ \
    "paddw %%mm3, %%mm4 \n\t" /* r4 = E = c4 * (i0 + i4) */ \
    "psubsw %%mm1, %%mm2 \n\t" /* r2 = R2 = A.. - H. */

/* RowIDCT gets ready to transpose: finishes the butterfly without the
 * final >>4 normalization (that happens in the column pass). */
#define RowIDCT() \
    BeginIDCT() \
    "movq "I(2)", %%mm3 \n\t" /* r3 = D. */ \
    "psubsw %%mm7, %%mm4 \n\t" /* r4 = E. = E - G */ \
    "paddsw %%mm1, %%mm1 \n\t" /* r1 = H. + H. */ \
    "paddsw %%mm7, %%mm7 \n\t" /* r7 = G + G */ \
    "paddsw %%mm2, %%mm1 \n\t" /* r1 = R1 = A.. + H. */ \
    "paddsw %%mm4, %%mm7 \n\t" /* r7 = G. = E + G */ \
    "psubsw %%mm3, %%mm4 \n\t" /* r4 = R4 = E. - D. */ \
    "paddsw %%mm3, %%mm3 \n\t" \
    "psubsw %%mm5, %%mm6 \n\t" /* r6 = R6 = F. - B.. */ \
    "paddsw %%mm5, %%mm5 \n\t" \
    "paddsw %%mm4, %%mm3 \n\t" /* r3 = R3 = E. + D. */ \
    "paddsw %%mm6, %%mm5 \n\t" /* r5 = R5 = F. + B.. */ \
    "psubsw %%mm0, %%mm7 \n\t" /* r7 = R7 = G. - C. */ \
    "paddsw %%mm0, %%mm0 \n\t" \
    "movq %%mm1, "I(1)"\n\t" /* save R1 */ \
    "paddsw %%mm7, %%mm0 \n\t" /* r0 = R0 = G. + C. */

/* Column IDCT normalizes ((x + 8) >> 4) and stores final results. */
#define ColumnIDCT() \
    BeginIDCT() \
    "paddsw "OC_8", %%mm2 \n\t" /* adjust R2 (and R1) for shift */ \
    "paddsw %%mm1, %%mm1 \n\t" /* r1 = H. + H. */ \
    "paddsw %%mm2, %%mm1 \n\t" /* r1 = R1 = A.. + H. */ \
    "psraw $4, %%mm2 \n\t" /* r2 = NR2 */ \
    "psubsw %%mm7, %%mm4 \n\t" /* r4 = E. = E - G */ \
    "psraw $4, %%mm1 \n\t" /* r1 = NR1 */ \
    "movq "I(2)", %%mm3 \n\t" /* r3 = D. */ \
    "paddsw %%mm7, %%mm7 \n\t" /* r7 = G + G */ \
    "movq %%mm2, "I(2)"\n\t" /* store NR2 at I2 */ \
    "paddsw %%mm4, %%mm7 \n\t" /* r7 = G. = E + G */ \
    "movq %%mm1, "I(1)"\n\t" /* store NR1 at I1 */ \
    "psubsw %%mm3, %%mm4 \n\t" /* r4 = R4 = E. - D. */ \
    "paddsw "OC_8", %%mm4 \n\t" /* adjust R4 (and R3) for shift */ \
    "paddsw %%mm3, %%mm3 \n\t" /* r3 = D. + D. */ \
    "paddsw %%mm4, %%mm3 \n\t" /* r3 = R3 = E. + D. */ \
    "psraw $4, %%mm4 \n\t" /* r4 = NR4 */ \
    "psubsw %%mm5, %%mm6 \n\t" /* r6 = R6 = F. - B.. */ \
    "psraw $4, %%mm3 \n\t" /* r3 = NR3 */ \
    "paddsw "OC_8", %%mm6 \n\t" /* adjust R6 (and R5) for shift */ \
    "paddsw %%mm5, %%mm5 \n\t" /* r5 = B.. + B.. */ \
    "paddsw %%mm6, %%mm5 \n\t" /* r5 = R5 = F. + B.. */ \
    "psraw $4, %%mm6 \n\t" /* r6 = NR6 */ \
    "movq %%mm4, "J(4)"\n\t" /* store NR4 at J4 */ \
    "psraw $4, %%mm5 \n\t" /* r5 = NR5 */ \
    "movq %%mm3, "I(3)"\n\t" /* store NR3 at I3 */ \
    "psubsw %%mm0, %%mm7 \n\t" /* r7 = R7 = G. - C. */ \
    "paddsw "OC_8", %%mm7 \n\t" /* adjust R7 (and R0) for shift */ \
    "paddsw %%mm0, %%mm0 \n\t" /* r0 = C. + C. */ \
    "paddsw %%mm7, %%mm0 \n\t" /* r0 = R0 = G. + C. */ \
    "psraw $4, %%mm7 \n\t" /* r7 = NR7 */ \
    "movq %%mm6, "J(6)"\n\t" /* store NR6 at J6 */ \
    "psraw $4, %%mm0 \n\t" /* r0 = NR0 */ \
    "movq %%mm5, "J(5)"\n\t" /* store NR5 at J5 */ \
    "movq %%mm7, "J(7)"\n\t" /* store NR7 at J7 */ \
    "movq %%mm0, "I(0)"\n\t" /* store NR0 at I0 */

/* Following macro does two 4x4 transposes in place.

   At entry (we assume):

     r0 = a3 a2 a1 a0
     I(1) = b3 b2 b1 b0
     r2 = c3 c2 c1 c0
     r3 = d3 d2 d1 d0

     r4 = e3 e2 e1 e0
     r5 = f3 f2 f1 f0
     r6 = g3 g2 g1 g0
     r7 = h3 h2 h1 h0

   At exit, we have:

     I(0) = d0 c0 b0 a0
     I(1) = d1 c1 b1 a1
     I(2) = d2 c2 b2 a2
     I(3) = d3 c3 b3 a3

     J(4) = h0 g0 f0 e0
     J(5) = h1 g1 f1 e1
     J(6) = h2 g2 f2 e2
     J(7) = h3 g3 f3 e3

   I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
   J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.

   Since r1 is free at entry, we calculate the Js first. */
#define Transpose() \
    "movq %%mm4, %%mm1 \n\t" /* r1 = e3 e2 e1 e0 */ \
    "punpcklwd %%mm5, %%mm4 \n\t" /* r4 = f1 e1 f0 e0 */ \
    "movq %%mm0, "I(0)"\n\t" /* save a3 a2 a1 a0 */ \
    "punpckhwd %%mm5, %%mm1 \n\t" /* r1 = f3 e3 f2 e2 */ \
    "movq %%mm6, %%mm0 \n\t" /* r0 = g3 g2 g1 g0 */ \
    "punpcklwd %%mm7, %%mm6 \n\t" /* r6 = h1 g1 h0 g0 */ \
    "movq %%mm4, %%mm5 \n\t" /* r5 = f1 e1 f0 e0 */ \
    "punpckldq %%mm6, %%mm4 \n\t" /* r4 = h0 g0 f0 e0 = R4 */ \
    "punpckhdq %%mm6, %%mm5 \n\t" /* r5 = h1 g1 f1 e1 = R5 */ \
    "movq %%mm1, %%mm6 \n\t" /* r6 = f3 e3 f2 e2 */ \
    "movq %%mm4, "J(4)"\n\t" \
    "punpckhwd %%mm7, %%mm0 \n\t" /* r0 = h3 g3 h2 g2 */ \
    "movq %%mm5, "J(5)"\n\t" \
    "punpckhdq %%mm0, %%mm6 \n\t" /* r6 = h3 g3 f3 e3 = R7 */ \
    "movq "I(0)", %%mm4 \n\t" /* r4 = a3 a2 a1 a0 */ \
    "punpckldq %%mm0, %%mm1 \n\t" /* r1 = h2 g2 f2 e2 = R6 */ \
    "movq "I(1)", %%mm5 \n\t" /* r5 = b3 b2 b1 b0 */ \
    "movq %%mm4, %%mm0 \n\t" /* r0 = a3 a2 a1 a0 */ \
    "movq %%mm6, "J(7)"\n\t" \
    "punpcklwd %%mm5, %%mm0 \n\t" /* r0 = b1 a1 b0 a0 */ \
    "movq %%mm1, "J(6)"\n\t" \
    "punpckhwd %%mm5, %%mm4 \n\t" /* r4 = b3 a3 b2 a2 */ \
    "movq %%mm2, %%mm5 \n\t" /* r5 = c3 c2 c1 c0 */ \
    "punpcklwd %%mm3, %%mm2 \n\t" /* r2 = d1 c1 d0 c0 */ \
    "movq %%mm0, %%mm1 \n\t" /* r1 = b1 a1 b0 a0 */ \
    "punpckldq %%mm2, %%mm0 \n\t" /* r0 = d0 c0 b0 a0 = R0 */ \
    "punpckhdq %%mm2, %%mm1 \n\t" /* r1 = d1 c1 b1 a1 = R1 */ \
    "movq %%mm4, %%mm2 \n\t" /* r2 = b3 a3 b2 a2 */ \
    "movq %%mm0, "I(0)"\n\t" \
    "punpckhwd %%mm3, %%mm5 \n\t" /* r5 = d3 c3 d2 c2 */ \
    "movq %%mm1, "I(1)"\n\t" \
    "punpckhdq %%mm5, %%mm4 \n\t" /* r4 = d3 c3 b3 a3 = R3 */ \
    "punpckldq %%mm5, %%mm2 \n\t" /* r2 = d2 c2 b2 a2 = R2 */ \
    "movq %%mm4, "I(3)"\n\t" \
    "movq %%mm2, "I(2)"\n\t"

/**
 * Full 8x8 inverse DCT on output_data, in place.  Two row passes (each
 * handling an 8x4 half, followed by an in-register transpose) and two
 * column passes over the left and right 8-sample columns.
 */
void ff_vp3_idct_mmx(int16_t *output_data)
{
    /* eax = quantized input
     * ebx = dequantizer matrix
     * ecx = IDCT constants
     *  M(I) = ecx + MaskOffset(0) + I * 8
     *  C(I) = ecx + CosineOffset(32) + (I-1) * 8
     * edx = output
     * r0..r7 = mm0..mm7
     */

#define C(x) AV_STRINGIFY(16*(x-1))"(%1)"
#define OC_8 "%2"

    /* at this point, function has completed dequantization + dezigzag +
     * partial transposition; now do the idct itself */
#define I(x) AV_STRINGIFY(16* x )"(%0)"
#define J(x) AV_STRINGIFY(16*(x-4) + 8)"(%0)"

    __asm__ volatile (
        RowIDCT()
        Transpose()

#undef I
#undef J
#define I(x) AV_STRINGIFY(16* x  + 64)"(%0)"
#define J(x) AV_STRINGIFY(16*(x-4) + 72)"(%0)"

        RowIDCT()
        Transpose()

#undef I
#undef J
#define I(x) AV_STRINGIFY(16*x)"(%0)"
#define J(x) AV_STRINGIFY(16*x)"(%0)"

        ColumnIDCT()

#undef I
#undef J
#define I(x) AV_STRINGIFY(16*x + 8)"(%0)"
#define J(x) AV_STRINGIFY(16*x + 8)"(%0)"

        ColumnIDCT()
        :: "r"(output_data), "r"(ff_vp3_idct_data), "m"(ff_pw_8)
    );
#undef I
#undef J

}

/** IDCT the block, then clamp the signed result to [0,255] into dest. */
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_vp3_idct_mmx(block);
    put_signed_pixels_clamped_mmx(block, dest, line_size);
}

/** IDCT the block, then add it to dest with unsigned clamping. */
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_vp3_idct_mmx(block);
    add_pixels_clamped_mmx(block, dest, line_size);
}

/**
 * DC-only IDCT + add: scale the DC coefficient exactly as the full
 * IDCT would (two multiplies by 46341 ~= C4 * 2^16 = sqrt(2)/2 in
 * Q16, with the final (x + 8*2^16) >> 20 rounding) and add the single
 * value to all 64 pixels of the destination block, saturating.
 */
void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int linesize, const DCTELEM *block)
{
    int dc = block[0];
    dc = (46341*dc)>>16;
    dc = (46341*dc + (8<<16))>>20;

    __asm__ volatile(
        "movd %3, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"  /* broadcast dc to all 4 words */
        "pxor %%mm1, %%mm1 \n\t"
        "psubw %%mm0, %%mm1 \n\t"       /* mm1 = -dc */
        /* mm0 = max(dc,0) replicated as bytes; mm1 = max(-dc,0):
         * paddusb mm0 then psubusb mm1 adds the signed dc with
         * unsigned saturation in both directions */
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"

/* Add dc to four 8-byte rows at %0, %0+linesize, %0+2*linesize, %0+3*linesize. */
#define DC_ADD \
        "movq (%0), %%mm2 \n\t" \
        "movq (%0,%1), %%mm3 \n\t" \
        "paddusb %%mm0, %%mm2 \n\t" \
        "movq (%0,%1,2), %%mm4 \n\t" \
        "paddusb %%mm0, %%mm3 \n\t" \
        "movq (%0,%2), %%mm5 \n\t" \
        "paddusb %%mm0, %%mm4 \n\t" \
        "paddusb %%mm0, %%mm5 \n\t" \
        "psubusb %%mm1, %%mm2 \n\t" \
        "psubusb %%mm1, %%mm3 \n\t" \
        "movq %%mm2, (%0) \n\t" \
        "psubusb %%mm1, %%mm4 \n\t" \
        "movq %%mm3, (%0,%1) \n\t" \
        "psubusb %%mm1, %%mm5 \n\t" \
        "movq %%mm4, (%0,%1,2) \n\t" \
        "movq %%mm5, (%0,%2) \n\t"

        DC_ADD
        "lea (%0,%1,4), %0 \n\t"  /* advance to the lower 4 rows */
        DC_ADD

        : "+r"(dest)
        : "r"((x86_reg)linesize), "r"((x86_reg)3*linesize), "r"(dc)
    );
}