1;****************************************************************************** 2;* MMX/SSE2-optimized functions for the VP3 decoder 3;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org> 4;* 5;* This file is part of FFmpeg. 6;* 7;* FFmpeg is free software; you can redistribute it and/or 8;* modify it under the terms of the GNU Lesser General Public 9;* License as published by the Free Software Foundation; either 10;* version 2.1 of the License, or (at your option) any later version. 11;* 12;* FFmpeg is distributed in the hope that it will be useful, 13;* but WITHOUT ANY WARRANTY; without even the implied warranty of 14;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15;* Lesser General Public License for more details. 16;* 17;* You should have received a copy of the GNU Lesser General Public 18;* License along with FFmpeg; if not, write to the Free Software 19;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20;****************************************************************************** 21 22%include "libavutil/x86/x86util.asm" 23 24; MMX-optimized functions cribbed from the original VP3 source code. 
SECTION_RODATA

; 16-bit IDCT cosine constants (scaled by 2^16), one 8-word row per C(1)..C(7)
vp3_idct_data: times 8 dw 64277
               times 8 dw 60547
               times 8 dw 54491
               times 8 dw 46341
               times 8 dw 36410
               times 8 dw 25080
               times 8 dw 12785

pb_7:  times 8 db 0x07
pb_1F: times 8 db 0x1f
pb_81: times 8 db 0x81

cextern pb_1
cextern pb_3
cextern pb_80

cextern pw_8

SECTION .text

; VP3 deblocking filter for one 8-pixel edge.
; this is off by one or two for some cases when filter_limit is greater than 63
; in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1
;     r2 = bounding-values array; the filter limit is read from [r2+516]
;     (NOTE(review): offset assumed to match the caller's table layout — confirm)
; out: p1 in mm4, p2 in mm3
%macro VP3_LOOP_FILTER 0
    movq          m7, m6
    pand          m6, [pb_7]    ; p0&7
    psrlw         m7, 3
    pand          m7, [pb_1F]   ; p0>>3
    movq          m3, m2        ; p2
    pxor          m2, m4
    pand          m2, [pb_1]    ; (p2^p1)&1
    movq          m5, m2
    paddb         m2, m2
    paddb         m2, m5        ; 3*(p2^p1)&1
    paddb         m2, m6        ; extra bits lost in shifts
    pcmpeqb       m0, m0
    pxor          m1, m0        ; 255 - p3
    pavgb         m1, m2        ; (256 - p3 + extrabits) >> 1
    pxor          m0, m4        ; 255 - p1
    pavgb         m0, m3        ; (256 + p2-p1) >> 1
    paddb         m1, [pb_3]
    pavgb         m1, m0        ; 128+2+(   p2-p1  - p3) >> 2
    pavgb         m1, m0        ; 128+1+(3*(p2-p1) - p3) >> 3
    paddusb       m7, m1        ; d+128+1
    movq          m6, [pb_81]
    psubusb       m6, m7        ; m6 = saturated  -d  (biased by 0x81)
    psubusb       m7, [pb_81]   ; m7 = saturated  +d

    ; clamp |d| with the "bounding value" triangle function:
    ; f(d) = clip(d,0,flim)*2 - clip(d,0,flim) folded as min(2*min(d,flim),flim)-min(d,flim)
    movq          m5, [r2+516]  ; flim
    pminub        m6, m5
    pminub        m7, m5
    movq          m0, m6
    movq          m1, m7
    paddb         m6, m6
    paddb         m7, m7
    pminub        m6, m5
    pminub        m7, m5
    psubb         m6, m0
    psubb         m7, m1
    ; apply +d to p1 and -d to p2 (each direction handled with unsigned saturation)
    paddusb       m4, m7
    psubusb       m4, m6
    psubusb       m3, m7
    paddusb       m3, m6
%endmacro

; Scatter the four 16-bit words of %1 to byte offset -1 of four consecutive
; rows (r0, r0+r1, r0+2*r1, r0+r3), i.e. the two pixels straddling a vertical
; edge.  Clobbers r2 and shifts %1 down by 32 bits.
%macro STORE_4_WORDS 1
    movd         r2d, %1
    mov  [r0     -1], r2w
    psrlq         %1, 32
    shr           r2, 16
    mov  [r0+r1  -1], r2w
    movd         r2d, %1
    mov  [r0+r1*2-1], r2w
    shr           r2, 16
    mov  [r0+r3  -1], r2w
%endmacro

INIT_MMX mmxext
; vp3_v_loop_filter(uint8_t *src, stride, int *bounding_values)
; Filters the horizontal edge between rows src-stride and src:
; loads p0..p3 from src-2*stride .. src+stride, writes back the two
; middle rows.  r0 = src, r1 = stride, r2 = bounding_values.
cglobal vp3_v_loop_filter, 3, 4
%if ARCH_X86_64
    movsxd        r1, r1d
%endif
    mov           r3, r1
    neg           r1            ; r1 = -stride, r3 = +stride
    movq          m6, [r0+r1*2] ; p0 = src - 2*stride
    movq          m4, [r0+r1  ] ; p1 = src -   stride
    movq          m2, [r0     ] ; p2 = src
    movq          m1, [r0+r3  ] ; p3 = src +   stride

    VP3_LOOP_FILTER

    movq     [r0+r1], m4        ; store filtered p1
    movq     [r0   ], m3        ; store filtered p2
    RET

; vp3_h_loop_filter(uint8_t *src, stride, int *bounding_values)
; Same filter applied across a vertical edge: gathers the 2 columns on each
; side of src-2 from 8 rows, transposes 8x4 -> 4x8, filters, transposes back
; and stores the two middle columns with STORE_4_WORDS.
cglobal vp3_h_loop_filter, 3, 4
%if ARCH_X86_64
    movsxd        r1, r1d
%endif
    lea           r3, [r1*3]

    movd          m6, [r0     -2]
    movd          m4, [r0+r1  -2]
    movd          m2, [r0+r1*2-2]
    movd          m1, [r0+r3  -2]
    lea           r0, [r0+r1*4 ]
    punpcklbw     m6, [r0     -2]
    punpcklbw     m4, [r0+r1  -2]
    punpcklbw     m2, [r0+r1*2-2]
    punpcklbw     m1, [r0+r3  -2]
    sub           r0, r3
    sub           r0, r1        ; rewind r0 to src (start of the 8 rows)

    TRANSPOSE4x4B 6, 4, 2, 1, 0
    VP3_LOOP_FILTER
    SBUTTERFLY bw, 4, 3, 5      ; re-interleave filtered p1/p2 columns

    STORE_4_WORDS m4            ; rows 0-3
    lea           r0, [r0+r1*4 ]
    STORE_4_WORDS m3            ; rows 4-7
    RET

; from original comments: The Macro does IDct on 4 1-D Dcts
; (common first stage shared by RowIDCT and ColumnIDCT; operates on the
;  memory operands I(x)/J(x) and constants C(x) defined by the caller)
%macro BeginIDCT 0
    movq          m2, I(3)
    movq          m6, C(3)
    movq          m4, m2
    movq          m7, J(5)
    pmulhw        m4, m6        ; r4 = c3*i3 - i3
    movq          m1, C(5)
    pmulhw        m6, m7        ; r6 = c3*i5 - i5
    movq          m5, m1
    pmulhw        m1, m2        ; r1 = c5*i3 - i3
    movq          m3, I(1)
    pmulhw        m5, m7        ; r5 = c5*i5 - i5
    movq          m0, C(1)
    paddw         m4, m2        ; r4 = c3*i3
    paddw         m6, m7        ; r6 = c3*i5
    paddw         m2, m1        ; r2 = c5*i3
    movq          m1, J(7)
    paddw         m7, m5        ; r7 = c5*i5
    movq          m5, m0        ; r5 = c1
    pmulhw        m0, m3        ; r0 = c1*i1 - i1
    paddsw        m4, m7        ; r4 = C = c3*i3 + c5*i5
    pmulhw        m5, m1        ; r5 = c1*i7 - i7
    movq          m7, C(7)
    psubsw        m6, m2        ; r6 = D = c3*i5 - c5*i3
    paddw         m0, m3        ; r0 = c1*i1
    pmulhw        m3, m7        ; r3 = c7*i1
    movq          m2, I(2)
    pmulhw        m7, m1        ; r7 = c7*i7
    paddw         m5, m1        ; r5 = c1*i7
    movq          m1, m2        ; r1 = i2
    pmulhw        m2, C(2)      ; r2 = c2*i2 - i2
    psubsw        m3, m5        ; r3 = B = c7*i1 - c1*i7
    movq          m5, J(6)
    paddsw        m0, m7        ; r0 = A = c1*i1 + c7*i7
    movq          m7, m5        ; r7 = i6
    psubsw        m0, m4        ; r0 = A - C
    pmulhw        m5, C(2)      ; r5 = c2*i6 - i6
    paddw         m2, m1        ; r2 = c2*i2
    pmulhw        m1, C(6)      ; r1 = c6*i2
    paddsw        m4, m4        ; r4 = C + C
    paddsw        m4, m0        ; r4 = C. = A + C
    psubsw        m3, m6        ; r3 = B - D
    paddw         m5, m7        ; r5 = c2*i6
    paddsw        m6, m6        ; r6 = D + D
    pmulhw        m7, C(6)      ; r7 = c6*i6
    paddsw        m6, m3        ; r6 = D. = B + D
    movq        I(1), m4        ; save C. at I(1)
    psubsw        m1, m5        ; r1 = H = c6*i2 - c2*i6
    movq          m4, C(4)
    movq          m5, m3        ; r5 = B - D
    pmulhw        m3, m4        ; r3 = (c4 - 1) * (B - D)
    paddsw        m7, m2        ; r7 = G = c2*i2 + c6*i6
    movq        I(2), m6        ; save D. at I(2)
    movq          m2, m0        ; r2 = A - C
    movq          m6, I(0)
    pmulhw        m0, m4        ; r0 = (c4 - 1) * (A - C)
    paddw         m5, m3        ; r5 = B. = c4 * (B - D)
    movq          m3, J(4)
    psubsw        m5, m1        ; r5 = B.. = B. - H
    paddw         m2, m0        ; r0 = A. = c4 * (A - C)
    psubsw        m6, m3        ; r6 = i0 - i4
    movq          m0, m6
    pmulhw        m6, m4        ; r6 = (c4 - 1) * (i0 - i4)
    paddsw        m3, m3        ; r3 = i4 + i4
    paddsw        m1, m1        ; r1 = H + H
    paddsw        m3, m0        ; r3 = i0 + i4
    paddsw        m1, m5        ; r1 = H. = B. + H
    pmulhw        m4, m3        ; r4 = (c4 - 1) * (i0 + i4)
    paddsw        m6, m0        ; r6 = F = c4 * (i0 - i4)
    psubsw        m6, m2        ; r6 = F. = F - A.
    paddsw        m2, m2        ; r2 = A. + A.
    movq          m0, I(1)      ; r0 = C.
    paddsw        m2, m6        ; r2 = A.. = F + A.
    paddw         m4, m3        ; r4 = E = c4 * (i0 + i4)
    psubsw        m2, m1        ; r2 = R2 = A.. - H.
%endmacro

; RowIDCT gets ready to transpose
%macro RowIDCT 0
    BeginIDCT
    movq          m3, I(2)      ; r3 = D.
    psubsw        m4, m7        ; r4 = E. = E - G
    paddsw        m1, m1        ; r1 = H. + H.
    paddsw        m7, m7        ; r7 = G + G
    paddsw        m1, m2        ; r1 = R1 = A.. + H.
    paddsw        m7, m4        ; r7 = G. = E + G
    psubsw        m4, m3        ; r4 = R4 = E. - D.
    paddsw        m3, m3
    psubsw        m6, m5        ; r6 = R6 = F. - B..
    paddsw        m5, m5
    paddsw        m3, m4        ; r3 = R3 = E. + D.
    paddsw        m5, m6        ; r5 = R5 = F. + B..
    psubsw        m7, m0        ; r7 = R7 = G. - C.
    paddsw        m0, m0
    movq        I(1), m1        ; save R1
    paddsw        m0, m7        ; r0 = R0 = G. + C.
%endmacro

; Column IDCT normalizes (rounds by OC_8 and shifts right by 4) and stores
; the final results back to I()/J()
%macro ColumnIDCT 0
    BeginIDCT
    paddsw        m2, OC_8      ; adjust R2 (and R1) for shift
    paddsw        m1, m1        ; r1 = H. + H.
    paddsw        m1, m2        ; r1 = R1 = A.. + H.
    psraw         m2, 4         ; r2 = NR2
    psubsw        m4, m7        ; r4 = E. = E - G
    psraw         m1, 4         ; r1 = NR1
    movq          m3, I(2)      ; r3 = D.
    paddsw        m7, m7        ; r7 = G + G
    movq        I(2), m2        ; store NR2 at I2
    paddsw        m7, m4        ; r7 = G. = E + G
    movq        I(1), m1        ; store NR1 at I1
    psubsw        m4, m3        ; r4 = R4 = E. - D.
    paddsw        m4, OC_8      ; adjust R4 (and R3) for shift
    paddsw        m3, m3        ; r3 = D. + D.
    paddsw        m3, m4        ; r3 = R3 = E. + D.
    psraw         m4, 4         ; r4 = NR4
    psubsw        m6, m5        ; r6 = R6 = F. - B..
    psraw         m3, 4         ; r3 = NR3
    paddsw        m6, OC_8      ; adjust R6 (and R5) for shift
    paddsw        m5, m5        ; r5 = B.. + B..
    paddsw        m5, m6        ; r5 = R5 = F. + B..
    psraw         m6, 4         ; r6 = NR6
    movq        J(4), m4        ; store NR4 at J4
    psraw         m5, 4         ; r5 = NR5
    movq        I(3), m3        ; store NR3 at I3
    psubsw        m7, m0        ; r7 = R7 = G. - C.
    paddsw        m7, OC_8      ; adjust R7 (and R0) for shift
    paddsw        m0, m0        ; r0 = C. + C.
    paddsw        m0, m7        ; r0 = R0 = G. + C.
    psraw         m7, 4         ; r7 = NR7
    movq        J(6), m6        ; store NR6 at J6
    psraw         m0, 4         ; r0 = NR0
    movq        J(5), m5        ; store NR5 at J5
    movq        J(7), m7        ; store NR7 at J7
    movq        I(0), m0        ; store NR0 at I0
%endmacro

; Following macro does two 4x4 transposes in place.
;
; At entry (we assume):
;
;   r0 = a3 a2 a1 a0
;   I(1) = b3 b2 b1 b0
;   r2 = c3 c2 c1 c0
;   r3 = d3 d2 d1 d0
;
;   r4 = e3 e2 e1 e0
;   r5 = f3 f2 f1 f0
;   r6 = g3 g2 g1 g0
;   r7 = h3 h2 h1 h0
;
; At exit, we have:
;
;   I(0) = d0 c0 b0 a0
;   I(1) = d1 c1 b1 a1
;   I(2) = d2 c2 b2 a2
;   I(3) = d3 c3 b3 a3
;
;   J(4) = h0 g0 f0 e0
;   J(5) = h1 g1 f1 e1
;   J(6) = h2 g2 f2 e2
;   J(7) = h3 g3 f3 e3
;
;  I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
;  J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
;
;  Since r1 is free at entry, we calculate the Js first.
%macro Transpose 0
    movq          m1, m4        ; r1 = e3 e2 e1 e0
    punpcklwd     m4, m5        ; r4 = f1 e1 f0 e0
    movq        I(0), m0        ; save a3 a2 a1 a0
    punpckhwd     m1, m5        ; r1 = f3 e3 f2 e2
    movq          m0, m6        ; r0 = g3 g2 g1 g0
    punpcklwd     m6, m7        ; r6 = h1 g1 h0 g0
    movq          m5, m4        ; r5 = f1 e1 f0 e0
    punpckldq     m4, m6        ; r4 = h0 g0 f0 e0 = R4
    punpckhdq     m5, m6        ; r5 = h1 g1 f1 e1 = R5
    movq          m6, m1        ; r6 = f3 e3 f2 e2
    movq        J(4), m4
    punpckhwd     m0, m7        ; r0 = h3 g3 h2 g2
    movq        J(5), m5
    punpckhdq     m6, m0        ; r6 = h3 g3 f3 e3 = R7
    movq          m4, I(0)      ; r4 = a3 a2 a1 a0
    punpckldq     m1, m0        ; r1 = h2 g2 f2 e2 = R6
    movq          m5, I(1)      ; r5 = b3 b2 b1 b0
    movq          m0, m4        ; r0 = a3 a2 a1 a0
    movq        J(7), m6
    punpcklwd     m0, m5        ; r0 = b1 a1 b0 a0
    movq        J(6), m1
    punpckhwd     m4, m5        ; r4 = b3 a3 b2 a2
    movq          m5, m2        ; r5 = c3 c2 c1 c0
    punpcklwd     m2, m3        ; r2 = d1 c1 d0 c0
    movq          m1, m0        ; r1 = b1 a1 b0 a0
    punpckldq     m0, m2        ; r0 = d0 c0 b0 a0 = R0
    punpckhdq     m1, m2        ; r1 = d1 c1 b1 a1 = R1
    movq          m2, m4        ; r2 = b3 a3 b2 a2
    movq        I(0), m0
    punpckhwd     m5, m3        ; r5 = d3 c3 d2 c2
    movq        I(1), m1
    punpckhdq     m4, m5        ; r4 = d3 c3 b3 a3 = R3
    punpckldq     m2, m5        ; r2 = d2 c2 b2 a2 = R2
    movq        I(3), m4
    movq        I(2), m2
%endmacro

; One full 8-point 1-D IDCT over 8 lanes at once (SSE2 version of
; BeginIDCT + Row/ColumnIDCT).  The caller defines I(x), C(x) and the
; ADD()/SHIFT() hooks: empty for the row pass, round+shift for the
; column pass.
%macro VP3_1D_IDCT_SSE2 0
    movdqa        m2, I(3)      ; xmm2 = i3
    movdqa        m6, C(3)      ; xmm6 = c3
    movdqa        m4, m2        ; xmm4 = i3
    movdqa        m7, I(5)      ; xmm7 = i5
    pmulhw        m4, m6        ; xmm4 = c3 * i3 - i3
    movdqa        m1, C(5)      ; xmm1 = c5
    pmulhw        m6, m7        ; xmm6 = c3 * i5 - i5
    movdqa        m5, m1        ; xmm5 = c5
    pmulhw        m1, m2        ; xmm1 = c5 * i3 - i3
    movdqa        m3, I(1)      ; xmm3 = i1
    pmulhw        m5, m7        ; xmm5 = c5 * i5 - i5
    movdqa        m0, C(1)      ; xmm0 = c1
    paddw         m4, m2        ; xmm4 = c3 * i3
    paddw         m6, m7        ; xmm6 = c3 * i5
    paddw         m2, m1        ; xmm2 = c5 * i3
    movdqa        m1, I(7)      ; xmm1 = i7
    paddw         m7, m5        ; xmm7 = c5 * i5
    movdqa        m5, m0        ; xmm5 = c1
    pmulhw        m0, m3        ; xmm0 = c1 * i1 - i1
    paddsw        m4, m7        ; xmm4 = c3 * i3 + c5 * i5 = C
    pmulhw        m5, m1        ; xmm5 = c1 * i7 - i7
    movdqa        m7, C(7)      ; xmm7 = c7
    psubsw        m6, m2        ; xmm6 = c3 * i5 - c5 * i3 = D
    paddw         m0, m3        ; xmm0 = c1 * i1
    pmulhw        m3, m7        ; xmm3 = c7 * i1
    movdqa        m2, I(2)      ; xmm2 = i2
    pmulhw        m7, m1        ; xmm7 = c7 * i7
    paddw         m5, m1        ; xmm5 = c1 * i7
    movdqa        m1, m2        ; xmm1 = i2
    pmulhw        m2, C(2)      ; xmm2 = i2 * c2 -i2
    psubsw        m3, m5        ; xmm3 = c7 * i1 - c1 * i7 = B
    movdqa        m5, I(6)      ; xmm5 = i6
    paddsw        m0, m7        ; xmm0 = c1 * i1 + c7 * i7 = A
    movdqa        m7, m5        ; xmm7 = i6
    psubsw        m0, m4        ; xmm0 = A - C
    pmulhw        m5, C(2)      ; xmm5 = c2 * i6 - i6
    paddw         m2, m1        ; xmm2 = i2 * c2
    pmulhw        m1, C(6)      ; xmm1 = c6 * i2
    paddsw        m4, m4        ; xmm4 = C + C
    paddsw        m4, m0        ; xmm4 = A + C = C.
    psubsw        m3, m6        ; xmm3 = B - D
    paddw         m5, m7        ; xmm5 = c2 * i6
    paddsw        m6, m6        ; xmm6 = D + D
    pmulhw        m7, C(6)      ; xmm7 = c6 * i6
    paddsw        m6, m3        ; xmm6 = B + D = D.
    movdqa      I(1), m4        ; Save C. at I(1)
    psubsw        m1, m5        ; xmm1 = c6 * i2 - c2 * i6 = H
    movdqa        m4, C(4)      ; xmm4 = C4
    movdqa        m5, m3        ; xmm5 = B - D
    pmulhw        m3, m4        ; xmm3 = ( c4 - 1 ) * ( B - D )
    paddsw        m7, m2        ; xmm7 = c2 * i2 + c6 * i6 = G
    movdqa      I(2), m6        ; save D. at I(2)
    movdqa        m2, m0        ; xmm2 = A - C
    movdqa        m6, I(0)      ; xmm6 = i0
    pmulhw        m0, m4        ; xmm0 = ( c4 - 1 ) * ( A - C )
    paddw         m5, m3        ; xmm5 = c4 * ( B - D ) = B.
    movdqa        m3, I(4)      ; xmm3 = i4
    psubsw        m5, m1        ; xmm5 = B. - H = B..
    paddw         m2, m0        ; xmm2 = c4 * ( A - C ) = A.
    psubsw        m6, m3        ; xmm6 = i0 - i4
    movdqa        m0, m6        ; xmm0 = i0 - i4
    pmulhw        m6, m4        ; xmm6 = ( c4 - 1 ) * ( i0 - i4 )
    paddsw        m3, m3        ; xmm3 = i4 + i4
    paddsw        m1, m1        ; xmm1 = H + H
    paddsw        m3, m0        ; xmm3 = i0 + i4
    paddsw        m1, m5        ; xmm1 = B. + H = H.
    pmulhw        m4, m3        ; xmm4 = ( c4 - 1 ) * ( i0 + i4 )
    paddw         m6, m0        ; xmm6 = c4 * ( i0 - i4 ) = F
    psubsw        m6, m2        ; xmm6 = F - A. = F.
    paddsw        m2, m2        ; xmm2 = A. + A.
    movdqa        m0, I(1)      ; Load C. from I(1)
    paddsw        m2, m6        ; xmm2 = F + A. = A..
    paddw         m4, m3        ; xmm4 = c4 * ( i0 + i4 ) = E
    psubsw        m2, m1        ; xmm2 = A.. - H. = R2
    ADD(m2)                     ; Adjust R2 and R1 before shifting
    paddsw        m1, m1        ; xmm1 = H. + H.
    paddsw        m1, m2        ; xmm1 = A.. + H. = R1
    SHIFT(m2)                   ; xmm2 = op2
    psubsw        m4, m7        ; xmm4 = E - G = E.
    SHIFT(m1)                   ; xmm1 = op1
    movdqa        m3, I(2)      ; Load D. from I(2)
    paddsw        m7, m7        ; xmm7 = G + G
    paddsw        m7, m4        ; xmm7 = E + G = G.
    psubsw        m4, m3        ; xmm4 = E. - D. = R4
    ADD(m4)                     ; Adjust R4 and R3 before shifting
    paddsw        m3, m3        ; xmm3 = D. + D.
    paddsw        m3, m4        ; xmm3 = E. + D. = R3
    SHIFT(m4)                   ; xmm4 = op4
    psubsw        m6, m5        ; xmm6 = F. - B.. = R6
    SHIFT(m3)                   ; xmm3 = op3
    ADD(m6)                     ; Adjust R6 and R5 before shifting
    paddsw        m5, m5        ; xmm5 = B.. + B..
    paddsw        m5, m6        ; xmm5 = F. + B.. = R5
    SHIFT(m6)                   ; xmm6 = op6
    SHIFT(m5)                   ; xmm5 = op5
    psubsw        m7, m0        ; xmm7 = G. - C. = R7
    ADD(m7)                     ; Adjust R7 and R0 before shifting
    paddsw        m0, m0        ; xmm0 = C. + C.
    paddsw        m0, m7        ; xmm0 = G. + C. = R0
    SHIFT(m7)                   ; xmm7 = op7
    SHIFT(m0)                   ; xmm0 = op0
%endmacro

; Store registers %1..%8 to output rows O(0)..O(7)
%macro PUT_BLOCK 8
    movdqa O(0), m%1
    movdqa O(1), m%2
    movdqa O(2), m%3
    movdqa O(3), m%4
    movdqa O(4), m%5
    movdqa O(5), m%6
    movdqa O(6), m%7
    movdqa O(7), m%8
%endmacro

; Full in-place 8x8 IDCT on the int16 block at %1.
; SSE2 path: 1-D pass, 8x8 transpose, second 1-D pass with round/shift.
; MMX path: two RowIDCT+Transpose halves, then two ColumnIDCT halves,
; with I()/J() redefined to address the appropriate half of the block.
%macro VP3_IDCT 1
%if mmsize == 16
%define I(x) [%1+16*x]
%define O(x) [%1+16*x]
%define C(x) [vp3_idct_data+16*(x-1)]
%define SHIFT(x)
%define ADD(x)
        VP3_1D_IDCT_SSE2
%if ARCH_X86_64
        TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
        TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%1], [%1+16]
%endif
        PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7

%define SHIFT(x) psraw x, 4
%define ADD(x)   paddsw x, [pw_8]
        VP3_1D_IDCT_SSE2
        PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
%else ; mmsize == 8
    ; eax = quantized input
    ; ebx = dequantizer matrix
    ; ecx = IDCT constants
    ;  M(I) = ecx + MaskOffset(0) + I * 8
    ;  C(I) = ecx + CosineOffset(32) + (I-1) * 8
    ; edx = output
    ; r0..r7 = mm0..mm7
%define OC_8 [pw_8]
%define C(x) [vp3_idct_data+16*(x-1)]

    ; at this point, function has completed dequantization + dezigzag +
    ; partial transposition; now do the idct itself
%define I(x) [%1+16*x]
%define J(x) [%1+16*x]
    RowIDCT
    Transpose

%define I(x) [%1+16*x+8]
%define J(x) [%1+16*x+8]
    RowIDCT
    Transpose

%define I(x) [%1+16* x]
%define J(x) [%1+16*(x-4)+8]
    ColumnIDCT

%define I(x) [%1+16* x+64]
%define J(x) [%1+16*(x-4)+72]
    ColumnIDCT
%endif ; mmsize == 16/8
%endmacro

; Emit vp3_idct_put / vp3_idct_add for the current instruction set
; (INIT_MMX or INIT_XMM selects mmsize and the register set).
%macro vp3_idct_funcs 0
; vp3_idct_put(uint8_t *dest, stride, int16_t *block)
; r0 = dest, r1 = stride, r2 = 8x8 int16 coefficient block.
; Performs the IDCT, saturates to 8 bits and stores; the coefficient
; block is cleared before returning.
cglobal vp3_idct_put, 3, 4, 9
    VP3_IDCT      r2

    movsxdifnidn  r1, r1d
    mova          m4, [pb_80]
    lea           r3, [r1*3]
%assign %%i 0
%rep 16/mmsize
    mova          m0, [r2+mmsize*0+%%i]
    mova          m1, [r2+mmsize*2+%%i]
    mova          m2, [r2+mmsize*4+%%i]
    mova          m3, [r2+mmsize*6+%%i]
%if mmsize == 8
    packsswb      m0, [r2+mmsize*8+%%i]
    packsswb      m1, [r2+mmsize*10+%%i]
    packsswb      m2, [r2+mmsize*12+%%i]
    packsswb      m3, [r2+mmsize*14+%%i]
%else
    packsswb      m0, [r2+mmsize*1+%%i]
    packsswb      m1, [r2+mmsize*3+%%i]
    packsswb      m2, [r2+mmsize*5+%%i]
    packsswb      m3, [r2+mmsize*7+%%i]
%endif
    ; bias the signed, saturated samples by 128 to get unsigned pixels
    paddb         m0, m4
    paddb         m1, m4
    paddb         m2, m4
    paddb         m3, m4
    movq   [r0     ], m0
%if mmsize == 8
    movq   [r0+r1  ], m1
    movq   [r0+r1*2], m2
    movq   [r0+r3  ], m3
%else
    movhps [r0+r1  ], m0
    movq   [r0+r1*2], m1
    movhps [r0+r3  ], m1
%endif
%if %%i == 0
    lea           r0, [r0+r1*4]
%endif
%if mmsize == 16
    movq   [r0     ], m2
    movhps [r0+r1  ], m2
    movq   [r0+r1*2], m3
    movhps [r0+r3  ], m3
%endif
%assign %%i %%i+8
%endrep

    ; clear the 128-byte coefficient block for the next use
    pxor          m0, m0
%assign %%offset 0
%rep 128/mmsize
    mova [r2+%%offset], m0
%assign %%offset %%offset+mmsize
%endrep
    RET

; vp3_idct_add(uint8_t *dest, stride, int16_t *block)
; Same as vp3_idct_put, but the IDCT result is added to the existing
; pixels (with unsigned saturation on the final pack).
cglobal vp3_idct_add, 3, 4, 9
    VP3_IDCT      r2

    movsxdifnidn  r1, r1d
    lea           r3, [r1*3]
    pxor          m4, m4        ; m4 = 0 for unpacking, and for clearing below
%if mmsize == 16
%assign %%i 0
%rep 2
    movq          m0, [r0]
    movq          m1, [r0+r1]
    movq          m2, [r0+r1*2]
    movq          m3, [r0+r3]
    punpcklbw     m0, m4
    punpcklbw     m1, m4
    punpcklbw     m2, m4
    punpcklbw     m3, m4
    paddsw        m0, [r2+ 0+%%i]
    paddsw        m1, [r2+16+%%i]
    paddsw        m2, [r2+32+%%i]
    paddsw        m3, [r2+48+%%i]
    packuswb      m0, m1
    packuswb      m2, m3
    movq   [r0     ], m0
    movhps [r0+r1  ], m0
    movq   [r0+r1*2], m2
    movhps [r0+r3  ], m2
%if %%i == 0
    lea           r0, [r0+r1*4]
%endif
%assign %%i %%i+64
%endrep
%else
%assign %%i 0
%rep 2
    movq          m0, [r0]
    movq          m1, [r0+r1]
    movq          m2, [r0+r1*2]
    movq          m3, [r0+r3]
    movq          m5, m0
    movq          m6, m1
    movq          m7, m2
    punpcklbw     m0, m4
    punpcklbw     m1, m4
    punpcklbw     m2, m4
    punpckhbw     m5, m4
    punpckhbw     m6, m4
    punpckhbw     m7, m4
    paddsw        m0, [r2+ 0+%%i]
    paddsw        m1, [r2+16+%%i]
    paddsw        m2, [r2+32+%%i]
    paddsw        m5, [r2+64+%%i]
    paddsw        m6, [r2+80+%%i]
    paddsw        m7, [r2+96+%%i]
    packuswb      m0, m5
    movq          m5, m3
    punpcklbw     m3, m4
    punpckhbw     m5, m4
    packuswb      m1, m6
    paddsw        m3, [r2+48+%%i]
    paddsw        m5, [r2+112+%%i]
    packuswb      m2, m7
    packuswb      m3, m5
    movq   [r0     ], m0
    movq   [r0+r1  ], m1
    movq   [r0+r1*2], m2
    movq   [r0+r3  ], m3
%if %%i == 0
    lea           r0, [r0+r1*4]
%endif
%assign %%i %%i+8
%endrep
%endif
    ; clear the coefficient block (m4 is still zero here)
%assign %%i 0
%rep 128/mmsize
    mova     [r2+%%i], m4
%assign %%i %%i+mmsize
%endrep
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
vp3_idct_funcs
%endif

INIT_XMM sse2
vp3_idct_funcs

; Add a broadcast DC value to 4 rows of 8 pixels:
; m0 holds the positive part, m1 the negative part (one of the two is
; zero), so paddusb/psubusb together implement a saturated signed add.
; Uses r2 as the 3*stride offset.
%macro DC_ADD 0
    movq          m2, [r0     ]
    movq          m3, [r0+r1  ]
    paddusb       m2, m0
    movq          m4, [r0+r1*2]
    paddusb       m3, m0
    movq          m5, [r0+r2  ]
    paddusb       m4, m0
    paddusb       m5, m0
    psubusb       m2, m1
    psubusb       m3, m1
    movq   [r0     ], m2
    psubusb       m4, m1
    movq   [r0+r1  ], m3
    psubusb       m5, m1
    movq   [r0+r1*2], m4
    movq   [r0+r2  ], m5
%endmacro

INIT_MMX mmxext
; vp3_idct_dc_add(uint8_t *dest, stride, int16_t *block)
; Fast path for a DC-only block: computes dc = (block[0] + 15) >> 5,
; clears block[0], and adds dc to all 64 pixels of the 8x8 destination.
cglobal vp3_idct_dc_add, 3, 4
%if ARCH_X86_64
    movsxd        r1, r1d
%endif
    movsx         r3, word [r2]
    mov    word [r2], 0         ; clear the DC coefficient
    lea           r2, [r1*3]    ; r2 now holds 3*stride (flim no longer needed)
    add           r3, 15
    sar           r3, 5         ; dc = (dc + 15) >> 5
    movd          m0, r3d
    pshufw        m0, m0, 0x0   ; broadcast dc to 4 words
    pxor          m1, m1
    psubw         m1, m0        ; m1 = -dc
    packuswb      m0, m0        ; m0 = max(dc, 0)  in 8 bytes
    packuswb      m1, m1        ; m1 = max(-dc, 0) in 8 bytes
    DC_ADD                      ; rows 0-3
    lea           r0, [r0+r1*4]
    DC_ADD                      ; rows 4-7
    RET