;******************************************************************************
;* MMX/SSE2-optimized functions for the VP3 decoder
;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

; MMX-optimized functions cribbed from the original VP3 source code.

SECTION_RODATA

vp3_idct_data: times 8 dw 64277
               times 8 dw 60547
               times 8 dw 54491
               times 8 dw 46341
               times 8 dw 36410
               times 8 dw 25080
               times 8 dw 12785
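; The seven rows above are the IDCT cosine constants in 16.16 fixed point:
; C(i) = round(cos(i*pi/16) * 65536) for i = 1..7 (e.g. 64277 ~ 0.98079 * 65536).
; They do not fit in a signed word, so each is effectively stored as
; C(i) - 65536; see the note on pmulhw above BeginIDCT.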

pb_7:  times 8 db 0x07
pb_1F: times 8 db 0x1f
pb_81: times 8 db 0x81

cextern pb_1
cextern pb_3
cextern pb_80

cextern pw_8

SECTION .text

; this is off by one or two for some cases when filter_limit is greater than 63
; in:  p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1
; out: p1 in mm4, p2 in mm3
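;
; Scalar sketch of what the macro computes per pixel (cf. the C version in
; libavcodec/vp3dsp.c):
;
;     d  = (p0 - p3 + 3 * (p2 - p1) + 4) >> 3;
;     d  = bound(d);        // identity up to the limit, then ramps back to 0
;     p1 = av_clip_uint8(p1 + d);
;     p2 = av_clip_uint8(p2 - d);
;
; Below, d is built as a biased unsigned byte (d + 129, hence pb_81) and split
; into its positive part (m7) and negative part (m6); the bound() ramp is then
; applied to both with the pminub/paddb/pminub/psubb sequence against flim.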
%macro VP3_LOOP_FILTER 0
    movq          m7, m6
    pand          m6, [pb_7]    ; p0&7
    psrlw         m7, 3
    pand          m7, [pb_1F]   ; p0>>3
    movq          m3, m2        ; p2
    pxor          m2, m4
    pand          m2, [pb_1]    ; (p2^p1)&1
    movq          m5, m2
    paddb         m2, m2
    paddb         m2, m5        ; 3*(p2^p1)&1
    paddb         m2, m6        ; extra bits lost in shifts
    pcmpeqb       m0, m0
    pxor          m1, m0        ; 255 - p3
    pavgb         m1, m2        ; (256 - p3 + extrabits) >> 1
    pxor          m0, m4        ; 255 - p1
    pavgb         m0, m3        ; (256 + p2-p1) >> 1
    paddb         m1, [pb_3]
    pavgb         m1, m0        ; 128+2+(   p2-p1  - p3) >> 2
    pavgb         m1, m0        ; 128+1+(3*(p2-p1) - p3) >> 3
    paddusb       m7, m1        ; d+128+1
    movq          m6, [pb_81]
    psubusb       m6, m7
    psubusb       m7, [pb_81]

    movq          m5, [r2+516]  ; flim
    pminub        m6, m5
    pminub        m7, m5
    movq          m0, m6
    movq          m1, m7
    paddb         m6, m6
    paddb         m7, m7
    pminub        m6, m5
    pminub        m7, m5
    psubb         m6, m0
    psubb         m7, m1
    paddusb       m4, m7
    psubusb       m4, m6
    psubusb       m3, m7
    paddusb       m3, m6
%endmacro

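; Scatter the four word (pixel-pair) lanes of %1 across four consecutive rows,
; straddling the vertical edge at r0-1; r1 is the stride, r3 = 3*stride.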
%macro STORE_4_WORDS 1
    movd         r2d, %1
    mov  [r0     -1], r2w
    psrlq         %1, 32
    shr           r2, 16
    mov  [r0+r1  -1], r2w
    movd         r2d, %1
    mov  [r0+r1*2-1], r2w
    shr           r2, 16
    mov  [r0+r3  -1], r2w
%endmacro

INIT_MMX mmxext
cglobal vp3_v_loop_filter, 3, 4
%if ARCH_X86_64
    movsxd        r1, r1d
%endif
    mov           r3, r1
    neg           r1
    movq          m6, [r0+r1*2]
    movq          m4, [r0+r1  ]
    movq          m2, [r0     ]
    movq          m1, [r0+r3  ]

    VP3_LOOP_FILTER

    movq     [r0+r1], m4
    movq     [r0   ], m3
    RET

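; For the horizontal filter the edge is vertical: gather two pixels on each
; side of it from eight rows, transpose them into registers, run the same
; filter, then interleave with SBUTTERFLY and scatter the words back out.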
cglobal vp3_h_loop_filter, 3, 4
%if ARCH_X86_64
    movsxd        r1, r1d
%endif
    lea           r3, [r1*3]

    movd          m6, [r0     -2]
    movd          m4, [r0+r1  -2]
    movd          m2, [r0+r1*2-2]
    movd          m1, [r0+r3  -2]
    lea           r0, [r0+r1*4  ]
    punpcklbw     m6, [r0     -2]
    punpcklbw     m4, [r0+r1  -2]
    punpcklbw     m2, [r0+r1*2-2]
    punpcklbw     m1, [r0+r3  -2]
    sub           r0, r3
    sub           r0, r1

    TRANSPOSE4x4B  6, 4, 2, 1, 0
    VP3_LOOP_FILTER
    SBUTTERFLY    bw, 4, 3, 5

    STORE_4_WORDS m4
    lea           r0, [r0+r1*4  ]
    STORE_4_WORDS m3
    RET

; from original comments: The Macro does IDct on 4 1-D Dcts
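;
; Note on the multiplies below: each cosine constant c is stored as the full
; 16.16 value, which as a signed word reads as c - 65536, so pmulhw yields
; (c*x >> 16) - x and the paddw that follows restores the missing x.  A scalar
; sketch, with a hypothetical helper name for illustration:
;
;     // c = round(65536 * cos(k*pi/16)); the word in memory holds c - 65536
;     static int16_t mul_cos(int16_t x, int32_t c)
;     {
;         int16_t hi = (x * (c - 65536)) >> 16; // what pmulhw computes
;         return hi + x;                        // the paddw fix-up: (c*x) >> 16
;     }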
%macro BeginIDCT 0
    movq          m2, I(3)
    movq          m6, C(3)
    movq          m4, m2
    movq          m7, J(5)
    pmulhw        m4, m6        ; r4 = c3*i3 - i3
    movq          m1, C(5)
    pmulhw        m6, m7        ; r6 = c3*i5 - i5
    movq          m5, m1
    pmulhw        m1, m2        ; r1 = c5*i3 - i3
    movq          m3, I(1)
    pmulhw        m5, m7        ; r5 = c5*i5 - i5
    movq          m0, C(1)
    paddw         m4, m2        ; r4 = c3*i3
    paddw         m6, m7        ; r6 = c3*i5
    paddw         m2, m1        ; r2 = c5*i3
    movq          m1, J(7)
    paddw         m7, m5        ; r7 = c5*i5
    movq          m5, m0        ; r5 = c1
    pmulhw        m0, m3        ; r0 = c1*i1 - i1
    paddsw        m4, m7        ; r4 = C = c3*i3 + c5*i5
    pmulhw        m5, m1        ; r5 = c1*i7 - i7
    movq          m7, C(7)
    psubsw        m6, m2        ; r6 = D = c3*i5 - c5*i3
    paddw         m0, m3        ; r0 = c1*i1
    pmulhw        m3, m7        ; r3 = c7*i1
    movq          m2, I(2)
    pmulhw        m7, m1        ; r7 = c7*i7
    paddw         m5, m1        ; r5 = c1*i7
    movq          m1, m2        ; r1 = i2
    pmulhw        m2, C(2)      ; r2 = c2*i2 - i2
    psubsw        m3, m5        ; r3 = B = c7*i1 - c1*i7
    movq          m5, J(6)
    paddsw        m0, m7        ; r0 = A = c1*i1 + c7*i7
    movq          m7, m5        ; r7 = i6
    psubsw        m0, m4        ; r0 = A - C
    pmulhw        m5, C(2)      ; r5 = c2*i6 - i6
    paddw         m2, m1        ; r2 = c2*i2
    pmulhw        m1, C(6)      ; r1 = c6*i2
    paddsw        m4, m4        ; r4 = C + C
    paddsw        m4, m0        ; r4 = C. = A + C
    psubsw        m3, m6        ; r3 = B - D
    paddw         m5, m7        ; r5 = c2*i6
    paddsw        m6, m6        ; r6 = D + D
    pmulhw        m7, C(6)      ; r7 = c6*i6
    paddsw        m6, m3        ; r6 = D. = B + D
    movq        I(1), m4        ; save C. at I(1)
    psubsw        m1, m5        ; r1 = H = c6*i2 - c2*i6
    movq          m4, C(4)
    movq          m5, m3        ; r5 = B - D
    pmulhw        m3, m4        ; r3 = (c4 - 1) * (B - D)
    paddsw        m7, m2        ; r7 = G = c2*i2 + c6*i6
    movq        I(2), m6        ; save D. at I(2)
    movq          m2, m0        ; r2 = A - C
    movq          m6, I(0)
    pmulhw        m0, m4        ; r0 = (c4 - 1) * (A - C)
    paddw         m5, m3        ; r5 = B. = c4 * (B - D)
    movq          m3, J(4)
    psubsw        m5, m1        ; r5 = B.. = B. - H
    paddw         m2, m0        ; r2 = A. = c4 * (A - C)
    psubsw        m6, m3        ; r6 = i0 - i4
    movq          m0, m6
    pmulhw        m6, m4        ; r6 = (c4 - 1) * (i0 - i4)
    paddsw        m3, m3        ; r3 = i4 + i4
    paddsw        m1, m1        ; r1 = H + H
    paddsw        m3, m0        ; r3 = i0 + i4
    paddsw        m1, m5        ; r1 = H. = B. + H
    pmulhw        m4, m3        ; r4 = (c4 - 1) * (i0 + i4)
    paddsw        m6, m0        ; r6 = F = c4 * (i0 - i4)
    psubsw        m6, m2        ; r6 = F. = F - A.
    paddsw        m2, m2        ; r2 = A. + A.
    movq          m0, I(1)      ; r0 = C.
    paddsw        m2, m6        ; r2 = A.. = F + A.
    paddw         m4, m3        ; r4 = E = c4 * (i0 + i4)
    psubsw        m2, m1        ; r2 = R2 = A.. - H.
%endmacro

; RowIDCT gets ready to transpose
%macro RowIDCT 0
    BeginIDCT
    movq          m3, I(2)      ; r3 = D.
    psubsw        m4, m7        ; r4 = E. = E - G
    paddsw        m1, m1        ; r1 = H. + H.
    paddsw        m7, m7        ; r7 = G + G
    paddsw        m1, m2        ; r1 = R1 = A.. + H.
    paddsw        m7, m4        ; r7 = G. = E + G
    psubsw        m4, m3        ; r4 = R4 = E. - D.
    paddsw        m3, m3
    psubsw        m6, m5        ; r6 = R6 = F. - B..
    paddsw        m5, m5
    paddsw        m3, m4        ; r3 = R3 = E. + D.
    paddsw        m5, m6        ; r5 = R5 = F. + B..
    psubsw        m7, m0        ; r7 = R7 = G. - C.
    paddsw        m0, m0
    movq        I(1), m1        ; save R1
    paddsw        m0, m7        ; r0 = R0 = G. + C.
%endmacro

; Column IDCT normalizes and stores final results
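; (the "paddsw x, OC_8" / "psraw x, 4" pairs round each result as (x + 8) >> 4;
; each bias is added before the partner value is formed by doubling and adding,
; so one paddsw rounds two outputs at once, hence "adjust R2 (and R1)")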
%macro ColumnIDCT 0
    BeginIDCT
    paddsw        m2, OC_8      ; adjust R2 (and R1) for shift
    paddsw        m1, m1        ; r1 = H. + H.
    paddsw        m1, m2        ; r1 = R1 = A.. + H.
    psraw         m2, 4         ; r2 = NR2
    psubsw        m4, m7        ; r4 = E. = E - G
    psraw         m1, 4         ; r1 = NR1
    movq          m3, I(2)      ; r3 = D.
    paddsw        m7, m7        ; r7 = G + G
    movq        I(2), m2        ; store NR2 at I2
    paddsw        m7, m4        ; r7 = G. = E + G
    movq        I(1), m1        ; store NR1 at I1
    psubsw        m4, m3        ; r4 = R4 = E. - D.
    paddsw        m4, OC_8      ; adjust R4 (and R3) for shift
    paddsw        m3, m3        ; r3 = D. + D.
    paddsw        m3, m4        ; r3 = R3 = E. + D.
    psraw         m4, 4         ; r4 = NR4
    psubsw        m6, m5        ; r6 = R6 = F. - B..
    psraw         m3, 4         ; r3 = NR3
    paddsw        m6, OC_8      ; adjust R6 (and R5) for shift
    paddsw        m5, m5        ; r5 = B.. + B..
    paddsw        m5, m6        ; r5 = R5 = F. + B..
    psraw         m6, 4         ; r6 = NR6
    movq        J(4), m4        ; store NR4 at J4
    psraw         m5, 4         ; r5 = NR5
    movq        I(3), m3        ; store NR3 at I3
    psubsw        m7, m0        ; r7 = R7 = G. - C.
    paddsw        m7, OC_8      ; adjust R7 (and R0) for shift
    paddsw        m0, m0        ; r0 = C. + C.
    paddsw        m0, m7        ; r0 = R0 = G. + C.
    psraw         m7, 4         ; r7 = NR7
    movq        J(6), m6        ; store NR6 at J6
    psraw         m0, 4         ; r0 = NR0
    movq        J(5), m5        ; store NR5 at J5
    movq        J(7), m7        ; store NR7 at J7
    movq        I(0), m0        ; store NR0 at I0
%endmacro

; The following macro does two 4x4 transposes in place.
;
; At entry (we assume):
;
;   r0 = a3 a2 a1 a0
;   I(1) = b3 b2 b1 b0
;   r2 = c3 c2 c1 c0
;   r3 = d3 d2 d1 d0
;
;   r4 = e3 e2 e1 e0
;   r5 = f3 f2 f1 f0
;   r6 = g3 g2 g1 g0
;   r7 = h3 h2 h1 h0
;
; At exit, we have:
;
;   I(0) = d0 c0 b0 a0
;   I(1) = d1 c1 b1 a1
;   I(2) = d2 c2 b2 a2
;   I(3) = d3 c3 b3 a3
;
;   J(4) = h0 g0 f0 e0
;   J(5) = h1 g1 f1 e1
;   J(6) = h2 g2 f2 e2
;   J(7) = h3 g3 f3 e3
;
;  I(0) I(1) I(2) I(3)  is the transpose of r0 I(1) r2 r3.
;  J(4) J(5) J(6) J(7)  is the transpose of r4 r5 r6 r7.
;
;  Since r1 is free at entry, we calculate the Js first.
%macro Transpose 0
    movq          m1, m4        ; r1 = e3 e2 e1 e0
    punpcklwd     m4, m5        ; r4 = f1 e1 f0 e0
    movq        I(0), m0        ; save a3 a2 a1 a0
    punpckhwd     m1, m5        ; r1 = f3 e3 f2 e2
    movq          m0, m6        ; r0 = g3 g2 g1 g0
    punpcklwd     m6, m7        ; r6 = h1 g1 h0 g0
    movq          m5, m4        ; r5 = f1 e1 f0 e0
    punpckldq     m4, m6        ; r4 = h0 g0 f0 e0 = R4
    punpckhdq     m5, m6        ; r5 = h1 g1 f1 e1 = R5
    movq          m6, m1        ; r6 = f3 e3 f2 e2
    movq        J(4), m4
    punpckhwd     m0, m7        ; r0 = h3 g3 h2 g2
    movq        J(5), m5
    punpckhdq     m6, m0        ; r6 = h3 g3 f3 e3 = R7
    movq          m4, I(0)      ; r4 = a3 a2 a1 a0
    punpckldq     m1, m0        ; r1 = h2 g2 f2 e2 = R6
    movq          m5, I(1)      ; r5 = b3 b2 b1 b0
    movq          m0, m4        ; r0 = a3 a2 a1 a0
    movq        J(7), m6
    punpcklwd     m0, m5        ; r0 = b1 a1 b0 a0
    movq        J(6), m1
    punpckhwd     m4, m5        ; r4 = b3 a3 b2 a2
    movq          m5, m2        ; r5 = c3 c2 c1 c0
    punpcklwd     m2, m3        ; r2 = d1 c1 d0 c0
    movq          m1, m0        ; r1 = b1 a1 b0 a0
    punpckldq     m0, m2        ; r0 = d0 c0 b0 a0 = R0
    punpckhdq     m1, m2        ; r1 = d1 c1 b1 a1 = R1
    movq          m2, m4        ; r2 = b3 a3 b2 a2
    movq        I(0), m0
    punpckhwd     m5, m3        ; r5 = d3 c3 d2 c2
    movq        I(1), m1
    punpckhdq     m4, m5        ; r4 = d3 c3 b3 a3 = R3
    punpckldq     m2, m5        ; r2 = d2 c2 b2 a2 = R2
    movq        I(3), m4
    movq        I(2), m2
%endmacro

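; SSE2 variant of Row/ColumnIDCT: one pass transforms all eight columns at
; once, so the 4x4 splitting above is unnecessary.  ADD() and SHIFT() are
; supplied by the caller: empty for the row pass, "paddsw x, [pw_8]" and
; "psraw x, 4" for the rounding in the final column pass (see VP3_IDCT below).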
%macro VP3_1D_IDCT_SSE2 0
    movdqa        m2, I(3)      ; xmm2 = i3
    movdqa        m6, C(3)      ; xmm6 = c3
    movdqa        m4, m2        ; xmm4 = i3
    movdqa        m7, I(5)      ; xmm7 = i5
    pmulhw        m4, m6        ; xmm4 = c3 * i3 - i3
    movdqa        m1, C(5)      ; xmm1 = c5
    pmulhw        m6, m7        ; xmm6 = c3 * i5 - i5
    movdqa        m5, m1        ; xmm5 = c5
    pmulhw        m1, m2        ; xmm1 = c5 * i3 - i3
    movdqa        m3, I(1)      ; xmm3 = i1
    pmulhw        m5, m7        ; xmm5 = c5 * i5 - i5
    movdqa        m0, C(1)      ; xmm0 = c1
    paddw         m4, m2        ; xmm4 = c3 * i3
    paddw         m6, m7        ; xmm6 = c3 * i5
    paddw         m2, m1        ; xmm2 = c5 * i3
    movdqa        m1, I(7)      ; xmm1 = i7
    paddw         m7, m5        ; xmm7 = c5 * i5
    movdqa        m5, m0        ; xmm5 = c1
    pmulhw        m0, m3        ; xmm0 = c1 * i1 - i1
    paddsw        m4, m7        ; xmm4 = c3 * i3 + c5 * i5 = C
    pmulhw        m5, m1        ; xmm5 = c1 * i7 - i7
    movdqa        m7, C(7)      ; xmm7 = c7
    psubsw        m6, m2        ; xmm6 = c3 * i5 - c5 * i3 = D
    paddw         m0, m3        ; xmm0 = c1 * i1
    pmulhw        m3, m7        ; xmm3 = c7 * i1
    movdqa        m2, I(2)      ; xmm2 = i2
    pmulhw        m7, m1        ; xmm7 = c7 * i7
    paddw         m5, m1        ; xmm5 = c1 * i7
    movdqa        m1, m2        ; xmm1 = i2
    pmulhw        m2, C(2)      ; xmm2 = c2 * i2 - i2
    psubsw        m3, m5        ; xmm3 = c7 * i1 - c1 * i7 = B
    movdqa        m5, I(6)      ; xmm5 = i6
    paddsw        m0, m7        ; xmm0 = c1 * i1 + c7 * i7 = A
    movdqa        m7, m5        ; xmm7 = i6
    psubsw        m0, m4        ; xmm0 = A - C
    pmulhw        m5, C(2)      ; xmm5 = c2 * i6 - i6
    paddw         m2, m1        ; xmm2 = c2 * i2
    pmulhw        m1, C(6)      ; xmm1 = c6 * i2
    paddsw        m4, m4        ; xmm4 = C + C
    paddsw        m4, m0        ; xmm4 = A + C = C.
    psubsw        m3, m6        ; xmm3 = B - D
    paddw         m5, m7        ; xmm5 = c2 * i6
    paddsw        m6, m6        ; xmm6 = D + D
    pmulhw        m7, C(6)      ; xmm7 = c6 * i6
    paddsw        m6, m3        ; xmm6 = B + D = D.
    movdqa      I(1), m4        ; save C. at I(1)
    psubsw        m1, m5        ; xmm1 = c6 * i2 - c2 * i6 = H
    movdqa        m4, C(4)      ; xmm4 = c4
    movdqa        m5, m3        ; xmm5 = B - D
    pmulhw        m3, m4        ; xmm3 = ( c4 - 1 ) * ( B - D )
    paddsw        m7, m2        ; xmm7 = c2 * i2 + c6 * i6 = G
    movdqa      I(2), m6        ; save D. at I(2)
    movdqa        m2, m0        ; xmm2 = A - C
    movdqa        m6, I(0)      ; xmm6 = i0
    pmulhw        m0, m4        ; xmm0 = ( c4 - 1 ) * ( A - C )
    paddw         m5, m3        ; xmm5 = c4 * ( B - D ) = B.
    movdqa        m3, I(4)      ; xmm3 = i4
    psubsw        m5, m1        ; xmm5 = B. - H = B..
    paddw         m2, m0        ; xmm2 = c4 * ( A - C ) = A.
    psubsw        m6, m3        ; xmm6 = i0 - i4
    movdqa        m0, m6        ; xmm0 = i0 - i4
    pmulhw        m6, m4        ; xmm6 = ( c4 - 1 ) * ( i0 - i4 )
    paddsw        m3, m3        ; xmm3 = i4 + i4
    paddsw        m1, m1        ; xmm1 = H + H
    paddsw        m3, m0        ; xmm3 = i0 + i4
    paddsw        m1, m5        ; xmm1 = B. + H = H.
    pmulhw        m4, m3        ; xmm4 = ( c4 - 1 ) * ( i0 + i4 )
    paddw         m6, m0        ; xmm6 = c4 * ( i0 - i4 ) = F
    psubsw        m6, m2        ; xmm6 = F - A. = F.
    paddsw        m2, m2        ; xmm2 = A. + A.
    movdqa        m0, I(1)      ; load C. from I(1)
    paddsw        m2, m6        ; xmm2 = F + A. = A..
    paddw         m4, m3        ; xmm4 = c4 * ( i0 + i4 ) = E
    psubsw        m2, m1        ; xmm2 = A.. - H. = R2
    ADD(m2)                     ; adjust R2 and R1 before shifting
    paddsw        m1, m1        ; xmm1 = H. + H.
    paddsw        m1, m2        ; xmm1 = A.. + H. = R1
    SHIFT(m2)                   ; xmm2 = op2
    psubsw        m4, m7        ; xmm4 = E - G = E.
    SHIFT(m1)                   ; xmm1 = op1
    movdqa        m3, I(2)      ; load D. from I(2)
    paddsw        m7, m7        ; xmm7 = G + G
    paddsw        m7, m4        ; xmm7 = E + G = G.
    psubsw        m4, m3        ; xmm4 = E. - D. = R4
    ADD(m4)                     ; adjust R4 and R3 before shifting
    paddsw        m3, m3        ; xmm3 = D. + D.
    paddsw        m3, m4        ; xmm3 = E. + D. = R3
    SHIFT(m4)                   ; xmm4 = op4
    psubsw        m6, m5        ; xmm6 = F. - B.. = R6
    SHIFT(m3)                   ; xmm3 = op3
    ADD(m6)                     ; adjust R6 and R5 before shifting
    paddsw        m5, m5        ; xmm5 = B.. + B..
    paddsw        m5, m6        ; xmm5 = F. + B.. = R5
    SHIFT(m6)                   ; xmm6 = op6
    SHIFT(m5)                   ; xmm5 = op5
    psubsw        m7, m0        ; xmm7 = G. - C. = R7
    ADD(m7)                     ; adjust R7 and R0 before shifting
    paddsw        m0, m0        ; xmm0 = C. + C.
    paddsw        m0, m7        ; xmm0 = G. + C. = R0
    SHIFT(m7)                   ; xmm7 = op7
    SHIFT(m0)                   ; xmm0 = op0
%endmacro

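; Store the eight transformed rows held in the given registers back to the
; block buffer.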
%macro PUT_BLOCK 8
    movdqa      O(0), m%1
    movdqa      O(1), m%2
    movdqa      O(2), m%3
    movdqa      O(3), m%4
    movdqa      O(4), m%5
    movdqa      O(5), m%6
    movdqa      O(6), m%7
    movdqa      O(7), m%8
%endmacro

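; Full 2-D IDCT of the 8x8 block of words at %1.  The SSE2 path does one row
; pass, an 8x8 transpose and one column pass with (x + 8) >> 4 rounding; the
; MMX path works on 4x4 quadrants instead, with two RowIDCT+Transpose passes
; followed by two ColumnIDCT passes (see the I()/J() redefinitions below).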
%macro VP3_IDCT 1
%if mmsize == 16
%define I(x) [%1+16*x]
%define O(x) [%1+16*x]
%define C(x) [vp3_idct_data+16*(x-1)]
%define SHIFT(x)
%define ADD(x)
        VP3_1D_IDCT_SSE2
%if ARCH_X86_64
        TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
        TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%1], [%1+16]
%endif
        PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7

%define SHIFT(x) psraw  x, 4
%define ADD(x)   paddsw x, [pw_8]
        VP3_1D_IDCT_SSE2
        PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
%else ; mmsize == 8
    ; eax = quantized input
    ; ebx = dequantizer matrix
    ; ecx = IDCT constants
    ;  M(I) = ecx + MaskOffset(0) + I * 8
    ;  C(I) = ecx + CosineOffset(32) + (I-1) * 8
    ; edx = output
    ; r0..r7 = mm0..mm7
%define OC_8 [pw_8]
%define C(x) [vp3_idct_data+16*(x-1)]

    ; at this point, the function has completed dequantization + dezigzag +
    ; partial transposition; now do the idct itself
%define I(x) [%1+16*x]
%define J(x) [%1+16*x]
    RowIDCT
    Transpose

%define I(x) [%1+16*x+8]
%define J(x) [%1+16*x+8]
    RowIDCT
    Transpose

%define I(x) [%1+16* x]
%define J(x) [%1+16*(x-4)+8]
    ColumnIDCT

%define I(x) [%1+16* x   +64]
%define J(x) [%1+16*(x-4)+72]
    ColumnIDCT
%endif ; mmsize == 16/8
%endmacro

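; vp3_idct_put(dest, stride, block): full IDCT, then pack the words to signed
; bytes with saturation and bias them into the unsigned pixel range by adding
; 0x80 (pb_80).  vp3_idct_add adds the IDCT result to the existing pixels
; instead.  Both clear the coefficient block once it has been read.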
%macro vp3_idct_funcs 0
cglobal vp3_idct_put, 3, 4, 9
    VP3_IDCT      r2

    movsxdifnidn  r1, r1d
    mova          m4, [pb_80]
    lea           r3, [r1*3]
%assign %%i 0
%rep 16/mmsize
    mova          m0, [r2+mmsize*0+%%i]
    mova          m1, [r2+mmsize*2+%%i]
    mova          m2, [r2+mmsize*4+%%i]
    mova          m3, [r2+mmsize*6+%%i]
%if mmsize == 8
    packsswb      m0, [r2+mmsize*8+%%i]
    packsswb      m1, [r2+mmsize*10+%%i]
    packsswb      m2, [r2+mmsize*12+%%i]
    packsswb      m3, [r2+mmsize*14+%%i]
%else
    packsswb      m0, [r2+mmsize*1+%%i]
    packsswb      m1, [r2+mmsize*3+%%i]
    packsswb      m2, [r2+mmsize*5+%%i]
    packsswb      m3, [r2+mmsize*7+%%i]
%endif
    paddb         m0, m4
    paddb         m1, m4
    paddb         m2, m4
    paddb         m3, m4
    movq   [r0     ], m0
%if mmsize == 8
    movq   [r0+r1  ], m1
    movq   [r0+r1*2], m2
    movq   [r0+r3  ], m3
%else
    movhps [r0+r1  ], m0
    movq   [r0+r1*2], m1
    movhps [r0+r3  ], m1
%endif
%if %%i == 0
    lea           r0, [r0+r1*4]
%endif
%if mmsize == 16
    movq   [r0     ], m2
    movhps [r0+r1  ], m2
    movq   [r0+r1*2], m3
    movhps [r0+r3  ], m3
%endif
%assign %%i %%i+8
%endrep

    pxor          m0, m0
%assign %%offset 0
%rep 128/mmsize
    mova [r2+%%offset], m0
%assign %%offset %%offset+mmsize
%endrep
    RET

cglobal vp3_idct_add, 3, 4, 9
    VP3_IDCT      r2

    movsxdifnidn  r1, r1d
    lea           r3, [r1*3]
    pxor          m4, m4
%if mmsize == 16
%assign %%i 0
%rep 2
    movq          m0, [r0]
    movq          m1, [r0+r1]
    movq          m2, [r0+r1*2]
    movq          m3, [r0+r3]
    punpcklbw     m0, m4
    punpcklbw     m1, m4
    punpcklbw     m2, m4
    punpcklbw     m3, m4
    paddsw        m0, [r2+ 0+%%i]
    paddsw        m1, [r2+16+%%i]
    paddsw        m2, [r2+32+%%i]
    paddsw        m3, [r2+48+%%i]
    packuswb      m0, m1
    packuswb      m2, m3
    movq   [r0     ], m0
    movhps [r0+r1  ], m0
    movq   [r0+r1*2], m2
    movhps [r0+r3  ], m2
%if %%i == 0
    lea           r0, [r0+r1*4]
%endif
%assign %%i %%i+64
%endrep
%else
%assign %%i 0
%rep 2
    movq          m0, [r0]
    movq          m1, [r0+r1]
    movq          m2, [r0+r1*2]
    movq          m3, [r0+r3]
    movq          m5, m0
    movq          m6, m1
    movq          m7, m2
    punpcklbw     m0, m4
    punpcklbw     m1, m4
    punpcklbw     m2, m4
    punpckhbw     m5, m4
    punpckhbw     m6, m4
    punpckhbw     m7, m4
    paddsw        m0, [r2+ 0+%%i]
    paddsw        m1, [r2+16+%%i]
    paddsw        m2, [r2+32+%%i]
    paddsw        m5, [r2+64+%%i]
    paddsw        m6, [r2+80+%%i]
    paddsw        m7, [r2+96+%%i]
    packuswb      m0, m5
    movq          m5, m3
    punpcklbw     m3, m4
    punpckhbw     m5, m4
    packuswb      m1, m6
    paddsw        m3, [r2+48+%%i]
    paddsw        m5, [r2+112+%%i]
    packuswb      m2, m7
    packuswb      m3, m5
    movq   [r0     ], m0
    movq   [r0+r1  ], m1
    movq   [r0+r1*2], m2
    movq   [r0+r3  ], m3
%if %%i == 0
    lea           r0, [r0+r1*4]
%endif
%assign %%i %%i+8
%endrep
%endif
%assign %%i 0
%rep 128/mmsize
    mova    [r2+%%i], m4
%assign %%i %%i+mmsize
%endrep
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
vp3_idct_funcs
%endif

INIT_XMM sse2
vp3_idct_funcs

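; Add a replicated DC value to a 4-row slice with unsigned saturation.  m0
; holds the positive part of dc and m1 the negative part, each saturated to
; unsigned bytes, so the paddusb/psubusb pair amounts to a saturating signed
; add of dc.  Here r2 holds 3*stride (it is free once the DC word is read).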
%macro DC_ADD 0
    movq          m2, [r0     ]
    movq          m3, [r0+r1  ]
    paddusb       m2, m0
    movq          m4, [r0+r1*2]
    paddusb       m3, m0
    movq          m5, [r0+r2  ]
    paddusb       m4, m0
    paddusb       m5, m0
    psubusb       m2, m1
    psubusb       m3, m1
    movq   [r0     ], m2
    psubusb       m4, m1
    movq   [r0+r1  ], m3
    psubusb       m5, m1
    movq   [r0+r1*2], m4
    movq   [r0+r2  ], m5
%endmacro

INIT_MMX mmxext
cglobal vp3_idct_dc_add, 3, 4
%if ARCH_X86_64
    movsxd        r1, r1d
%endif
    movsx         r3, word [r2]
    mov    word [r2], 0          ; clear the DC coefficient
    lea           r2, [r1*3]
    add           r3, 15
    sar           r3, 5          ; dc = (block[0] + 15) >> 5
    movd          m0, r3d
    pshufw        m0, m0, 0x0
    pxor          m1, m1
    psubw         m1, m0
    packuswb      m0, m0         ; m0 = clamp(dc, 0, 255) in all bytes
    packuswb      m1, m1         ; m1 = clamp(-dc, 0, 255) in all bytes
    DC_ADD
    lea           r0, [r0+r1*4]
    DC_ADD
    RET