1;******************************************************************************
2;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders
3;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
4;*
5;* This file is part of FFmpeg.
6;*
7;* FFmpeg is free software; you can redistribute it and/or
8;* modify it under the terms of the GNU Lesser General Public
9;* License as published by the Free Software Foundation; either
10;* version 2.1 of the License, or (at your option) any later version.
11;*
12;* FFmpeg is distributed in the hope that it will be useful,
13;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15;* Lesser General Public License for more details.
16;*
17;* You should have received a copy of the GNU Lesser General Public
18;* License along with FFmpeg; if not, write to the Free Software
19;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20;******************************************************************************
21
22%include "libavutil/x86/x86util.asm"
23
SECTION_RODATA

; Multipliers for the first (row) pass, one 8-byte group of four identical
; words per factor. ROW_TRANSFORM reads them at byte offsets +0, +8 and +16.
pw_row_coeffs:  dw 13, 13, 13, 13           ; +0 : even terms (b0 +/- b2)
                dw 17, 17, 17, 17           ; +8 : odd terms, factor 17
                dw  7,  7,  7,  7           ; +16: odd terms, factor  7

; Two dwords of 0x200: the rounder ROW_TRANSFORM leaves in mm5 and that
; COL_TRANSFORM adds before its arithmetic shift right by 10.
pd_512:         dd 0x200, 0x200

; Word pairs consumed by pmaddwd in the second (column) pass.
; The groups at +0 and +8 are the ones rv34_idct_add passes to COL_TRANSFORM;
; the groups at +16 and +24 are not referenced by the code in this section.
pw_col_coeffs:  dw 13,  13,  13, -13        ; +0 : 13*c0+13*c2 | 13*c0-13*c2
                dw 17,   7,   7, -17        ; +8 : 17*c1+ 7*c3 |  7*c1-17*c3
                dw 13, -13,  13,  13        ; +16
                dw -7,  17, -17,  -7        ; +24
33
34SECTION .text
35
; Scale a DC coefficient without a rounding term:
;     %1 = (%1 * 13*13*3) >> 11      (arithmetic shift)
; %1 is a general-purpose register that is updated in place.
; The arithmetic flags are clobbered.
%macro IDCT_DC_NOROUND 1
    imul   %1, %1, 3*13*13
    sar    %1, 11
%endmacro
40
; Scale a DC coefficient with rounding:
;     %1 = (%1 * 13*13 + 0x200) >> 10      (arithmetic shift)
; %1 is a general-purpose register that is updated in place.
; The arithmetic flags are clobbered.
%macro IDCT_DC_ROUND 1
    imul   %1, %1, 13*13
    add    %1, 1 << 9                   ; 0x200 = half of the 1 << 10 divisor
    sar    %1, 10
%endmacro
46
; rv34_idct <suffix>
; Emits the function rv34_idct_<suffix> (the final symbol name is produced by
; cglobal). The function reads the first 16-bit coefficient of the block in
; r0, scales it with whichever macro IDCT_DC names at expansion time, and
; overwrites the 32 bytes at r0 (sixteen 16-bit words) with the scaled value.
;
; NOTE(review): the C prototype is presumably void f(int16_t *block);
; confirm against the declaration on the C side.
%macro rv34_idct 1
cglobal rv34_idct_%1, 1, 2, 0, block, dc
    movsx   dcq, word [blockq]          ; sign-extended DC coefficient
    IDCT_DC dcq
    movd    m0, dcd
    pshufw  m0, m0, 0                   ; replicate word 0 into all four words
%assign %%off 0
%rep 4                                  ; four 8-byte stores: offsets 0, 8, 16, 24
    movq    [blockq+%%off], m0
%assign %%off %%off+8
%endrep
    REP_RET
%endmacro

; IDCT_DC is re-pointed before each expansion, so the same macro body yields
; one rounding and one non-rounding variant.
INIT_MMX mmxext
%define IDCT_DC IDCT_DC_ROUND
rv34_idct dc
%define IDCT_DC IDCT_DC_NOROUND
rv34_idct dc_noround
65
; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
;
; Scales dc with IDCT_DC_ROUND and adds it, with unsigned saturation, to four
; rows of four pixels starting at dst (movh is a 4-byte move under INIT_MMX).
; MMX has no signed-add-to-unsigned-byte instruction, so the DC is split into
; max(dc, 0) and max(-dc, 0), each clamped to 255; the first is added and the
; second subtracted.
;
; NOTE(review): stride is used as a full-width register in the address
; computations, so it has to arrive as a pointer-sized, sign-extended value.
; The rv34_idct_add prototype in this file uses ptrdiff_t for its stride;
; confirm that the C prototype of this function does as well.
INIT_MMX mmx
cglobal rv34_idct_dc_add, 3, 3, 0, dst, stride, dc
    ; scaled DC in word 0 of m0, its negation in word 0 of m1
    IDCT_DC_ROUND dcq
    movd       m0, dcd
    pxor       m1, m1
    psubw      m1, m0

    ; m0 = max(dc, 0) clamped to 255, replicated into the low four bytes
    packuswb   m0, m0
    punpcklbw  m0, m0
    punpcklwd  m0, m0

    ; m1 = max(-dc, 0) clamped to 255, replicated into the low four bytes
    packuswb   m1, m1
    punpcklbw  m1, m1
    punpcklwd  m1, m1

    ; dc has been consumed: reuse its register as a pointer to row 2
    DEFINE_ARGS dst, stride, dst2
    lea        dst2q, [dstq+strideq*2]

    ; read all four rows before any of them is written back
    movh       m2, [dstq]
    movh       m3, [dstq+strideq]
    movh       m4, [dst2q]
    movh       m5, [dst2q+strideq]

    ; pixel = sat(sat(pixel + max(dc, 0)) - max(-dc, 0))
    paddusb    m2, m0
    psubusb    m2, m1
    paddusb    m3, m0
    psubusb    m3, m1
    paddusb    m4, m0
    psubusb    m4, m1
    paddusb    m5, m0
    psubusb    m5, m1

    movh       [dstq], m2
    movh       [dstq+strideq], m3
    movh       [dst2q], m4
    movh       [dst2q+strideq], m5
    RET
100
; ROW_TRANSFORM block_ptr
; First pass of the 4x4 inverse transform.
; Loads the four 8-byte rows b0..b3 found at %1, overwrites those 32 bytes
; with zeros, and combines the rows element-wise (each of the four 16-bit
; lanes is processed independently):
;     z0 = 13*(b0 + b2)          z1 = 13*(b0 - b2)
;     z3 = 17*b1 +  7*b3         z2 =  7*b1 - 17*b3
; Products are formed with pmullw (low 16 bits are kept); sums and differences
; use the saturating paddsw/psubsw.
; Output: mm0 = z0+z3, mm4 = z1+z2, mm6 = z1-z2, mm7 = z0-z3,
;         mm5 = rounder (pd_512). mm1, mm2 and mm3 are clobbered.
%macro ROW_TRANSFORM  1
    ; fetch each row, then clear it in memory
    pxor        mm7, mm7
    mova        mm0, [%1+ 0*8]
    mova  [%1+ 0*8], mm7
    mova        mm1, [%1+ 1*8]
    mova  [%1+ 1*8], mm7
    mova        mm2, [%1+ 2*8]
    mova  [%1+ 2*8], mm7
    mova        mm3, [%1+ 3*8]
    mova  [%1+ 3*8], mm7

    ; even part: rows 0 and 2
    mova        mm6, [pw_row_coeffs+ 0] ; 13 in every lane
    mova        mm4, mm0
    paddsw      mm0, mm2                ; b0 + b2
    psubsw      mm4, mm2                ; b0 - b2
    pmullw      mm0, mm6                ; z0 = 13*(b0 + b2)
    pmullw      mm4, mm6                ; z1 = 13*(b0 - b2)

    ; odd part: rows 1 and 3
    mova        mm5, mm1
    mova        mm7, mm3
    pmullw      mm1, [pw_row_coeffs+ 8] ; b1*17
    pmullw      mm3, [pw_row_coeffs+ 8] ; b3*17
    pmullw      mm5, [pw_row_coeffs+16] ; b1* 7
    pmullw      mm7, [pw_row_coeffs+16] ; b3* 7
    paddsw      mm1, mm7                ; z3 = b1*17 + b3* 7
    psubsw      mm5, mm3                ; z2 = b1* 7 - b3*17

    ; butterfly
    mova        mm7, mm0
    paddsw      mm0, mm1                ; z0 + z3
    psubsw      mm7, mm1                ; z0 - z3
    mova        mm6, mm4
    paddsw      mm4, mm5                ; z1 + z2
    psubsw      mm6, mm5                ; z1 - z2

    ; rounder for the second pass
    mova        mm5, [pd_512]           ; 0x200 in both dwords
%endmacro
135
; COL_TRANSFORM dst_row, data, even_coeffs, odd_coeffs
; Second pass of the 4x4 inverse transform for one output line, used by
; ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block).
;   %1: 4-byte memory operand holding the destination pixels; it is read,
;       the transformed line is added to it, and the result is written back
;   %2: mm register with the four first-pass results c0..c3; clobbered
;   %3: pmaddwd multipliers for c0/c2 (pw_col_coeffs+ 0), register or memory
;   %4: pmaddwd multipliers for c1/c3 (pw_col_coeffs+ 8), register or memory
; Expects the rounder (0x200 in both dwords) in mm5, which is left untouched.
; Clobbers mm1, mm2 and mm3. In the comments "a | b" is low | high dword.
%macro COL_TRANSFORM  4
    ; split the line into odd and even inputs, then multiply-accumulate
    pshufw      mm3, %2, 11011101b   ; col. 1,3,1,3
    pshufw       %2, %2, 10001000b   ; col. 0,2,0,2
    pmaddwd     mm3, %4              ; 17*c1+ 7*c3 |  7*c1-17*c3 = z3 | z2
    pmaddwd      %2, %3              ; 13*c0+13*c2 | 13*c0-13*c2 = z0 | z1
    paddd        %2, mm5             ; add the rounder to z0 | z1

    ; dword-swapped copies give the operands of the two differences
    pshufw      mm2, mm3, 01001110b  ;    z2 | z3
    pshufw      mm1,  %2, 01001110b  ;    z1 | z0
    psubd       mm1, mm2             ; z1-z2 | z0-z3
    paddd        %2, mm3             ; z0+z3 | z1+z2

    ; scale back and narrow to four signed words
    psrad       mm1, 10
    psrad        %2, 10
    packssdw     %2, mm1             ; z0+z3, z1+z2, z1-z2, z0-z3

    ; add to the destination pixels and store with unsigned saturation
    pxor        mm2, mm2
    movd        mm3, %1
    punpcklbw   mm3, mm2             ; pixels zero-extended to words
    paddw        %2, mm3
    packuswb     %2, %2
    movd         %1, %2
%endmacro
; ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block);
;
; Full 4x4 inverse transform of block, added to the four rows of four pixels
; at dst, dst+stride, dst+2*stride and dst+3*stride with unsigned saturation.
; The 32 bytes at block are zeroed by ROW_TRANSFORM as a side effect.
INIT_MMX mmxext
cglobal rv34_idct_add, 3,3,0, d, s, b
    ROW_TRANSFORM       bq              ; lines in mm0, mm4, mm6, mm7; rounder in mm5

    ; mm0 still holds line 0, so the multipliers are read from memory here
    COL_TRANSFORM     [dq], mm0, [pw_col_coeffs+ 0], [pw_col_coeffs+ 8]

    ; line 0 is done: mm0 is free to cache the even multipliers
    mova               mm0, [pw_col_coeffs+ 0]
    COL_TRANSFORM  [dq+sq], mm4, mm0, [pw_col_coeffs+ 8]

    ; line 1 is done: mm4 is free to cache the odd multipliers
    mova               mm4, [pw_col_coeffs+ 8]
    lea                 dq, [dq + 2*sq] ; advance to row 2
    COL_TRANSFORM     [dq], mm6, mm0, mm4
    COL_TRANSFORM  [dq+sq], mm7, mm0, mm4

    ; Return through x86inc's RET, like the other cglobal functions in this
    ; file, so that whatever epilogue the prologue requires is emitted.
    RET
168
; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
;
; Scales dc with IDCT_DC_ROUND and adds it to four rows of four pixels
; starting at dst. The pixels are widened to words, the DC is added with
; paddw, and packuswb saturates the sums back to unsigned bytes.
;
; NOTE(review): stride is used as a full-width register in the address
; computations, so it has to arrive as a pointer-sized, sign-extended value.
; The rv34_idct_add prototype in this file uses ptrdiff_t for its stride;
; confirm that the C prototype of this function does as well.
INIT_XMM sse4
cglobal rv34_idct_dc_add, 3, 3, 6, dst, stride, dc
    ; calculate DC and replicate it into all eight words of m0
    IDCT_DC_ROUND dcq
    movd       m0, dcd
    pshuflw    m0, m0, 0
    punpcklqdq m0, m0

    ; dc has been consumed: reuse its register as a pointer to row 2
    DEFINE_ARGS dst, stride, dst2
    lea        dst2q, [dstq+strideq*2]

    ; load data: four 4-byte rows
    pxor       m1, m1
    movd       m2, [dstq]
    movd       m3, [dstq+strideq]
    movd       m4, [dst2q]
    movd       m5, [dst2q+strideq]

    ; rows 0,1 -> m2 and rows 2,3 -> m4, zero-extended to words
    punpckldq  m2, m3
    punpckldq  m4, m5
    punpcklbw  m2, m1
    punpcklbw  m4, m1

    ; add DC and saturate back to bytes: one dword per row in m2
    paddw      m2, m0
    paddw      m4, m0
    packuswb   m2, m4

    ; store the four rows
    movd      [dstq], m2
    pextrd [dstq+strideq], m2, 1
    pextrd    [dst2q], m2, 2
    pextrd [dst2q+strideq], m2, 3
    RET
197