1;******************************************************************************
2;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders
3;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
4;*
5;* This file is part of Libav.
6;*
7;* Libav is free software; you can redistribute it and/or
8;* modify it under the terms of the GNU Lesser General Public
9;* License as published by the Free Software Foundation; either
10;* version 2.1 of the License, or (at your option) any later version.
11;*
12;* Libav is distributed in the hope that it will be useful,
13;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15;* Lesser General Public License for more details.
16;*
17;* You should have received a copy of the GNU Lesser General Public
18;* License along with Libav; if not, write to the Free Software
19;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20;******************************************************************************
21
22%include "x86inc.asm"
23%include "x86util.asm"
24
25SECTION .text
26
;-----------------------------------------------------------------------------
; IDCT_DC_NOROUND reg
; Scales a DC coefficient in place, without a rounding term:
;     reg = (reg * 13*13*3) >> 11        (arithmetic shift)
; In/Out: %1 = general-purpose register holding the signed DC value
; Clobbers: flags
;-----------------------------------------------------------------------------
%macro IDCT_DC_NOROUND 1
    imul    %1, %1, 13*13*3
    sar     %1, 11
%endmacro
31
;-----------------------------------------------------------------------------
; IDCT_DC_ROUND reg
; Scales a DC coefficient in place, rounding to nearest:
;     reg = (reg * 13*13 + 0x200) >> 10  (arithmetic shift)
; In/Out: %1 = general-purpose register holding the signed DC value
; Clobbers: flags
;-----------------------------------------------------------------------------
%macro IDCT_DC_ROUND 1
    imul    %1, %1, 13*13
    add     %1, 1 << 9                  ; 0x200 = half of the 1 << 10 divisor
    sar     %1, 10
%endmacro
37
;-----------------------------------------------------------------------------
; rv34_idct <suffix>
; Emits the function rv34_idct_<suffix>_mmx2, which takes one argument:
;     r0 = pointer to a block of 16-bit values
; The function reads the signed 16-bit value at [r0], scales it with whatever
; macro IDCT_DC is %define'd to at instantiation time, and writes the low
; 16 bits of the result to all 16 words of the 32 bytes starting at r0.
; Uses r1 as scratch and m0 as the broadcast register.
;-----------------------------------------------------------------------------
%macro rv34_idct 1
cglobal rv34_idct_%1_mmx2, 1, 2, 0
    ; A sign-extended 16-bit input times the IDCT_DC constant fits easily in
    ; 32 bits, so the scalar math is done on the 32-bit register view.
    movsx   r1d, word [r0]
    IDCT_DC r1d
    ; movd is the 32-bit move: name the 32-bit register explicitly instead of
    ; relying on the assembler to accept (and re-encode) a 64-bit operand.
    movd    m0, r1d
    pshufw  m0, m0, 0                   ; broadcast word 0 to all four words
    movq    [r0+ 0], m0
    movq    [r0+ 8], m0
    movq    [r0+16], m0
    movq    [r0+24], m0
    REP_RET
%endmacro

INIT_MMX
; DC-only transform with rounding -> rv34_idct_dc_mmx2
%define IDCT_DC IDCT_DC_ROUND
rv34_idct dc
; DC-only transform without rounding -> rv34_idct_dc_noround_mmx2
%define IDCT_DC IDCT_DC_NOROUND
rv34_idct dc_noround
56
;-----------------------------------------------------------------------------
; void ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
;
; In:  r0 = dst, r1 = stride, r2 = dc
; Scales dc with IDCT_DC_ROUND and adds it, with unsigned byte saturation,
; to four bytes in each of the four rows dst, dst+stride, dst+2*stride and
; dst+3*stride.
; Uses m0-m5; r2 is reused as a row pointer once the DC has been consumed.
;-----------------------------------------------------------------------------
INIT_MMX
cglobal rv34_idct_dc_add_mmx, 3, 3
    ; stride is a C int but is used as a full-width index below; the calling
    ; convention does not guarantee the upper half of the register on x86-64.
    movsxdifnidn r1, r1d

    ; calculate DC (32-bit math: dc is an int, and only the low word is used)
    IDCT_DC_ROUND r2d
    pxor       m1, m1
    movd       m0, r2d
    psubw      m1, m0                   ; m1 = -dc
    packuswb   m0, m0                   ; low byte of m0 = clamp( dc, 0, 255)
    packuswb   m1, m1                   ; low byte of m1 = clamp(-dc, 0, 255)
    punpcklbw  m0, m0                   ; replicate the low byte across the
    punpcklbw  m1, m1                   ; low four bytes of each register
    punpcklwd  m0, m0
    punpcklwd  m1, m1

    ; add DC: a saturating add of the positive part followed by a saturating
    ; subtract of the negative part (one of the two is always zero)
    lea        r2, [r0+r1*2]            ; r2 = dst + 2*stride
    movh       m2, [r0]
    movh       m3, [r0+r1]
    movh       m4, [r2]
    movh       m5, [r2+r1]
    paddusb    m2, m0
    paddusb    m3, m0
    paddusb    m4, m0
    paddusb    m5, m0
    psubusb    m2, m1
    psubusb    m3, m1
    psubusb    m4, m1
    psubusb    m5, m1
    movh       [r0], m2
    movh       [r0+r1], m3
    movh       [r2], m4
    movh       [r2+r1], m5
    RET
90
;-----------------------------------------------------------------------------
; void ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
;
; In:  r0 = dst, r1 = stride, r2 = dc
; Scales dc with IDCT_DC_ROUND and adds it to four bytes in each of the four
; rows dst, dst+stride, dst+2*stride and dst+3*stride. The sums are formed as
; 16-bit words and packed back to bytes with unsigned saturation.
; Uses m0-m5; r2 is reused as a row pointer once the DC has been consumed.
;-----------------------------------------------------------------------------
INIT_XMM
cglobal rv34_idct_dc_add_sse4, 3, 3, 6
    ; stride is a C int but is used as a full-width index below; the calling
    ; convention does not guarantee the upper half of the register on x86-64.
    movsxdifnidn r1, r1d

    ; calculate DC (32-bit math: dc is an int, and only the low word is used)
    IDCT_DC_ROUND r2d
    pxor       m1, m1                   ; m1 = 0, used to widen bytes to words
    movd       m0, r2d

    ; load the four rows, interleaved with broadcasting DC to all eight words
    lea        r2, [r0+r1*2]            ; r2 = dst + 2*stride
    movd       m2, [r0]
    movd       m3, [r0+r1]
    pshuflw    m0, m0, 0
    movd       m4, [r2]
    movd       m5, [r2+r1]
    punpcklqdq m0, m0
    punpckldq  m2, m3                   ; m2 = rows 0 and 1 (8 bytes)
    punpckldq  m4, m5                   ; m4 = rows 2 and 3 (8 bytes)
    punpcklbw  m2, m1                   ; zero-extend bytes to words
    punpcklbw  m4, m1

    ; add DC and pack back to bytes with unsigned saturation
    paddw      m2, m0
    paddw      m4, m0
    packuswb   m2, m4                   ; m2 = rows 0..3, one dword per row

    ; store one dword per row
    movd      [r0], m2
    pextrd [r0+r1], m2, 1
    pextrd    [r2], m2, 2
    pextrd [r2+r1], m2, 3
    RET
119