/*
 * Copyright (c) 2008 Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * SSSE3-optimized version of (put|avg)_h264_chroma_mc8.
 * H264_CHROMA_MC8_TMPL must be defined to the desired function name,
 * H264_CHROMA_MC4_TMPL to the name of the 4-pixel-wide variant, and
 * H264_CHROMA_MC8_MV0 to a (put|avg)_pixels8 function.
 * AVG_OP must be defined to empty for put and to the identity for avg.
 */
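/*
 * Example instantiation (an illustrative sketch only; the including file
 * chooses the real names: put_pixels8_mmx and this_template_file.c below are
 * placeholders for whatever helper and file name that file actually uses):
 *
 *   #define AVG_OP(X)                 // empty: plain put
 *   #define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_ssse3
 *   #define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_ssse3
 *   #define H264_CHROMA_MC8_MV0  put_pixels8_mmx
 *   #include "this_template_file.c"
 *
 * For the avg variant, define AVG_OP(X) as X so the pavgb lines below are
 * emitted, and point the macros at avg_* names instead.
 */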
static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd)
{
    if(y==0 && x==0) {
        /* no filter needed */
        H264_CHROMA_MC8_MV0(dst, src, stride, h);
        return;
    }

    assert(x<8 && y<8 && x>=0 && y>=0);

    if(y==0 || x==0)
    {
        /* one-dimensional filter only */
        __asm__ volatile(
            "movd %0, %%xmm7 \n\t"
            "movq %1, %%xmm6 \n\t"
            "pshuflw $0, %%xmm7, %%xmm7 \n\t"
            "movlhps %%xmm6, %%xmm6 \n\t"
            "movlhps %%xmm7, %%xmm7 \n\t"
            :: "r"(255*(x+y)+8), "m"(*(rnd?&ff_pw_4:&ff_pw_3))
        );
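        /* xmm7 now holds {8-d, d} byte pairs (d = x+y, the single nonzero
         * offset) as the pmaddubsw coefficients; xmm6 holds eight copies of
         * the rounding bias added before the >>3 (4, or 3 when rnd == 0) */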

        if(x) {
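            /* y == 0: horizontal filter; each output byte is
             * ((8-x)*src[i] + x*src[i+1] + bias) >> 3, two rows per pass */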
            __asm__ volatile(
                "1: \n\t"
                "movq (%1), %%xmm0 \n\t"
                "movq 1(%1), %%xmm1 \n\t"
                "movq (%1,%3), %%xmm2 \n\t"
                "movq 1(%1,%3), %%xmm3 \n\t"
                "punpcklbw %%xmm1, %%xmm0 \n\t"
                "punpcklbw %%xmm3, %%xmm2 \n\t"
                "pmaddubsw %%xmm7, %%xmm0 \n\t"
                "pmaddubsw %%xmm7, %%xmm2 \n\t"
         AVG_OP("movq (%0), %%xmm4 \n\t")
         AVG_OP("movhps (%0,%3), %%xmm4 \n\t")
                "paddw %%xmm6, %%xmm0 \n\t"
                "paddw %%xmm6, %%xmm2 \n\t"
                "psrlw $3, %%xmm0 \n\t"
                "psrlw $3, %%xmm2 \n\t"
                "packuswb %%xmm2, %%xmm0 \n\t"
         AVG_OP("pavgb %%xmm4, %%xmm0 \n\t")
                "movq %%xmm0, (%0) \n\t"
                "movhps %%xmm0, (%0,%3) \n\t"
                "sub $2, %2 \n\t"
                "lea (%1,%3,2), %1 \n\t"
                "lea (%0,%3,2), %0 \n\t"
                "jg 1b \n\t"
                :"+r"(dst), "+r"(src), "+r"(h)
                :"r"((x86_reg)stride)
            );
        } else {
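            /* x == 0: vertical filter; each output byte is
             * ((8-y)*src[i] + y*src[i+stride] + bias) >> 3, two rows per pass */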
            __asm__ volatile(
                "1: \n\t"
                "movq (%1), %%xmm0 \n\t"
                "movq (%1,%3), %%xmm1 \n\t"
                "movdqa %%xmm1, %%xmm2 \n\t"
                "movq (%1,%3,2), %%xmm3 \n\t"
                "punpcklbw %%xmm1, %%xmm0 \n\t"
                "punpcklbw %%xmm3, %%xmm2 \n\t"
                "pmaddubsw %%xmm7, %%xmm0 \n\t"
                "pmaddubsw %%xmm7, %%xmm2 \n\t"
         AVG_OP("movq (%0), %%xmm4 \n\t")
         AVG_OP("movhps (%0,%3), %%xmm4 \n\t")
                "paddw %%xmm6, %%xmm0 \n\t"
                "paddw %%xmm6, %%xmm2 \n\t"
                "psrlw $3, %%xmm0 \n\t"
                "psrlw $3, %%xmm2 \n\t"
                "packuswb %%xmm2, %%xmm0 \n\t"
         AVG_OP("pavgb %%xmm4, %%xmm0 \n\t")
                "movq %%xmm0, (%0) \n\t"
                "movhps %%xmm0, (%0,%3) \n\t"
                "sub $2, %2 \n\t"
                "lea (%1,%3,2), %1 \n\t"
                "lea (%0,%3,2), %0 \n\t"
                "jg 1b \n\t"
                :"+r"(dst), "+r"(src), "+r"(h)
                :"r"((x86_reg)stride)
            );
        }
        return;
    }

    /* general case, bilinear */
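    /* xmm7 = {(8-x)*(8-y), x*(8-y)} byte pairs: weights of the two top taps,
     * xmm6 = {(8-x)*y,     x*y}     byte pairs: weights of the two bottom taps,
     * xmm5 = rounding bias added before the >>6 (32, or 28 when rnd == 0) */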
    __asm__ volatile(
        "movd %0, %%xmm7 \n\t"
        "movd %1, %%xmm6 \n\t"
        "movdqa %2, %%xmm5 \n\t"
        "pshuflw $0, %%xmm7, %%xmm7 \n\t"
        "pshuflw $0, %%xmm6, %%xmm6 \n\t"
        "movlhps %%xmm7, %%xmm7 \n\t"
        "movlhps %%xmm6, %%xmm6 \n\t"
        :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(*(rnd?&ff_pw_32:&ff_pw_28))
    );

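    /* Software-pipelined loop: xmm0 carries the interleaved {src[i], src[i+1]}
     * pairs of the previous source row, so each iteration loads two new rows,
     * writes two destination rows, and hands the last loaded row (xmm4) back
     * to xmm0 for the next iteration. */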
    __asm__ volatile(
        "movq (%1), %%xmm0 \n\t"
        "movq 1(%1), %%xmm1 \n\t"
        "punpcklbw %%xmm1, %%xmm0 \n\t"
        "add %3, %1 \n\t"
        "1: \n\t"
        "movq (%1), %%xmm1 \n\t"
        "movq 1(%1), %%xmm2 \n\t"
        "movq (%1,%3), %%xmm3 \n\t"
        "movq 1(%1,%3), %%xmm4 \n\t"
        "lea (%1,%3,2), %1 \n\t"
        "punpcklbw %%xmm2, %%xmm1 \n\t"
        "punpcklbw %%xmm4, %%xmm3 \n\t"
        "movdqa %%xmm1, %%xmm2 \n\t"
        "movdqa %%xmm3, %%xmm4 \n\t"
        "pmaddubsw %%xmm7, %%xmm0 \n\t"
        "pmaddubsw %%xmm6, %%xmm1 \n\t"
        "pmaddubsw %%xmm7, %%xmm2 \n\t"
        "pmaddubsw %%xmm6, %%xmm3 \n\t"
        "paddw %%xmm5, %%xmm0 \n\t"
        "paddw %%xmm5, %%xmm2 \n\t"
        "paddw %%xmm0, %%xmm1 \n\t"
        "paddw %%xmm2, %%xmm3 \n\t"
        "movdqa %%xmm4, %%xmm0 \n\t"
        "psrlw $6, %%xmm1 \n\t"
        "psrlw $6, %%xmm3 \n\t"
 AVG_OP("movq (%0), %%xmm2 \n\t")
 AVG_OP("movhps (%0,%3), %%xmm2 \n\t")
        "packuswb %%xmm3, %%xmm1 \n\t"
 AVG_OP("pavgb %%xmm2, %%xmm1 \n\t")
        "movq %%xmm1, (%0)\n\t"
        "movhps %%xmm1, (%0,%3)\n\t"
        "sub $2, %2 \n\t"
        "lea (%0,%3,2), %0 \n\t"
        "jg 1b \n\t"
        :"+r"(dst), "+r"(src), "+r"(h)
        :"r"((x86_reg)stride)
    );
}

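/**
 * 4-pixel-wide chroma MC. Always runs the full bilinear filter (no mv0 or
 * one-dimensional special cases) using MMX registers; pmaddubsw on mm
 * registers still requires SSSE3. There is no rnd parameter: the bias is
 * always 32.
 */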
static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    __asm__ volatile(
        "movd %0, %%mm7 \n\t"
        "movd %1, %%mm6 \n\t"
        "movq %2, %%mm5 \n\t"
        "pshufw $0, %%mm7, %%mm7 \n\t"
        "pshufw $0, %%mm6, %%mm6 \n\t"
        :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(ff_pw_32)
    );
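    /* mm7/mm6 hold the same top-/bottom-row weight pairs as in the 8-wide
     * case, broadcast with pshufw; mm5 holds four copies of the bias 32 */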

    __asm__ volatile(
        "movd (%1), %%mm0 \n\t"
        "punpcklbw 1(%1), %%mm0 \n\t"
        "add %3, %1 \n\t"
        "1: \n\t"
        "movd (%1), %%mm1 \n\t"
        "movd (%1,%3), %%mm3 \n\t"
        "punpcklbw 1(%1), %%mm1 \n\t"
        "punpcklbw 1(%1,%3), %%mm3 \n\t"
        "lea (%1,%3,2), %1 \n\t"
        "movq %%mm1, %%mm2 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pmaddubsw %%mm7, %%mm0 \n\t"
        "pmaddubsw %%mm6, %%mm1 \n\t"
        "pmaddubsw %%mm7, %%mm2 \n\t"
        "pmaddubsw %%mm6, %%mm3 \n\t"
        "paddw %%mm5, %%mm0 \n\t"
        "paddw %%mm5, %%mm2 \n\t"
        "paddw %%mm0, %%mm1 \n\t"
        "paddw %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm0 \n\t"
        "psrlw $6, %%mm1 \n\t"
        "psrlw $6, %%mm3 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "packuswb %%mm3, %%mm3 \n\t"
 AVG_OP("pavgb (%0), %%mm1 \n\t")
 AVG_OP("pavgb (%0,%3), %%mm3 \n\t")
        "movd %%mm1, (%0)\n\t"
        "movd %%mm3, (%0,%3)\n\t"
        "sub $2, %2 \n\t"
        "lea (%0,%3,2), %0 \n\t"
        "jg 1b \n\t"
        :"+r"(dst), "+r"(src), "+r"(h)
        :"r"((x86_reg)stride)
    );
}