/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mem.h"

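/*
 * AltiVec 8xH chroma motion compensation template.  The including file
 * is expected to define PREFIX_h264_chroma_mc8_altivec /
 * PREFIX_no_rnd_vc1_chroma_mc8_altivec and OP_U8_ALTIVEC (put- or
 * avg-style store) before including this file.  Each output pixel is
 * the bilinear blend
 *     (A * a + B * b + C * c + D * d + bias) >> 6
 * with A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y derived from
 * the fractional position (x, y); the bias is 32 for H.264 and 28 for
 * the VC-1 no-rounding variant below.
 */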
/* this code assumes that stride % 16 == 0 */

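/*
 * Process one destination row with the full four-tap core: widen the
 * two freshly loaded source vectors to 16 bits, accumulate the four
 * weighted taps on top of BIAS1, apply BIAS2, shift right by 6, then
 * pack the eight result bytes and merge them into the aligned
 * destination vector through fperm before storing with OP_U8_ALTIVEC.
 * The bottom row is kept as the next iteration's top row and src/dst
 * advance by one line.
 */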
#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
        psum = vec_mladd(vB, vsrc1ssH, psum);\
        psum = vec_mladd(vC, vsrc2ssH, psum);\
        psum = vec_mladd(vD, vsrc3ssH, psum);\
        psum = BIAS2(psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        vsrc0ssH = vsrc2ssH;\
        vsrc1ssH = vsrc3ssH;\
\
        dst += stride;\
        src += stride;

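/*
 * Two-tap variant used when x == 0 or y == 0: only vA and the combined
 * weight vE = vB + vC (only one of the two is nonzero) are applied to
 * the two neighbouring samples before the usual bias, shift, pack and
 * store.
 */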
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vE, vsrc1ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        dst += stride;\
        src += stride;

#define noop(a) a
#define add28(a) vec_add(v28ss, a)

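/*
 * H.264 8xH chroma MC: every output pixel is
 *     dst[i] = (A * a + B * b + C * c + D * d + 32) >> 6
 * where a..d are the four source samples surrounding the fractional
 * position and A..D are the weights computed below.
 */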
#ifdef PREFIX_h264_chroma_mc8_altivec
static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                    int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16 v6us = vec_splat_u16(6);
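    /* src need not be 16-byte aligned: loadSecond means the row spills
     * into a second 16-byte block, and reallyBadAlign (src % 16 == 15)
     * means vec_lvsl(1, src) wraps back to the start of a block, so the
     * second load is used directly instead of a vec_perm in that case. */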
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);

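    /* D != 0 (both x and y nonzero) needs the full four-tap core;
     * otherwise only two weights survive (C when x == 0, B when y == 0)
     * and the cheaper two-tap core with vE = vB + vC is sufficient. */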
    if (ABCD[3]) {
        if (!loadSecond) {// -> !reallyBadAlign
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        } else {
            vec_u8 vsrcDuc;
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrcDuc = vec_ld(stride + 16, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                if (reallyBadAlign)
                    vsrc3uc = vsrcDuc;
                else
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        }
    } else {
        const vec_s16 vE = vec_add(vB, vC);
        if (ABCD[2]) { // x == 0 B == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            }
        } else { // y == 0 C == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    if (reallyBadAlign)
                        vsrc1uc = vsrcDuc;
                    else
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            }
        }
    }
}
#endif

/* this code assumes that stride % 16 == 0 */
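/*
 * VC-1 no-rounding variant: same interpolation as above, but the bias
 * added before the >> 6 is 28 (v28ss = 32 - 4) instead of 32, applied
 * through the add28 wrapper after the four taps.
 */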
#ifdef PREFIX_no_rnd_vc1_chroma_mc8_altivec
static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vec_u16 v6us  = vec_splat_u16(6);
    register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);

    if (!loadSecond) {// -> !reallyBadAlign
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    } else {
        vec_u8 vsrcDuc;
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrcDuc = vec_ld(stride + 16, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            if (reallyBadAlign)
                vsrc3uc = vsrcDuc;
            else
                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    }
}
#endif

#undef noop
#undef add28
#undef CHROMA_MC8_ALTIVEC_CORE