/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifdef DEBUG
/* assert that ptr is 16-byte aligned (low four address bits clear) */
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif

/* this code assumes that stride % 16 == 0 */

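/* One row of the 8-pixel wide bilinear chroma filter:
 * psum = vA*row0[x] + vB*row0[x+1] + vC*row1[x] + vD*row1[x+1] (+ BIAS1),
 * then BIAS2() is applied, the sum is shifted right by 6, packed to bytes
 * and merged into the aligned 16-byte store at dst via fperm.  row1 is
 * reused as row0 for the next iteration and both pointers advance by
 * stride. */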
#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
        psum = vec_mladd(vB, vsrc1ssH, psum);\
        psum = vec_mladd(vC, vsrc2ssH, psum);\
        psum = vec_mladd(vD, vsrc3ssH, psum);\
        psum = BIAS2(psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        vsrc0ssH = vsrc2ssH;\
        vsrc1ssH = vsrc3ssH;\
\
        dst += stride;\
        src += stride;

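/* Two-tap variant used when x == 0 or y == 0: only weights vA and
 * vE (= vB + vC) remain, with the usual +32 rounding and shift by 6. */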
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vE, vsrc1ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        dst += stride;\
        src += stride;

#define noop(a) a
#define add28(a) vec_add(v28ss, a)

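/* 8xh H.264 chroma motion compensation: bilinear weights
 * A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y, rounded as
 * (sum + 32) >> 6.  Misaligned source rows are assembled with
 * vec_ld/vec_lvsl/vec_perm; the destination is read, merged via fperm
 * and written back as a full aligned 16-byte store. */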
#ifdef PREFIX_h264_chroma_mc8_altivec
static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                    int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16 v6us = vec_splat_u16(6);
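    /* loadSecond: the 9 source bytes of a row straddle two 16-byte
     * blocks, so a second aligned load is needed.  reallyBadAlign:
     * src % 16 == 15, where vec_lvsl(1, src) wraps to offset 0 and would
     * select from the first block, so the second block is used as-is. */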
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

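    /* dst is presumably at least 8-byte aligned: fperm keeps the
     * untouched half of the aligned 16-byte destination vector and puts
     * the 8 packed result bytes into the half dst actually points to. */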
    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);

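    /* D = x*y != 0 needs the full four-tap core; otherwise x == 0 or
     * y == 0 and the filter degenerates to two taps (vA and vE = vB + vC)
     * handled by the SIMPLE core. */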
    if (ABCD[3]) {
        if (!loadSecond) {// -> !reallyBadAlign
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        } else {
            vec_u8 vsrcDuc;
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrcDuc = vec_ld(stride + 16, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                if (reallyBadAlign)
                    vsrc3uc = vsrcDuc;
                else
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        }
    } else {
        const vec_s16 vE = vec_add(vB, vC);
        if (ABCD[2]) { // x == 0 B == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            }
        } else { // y == 0 C == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    if (reallyBadAlign)
                        vsrc1uc = vsrcDuc;
                    else
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            }
        }
    }
}
#endif

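/* Same bilinear filter, but with the VC-1 "no rounding" bias: nothing is
 * added before the multiplies and 28 (= 32 - 4) is added afterwards,
 * i.e. (A*a + B*b + C*c + D*d + 28) >> 6. */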
/* this code assumes that stride % 16 == 0 */
#ifdef PREFIX_no_rnd_vc1_chroma_mc8_altivec
static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vec_u16 v6us  = vec_splat_u16(6);
    register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);

    if (!loadSecond) {// -> !reallyBadAlign
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    } else {
        vec_u8 vsrcDuc;
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrcDuc = vec_ld(stride + 16, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            if (reallyBadAlign)
                vsrc3uc = vsrcDuc;
            else
                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    }
}
#endif

#undef noop
#undef add28
#undef CHROMA_MC8_ALTIVEC_CORE

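/* 16xh horizontal 6-tap luma filter with taps (1, -5, 20, 20, -5, 1):
 * clip((srcM2 - 5*srcM1 + 20*srcP0 + 20*srcP1 - 5*srcP2 + srcP3 + 16) >> 5),
 * as used for H.264 half-pel interpolation.  Each row is processed as two
 * vec_s16 halves (A and B) so the arithmetic stays within 16 bits. */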
/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
              psumA, psumB, sumA, sumB;

    vec_u8 sum, vdst, fsum;

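    /* Each row needs bytes src-2 .. src+18, normally assembled from two
     * aligned loads (srcR1, srcR2) with vec_perm.  For align >= 11 one of
     * the six shifted vectors starts exactly on the srcR2 boundary (its
     * lvsl permute wraps to 0 and would select from the wrong operand),
     * so it is taken as-is, and for align >= 12 the later vectors spill
     * into a third block (srcR3). */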
    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif

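/* 16x16 vertical 6-tap luma filter with the same (1, -5, 20, 20, -5, 1)
 * kernel and (sum + 16) >> 5 rounding.  The five previous rows are kept
 * in registers as a sliding window, so each iteration only loads the new
 * bottom row (srcP3). */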
/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
              psumA, psumB, sumA, sumB,
              srcP3ssA, srcP3ssB,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif

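/* Combined horizontal + vertical 6-tap filter: the first pass runs the
 * horizontal filter over 21 rows (16 output rows plus the 5 extra rows
 * the vertical filter needs) and stores the unrounded 16-bit
 * intermediates in tmp; the second pass filters those intermediates
 * vertically in 32-bit precision, adds the 512 rounding bias and shifts
 * right by 10. */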
/* this code assumes stride % 16 == 0 *and* that tmp is properly aligned */
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
              tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
              tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
              pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
              pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
              ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum, vdst;
    vec_s16 ssume, ssumo;

    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

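    /* Vertical pass over the 16-bit intermediates: the products no longer
     * fit in 16 bits, so the 20x and 5x terms are widened with vec_mule /
     * vec_mulo (even/odd 32-bit products) and the even/odd halves are
     * interleaved back into pixel order with mperm after packing. */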
    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

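        /* Widen sum3 (the unit taps) to 32 bits: on big-endian AltiVec,
         * an arithmetic shift right by 16 of the vector viewed as vec_s32
         * yields the sign-extended even elements, and vec_mulo(sum3, 1)
         * yields the odd ones. */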
        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif