/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

//#define DEBUG_ALIGNMENT
#ifdef DEBUG_ALIGNMENT
/* assert that the low four address bits are clear, i.e. ptr is 16-byte aligned */
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif

/* this code assumes that stride % 16 == 0 */

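/* CHROMA_MC8_ALTIVEC_CORE emits one row of the four-tap bilinear chroma
 * interpolation: psum = A*src[0] + B*src[1] + C*src[stride] + D*src[stride+1]
 * plus a rounding bias (BIAS1 seeds the multiply-add chain, BIAS2 is applied
 * afterwards), shifted right by 6, packed back to bytes and merged into the
 * destination through OP_U8_ALTIVEC (put or avg, supplied by the file that
 * includes this template).  The bottom source row of one iteration becomes
 * the top row of the next (vsrc0ssH/vsrc1ssH are rotated), so the caller only
 * loads one new line per output row. */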
#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
        psum = vec_mladd(vB, vsrc1ssH, psum);\
        psum = vec_mladd(vC, vsrc2ssH, psum);\
        psum = vec_mladd(vD, vsrc3ssH, psum);\
        psum = BIAS2(psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        vsrc0ssH = vsrc2ssH;\
        vsrc1ssH = vsrc3ssH;\
\
        dst += stride;\
        src += stride;

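/* CHROMA_MC8_ALTIVEC_CORE_SIMPLE is the two-tap special case used when x == 0
 * or y == 0, i.e. when two of the four bilinear weights are zero: the caller
 * folds the two remaining weights into vA and vE (vE = vB + vC, one of which
 * is zero), so each output pixel depends on only two source samples. */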
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vE, vsrc1ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        dst += stride;\
        src += stride;

#define noop(a) a
#define add28(a) vec_add(v28ss, a)

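/* 8x8 chroma motion compensation at eighth-pel offset (x, y).  As a point of
 * reference, a minimal scalar sketch of what the vector code computes
 * (assuming the "put" flavour of OP_U8_ALTIVEC; the "avg" flavour also
 * averages with the previous dst contents):
 *
 *     const int A = (8 - x) * (8 - y), B = x * (8 - y),
 *               C = (8 - x) * y,       D = x * y;
 *     for (int i = 0; i < h; i++) {
 *         for (int j = 0; j < 8; j++)
 *             dst[j] = (A * src[j]          + B * src[j + 1] +
 *                       C * src[j + stride] + D * src[j + stride + 1] + 32) >> 6;
 *         dst += stride;
 *         src += stride;
 *     }
 *
 * loadSecond / reallyBadAlign handle unaligned src: a second aligned vector is
 * loaded when the nine needed bytes straddle a 16-byte boundary, and when
 * src % 16 == 15 the data starting at src + 1 is exactly that second aligned
 * vector, so it is used directly instead of going through vec_perm. */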
static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                    int stride, int h, int x, int y) {
    POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);

    if (ABCD[3]) {
        if (!loadSecond) {// -> !reallyBadAlign
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        } else {
            vec_u8 vsrcDuc;
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrcDuc = vec_ld(stride + 16, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                if (reallyBadAlign)
                    vsrc3uc = vsrcDuc;
                else
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        }
    } else {
        const vec_s16 vE = vec_add(vB, vC);
        if (ABCD[2]) { // x == 0 B == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            }
        } else { // y == 0 C == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    if (reallyBadAlign)
                        vsrc1uc = vsrcDuc;
                    else
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            }
        }
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
}

/* this code assumes that stride % 16 == 0 */
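/* No-rounding VC-1 variant of the 8x8 bilinear chroma filter above.  The
 * difference is the rounding: the multiply-add chain starts from zero and
 * v28ss (= 32 - 4) is added before the shift, i.e.
 * (A*src[j] + B*src[j+1] + C*src[j+stride] + D*src[j+stride+1] + 28) >> 6,
 * and the x == 0 / y == 0 fast paths of the H.264 version are not
 * implemented here. */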
static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vec_u16 v6us  = vec_splat_u16(6);
    register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);

    if (!loadSecond) {// -> !reallyBadAlign
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    } else {
        vec_u8 vsrcDuc;
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrcDuc = vec_ld(stride + 16, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            if (reallyBadAlign)
                vsrc3uc = vsrcDuc;
            else
                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    }
}

#undef noop
#undef add28
#undef CHROMA_MC8_ALTIVEC_CORE

/* this code assumes stride % 16 == 0 */
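/* Horizontal 6-tap luma half-pel filter over a 16-wide block.  Per output
 * pixel this is the usual H.264 (1, -5, 20, 20, -5, 1) filter; a scalar
 * sketch of one sample (assuming the "put" flavour of OP_U8_ALTIVEC):
 *
 *     int sum = src[-2] - 5 * src[-1] + 20 * src[0]
 *             + 20 * src[1] - 5 * src[2] + src[3];
 *     dst[0] = av_clip_uint8((sum + 16) >> 5);
 *
 * The switch on "align" below exists so that the third aligned load (srcR3)
 * and its permutes are only issued when the 21 source bytes of a row really
 * spill into a third 16-byte block, and so that a shifted source vector that
 * happens to start on a 16-byte boundary reuses an aligned load directly. */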
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
              psumA, psumB, sumA, sumB;

    vec_u8 sum, vdst, fsum;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
}

/* this code assumes stride % 16 == 0 */
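/* Vertical counterpart of the horizontal filter above: the same
 * (1, -5, 20, 20, -5, 1) taps are applied down each column, so every output
 * row needs six input rows.  The five most recent rows are kept in registers
 * and shifted down one position (srcM2 <- srcM1 <- srcP0 <- srcP1 <- srcP2
 * <- srcP3) each iteration, so only a single new row is loaded per output
 * row. */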
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);

    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
              psumA, psumB, sumA, sumB,
              srcP3ssA, srcP3ssB,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
}

/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
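/* Combined horizontal+vertical filter for the centre half-pel position.  The
 * first loop runs the horizontal 6-tap filter over 21 rows (16 output rows
 * plus the 5 extra rows the vertical filter needs) and stores the unrounded,
 * unshifted 16-bit intermediates in tmp; the second loop then applies the
 * vertical 6-tap filter to tmp in 32-bit precision (vec_mule/vec_mulo produce
 * the even/odd 32-bit products), adds the 512 rounding term and shifts right
 * by 10 before clipping and packing back to bytes. */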
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
              tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
              tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
              pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
              pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
              ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum, vdst;
    vec_s16 ssume, ssumo;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
}
