/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

//#define DEBUG_ALIGNMENT
#ifdef DEBUG_ALIGNMENT
#include <assert.h>
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif

/* this code assumes that stride % 16 == 0 */

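/* Inner loop body of the bilinear chroma filter: widen the two shifted copies
 * of the next source row to 16 bits, compute
 * (A*src0 + B*src1 + C*src2 + D*src3 + 32) >> 6, pack back to bytes and merge
 * the 8 result bytes into the destination vector through fperm before the
 * put/avg step (OP_U8_ALTIVEC) and the store. */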
#define CHROMA_MC8_ALTIVEC_CORE \
        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vB, vsrc1ssH, psum);\
        psum = vec_mladd(vC, vsrc2ssH, psum);\
        psum = vec_mladd(vD, vsrc3ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        vsrc0ssH = vsrc2ssH;\
        vsrc1ssH = vsrc3ssH;\
\
        dst += stride;\
        src += stride;

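/* Reduced loop body used when x == 0 or y == 0: only two filter taps are
 * non-zero (vA and vE = vB + vC, which collapses to whichever of B or C
 * remains), so a single pair of source vectors per row is enough. */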
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vE, vsrc1ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        dst += stride;\
        src += stride;

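/* 8x8 chroma motion compensation, H.264 bilinear filter:
 * dst[i] = (A*s[i] + B*s[i+1] + C*s[i+stride] + D*s[i+stride+1] + 32) >> 6
 * with A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y.
 * OP_U8_ALTIVEC provides the final put or avg into dst. */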
void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                    int stride, int h, int x, int y) {
    POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
    DECLARE_ALIGNED_16(signed int, ABCD[4]) =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16 v6us = vec_splat_u16(6);
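    /* The 9 source bytes needed per row can span one or two 16-byte blocks:
     * loadSecond is set when a second vec_ld is needed, and reallyBadAlign
     * flags src % 16 == 15, where the shift of vec_lvsl(1, src) wraps to 0
     * and vec_perm would return the first block, so the second block is
     * used directly instead. */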
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);

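    /* D = x*y is non-zero only when both x and y are non-zero; otherwise the
     * filter degenerates to two taps and the simpler core is used. */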
    if (ABCD[3]) {
        if (!loadSecond) { // -> !reallyBadAlign
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE
            }
        } else {
            vec_u8 vsrcDuc;
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrcDuc = vec_ld(stride + 16, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                if (reallyBadAlign)
                    vsrc3uc = vsrcDuc;
                else
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE
            }
        }
    } else {
        const vec_s16 vE = vec_add(vB, vC);
        if (ABCD[2]) { // x == 0 B == 0
            if (!loadSecond) { // -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            }
        } else { // y == 0 C == 0
            if (!loadSecond) { // -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    if (reallyBadAlign)
                        vsrc1uc = vsrcDuc;
                    else
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            }
        }
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
}

#undef CHROMA_MC8_ALTIVEC_CORE

/* this code assumes stride % 16 == 0 */
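/* Horizontal 6-tap luma lowpass filter (taps 1, -5, 20, 20, -5, 1) over a
 * 16-pixel-wide block: for each row, 20*(p0+p1) - 5*(m1+p2) + (m2+p3) is
 * rounded with +16, shifted right by 5 and saturated to 8 bits. */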
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, vdst, fsum;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
}

/* this code assumes stride % 16 == 0 */
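/* Vertical 6-tap luma lowpass filter: the same (1, -5, 20, 20, -5, 1) kernel
 * applied down each column, keeping the five previous rows in registers and
 * rolling them forward once per output row. */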
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);

    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
}

/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
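/* Combined 2D (horizontal then vertical) 6-tap filter: the first pass stores
 * unrounded 16-bit intermediates for 16+5 rows into tmp, the second pass
 * filters them vertically with 32-bit accumulation, rounds with +512 and
 * shifts right by 10 before packing. */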
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum, vdst;
    vec_s16 ssume, ssumo;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

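    /* Second pass: run the same 6-tap kernel vertically over the 16-bit
     * intermediates in tmp.  vec_mule/vec_mulo widen the products to 32 bits
     * so the accumulation, +512 rounding and >>10 happen without overflow. */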
602
603    tmpM2ssA = vec_ld(0, tmpbis);
604    tmpM2ssB = vec_ld(16, tmpbis);
605    tmpbis += tmpStride;
606    tmpM1ssA = vec_ld(0, tmpbis);
607    tmpM1ssB = vec_ld(16, tmpbis);
608    tmpbis += tmpStride;
609    tmpP0ssA = vec_ld(0, tmpbis);
610    tmpP0ssB = vec_ld(16, tmpbis);
611    tmpbis += tmpStride;
612    tmpP1ssA = vec_ld(0, tmpbis);
613    tmpP1ssB = vec_ld(16, tmpbis);
614    tmpbis += tmpStride;
615    tmpP2ssA = vec_ld(0, tmpbis);
616    tmpP2ssB = vec_ld(16, tmpbis);
617    tmpbis += tmpStride;
618
619    for (i = 0 ; i < 16 ; i++) {
620        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
621        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);
622
623        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
624        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
625        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
626        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
627        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
628        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
629
630        tmpbis += tmpStride;
631
632        tmpM2ssA = tmpM1ssA;
633        tmpM2ssB = tmpM1ssB;
634        tmpM1ssA = tmpP0ssA;
635        tmpM1ssB = tmpP0ssB;
636        tmpP0ssA = tmpP1ssA;
637        tmpP0ssB = tmpP1ssB;
638        tmpP1ssA = tmpP2ssA;
639        tmpP1ssB = tmpP2ssB;
640        tmpP2ssA = tmpP3ssA;
641        tmpP2ssB = tmpP3ssB;
642
643        pp1Ae = vec_mule(sum1A, v20ss);
644        pp1Ao = vec_mulo(sum1A, v20ss);
645        pp1Be = vec_mule(sum1B, v20ss);
646        pp1Bo = vec_mulo(sum1B, v20ss);
647
648        pp2Ae = vec_mule(sum2A, v5ss);
649        pp2Ao = vec_mulo(sum2A, v5ss);
650        pp2Be = vec_mule(sum2B, v5ss);
651        pp2Bo = vec_mulo(sum2B, v5ss);
652
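        /* sum3 only needs widening: the even 16-bit elements are extracted
         * with an arithmetic shift right by 16 of the same bits viewed as
         * 32-bit words, the odd ones via a multiply by 1. */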
        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
}