/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <assert.h> /* for ASSERT_ALIGNED() when DEBUG is defined */

#include "libavutil/mem.h"

#ifdef DEBUG
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif

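/*
 * The three kernels below implement the H.264 6-tap half-sample filter
 * with coefficients (1, -5, 20, 20, -5, 1).  A single filtering pass
 * rounds with (x + 16) >> 5; the combined horizontal+vertical kernel
 * stores unrounded 16-bit intermediates in tmp[] and rounds the final
 * result with (x + 512) >> 10.  OP_U8_ALTIVEC is supplied by the file
 * that includes this template (typically a plain store for the put_
 * variants and an average with the destination for the avg_ variants).
 */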
/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
              psumA, psumB, sumA, sumB;

    vec_u8 sum, fsum;

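    /*
     * vec_ld can only load from 16-byte aligned addresses, so each row is
     * assembled from two aligned loads (srcR1/srcR2) realigned with
     * vec_perm and the vec_lvsl permute vectors computed above.  When
     * (src - 2) sits at offset 11..15 within its aligned block, the
     * rightmost taps spill over into a third aligned block, so those
     * switch cases load srcR3 and permute from srcR2/srcR3 instead; a tap
     * that lands exactly on a block boundary is simply srcR2 itself.
     */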
    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

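        /*
         * Zero-extend the six taps to 16 bits (the A halves cover pixels
         * 0-7, the B halves pixels 8-15) and evaluate
         *   (20*(P0+P1) - 5*(M1+P2) + (M2+P3) + 16) >> 5
         * before saturating back down to unsigned bytes.
         */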
        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif

/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

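    /*
     * Because srcStride is a multiple of 16, every row shares src's
     * alignment, so the single permute vector from vec_lvsl(0, src)
     * realigns all of the rows loaded below.
     */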
    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
              psumA, psumB, sumA, sumB,
              srcP3ssA, srcP3ssB,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, fsum, srcP3a, srcP3b, srcP3;

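    /*
     * Six-row sliding window: each iteration loads row P3, evaluates
     * (20*(P0+P1) - 5*(M1+P2) + (M2+P3) + 16) >> 5 for that output row,
     * then shifts the window down (M2 <- M1 <- P0 <- P1 <- P2 <- P3).
     */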
    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif

/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
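    /* tmpbis remembers the start of tmp so the second (vertical) pass can
     * re-read the intermediates written by the first (horizontal) pass. */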
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
              tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
              tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
              pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
              pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
              ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum;
    vec_s16 ssume, ssumo;

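    /*
     * First pass: run the horizontal 6-tap filter over 16+5 = 21 rows
     * (two above and three below the 16x16 block) and store the unrounded
     * 16-bit intermediates in tmp.  The second pass then filters tmp
     * vertically in 32-bit precision and rounds with (x + 512) >> 10.
     */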
    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

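        /*
         * Same unpack and horizontal filter as above, but the +16 bias and
         * the >> 5 are deferred: the unrounded value
         * 20*(P0+P1) - 5*(M1+P2) + (M2+P3) is stored to tmp so the
         * vertical pass can round the combined result in one step.
         */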
        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

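    /* Second pass: prime the six-row window of 16-bit intermediates from tmp. */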
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

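    /*
     * The vertical pass needs more than 16 bits of headroom, so products
     * are formed in 32 bits: vec_mule/vec_mulo give the even/odd lanes of
     * 20*(P0+P1) and 5*(M1+P2), while the (M2+P3) term is sign-extended
     * with an arithmetic >> 16 of the same register viewed as s32 (even
     * lanes) and a multiply by 1 (odd lanes).  After adding the +512 bias
     * and shifting right by 10, the even/odd halves are packed and
     * re-interleaved with mperm to restore pixel order.
     */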
    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif
