/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/dsputil.h"

#include "gcc_fixes.h"

#include "dsputil_ppc.h"
#include "util_altivec.h"
#include "types_altivec.h"

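/* SAD of a 16-wide block against the horizontal half-pel interpolation of
 * pix2. Per pixel this is roughly
 *     s += abs(pix1[x] - ((pix2[x] + pix2[x + 1] + 1) >> 1)),
 * since vec_avg() rounds upwards ((a + b + 1) >> 1). */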
int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    DECLARE_ALIGNED_16(int, s);
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);
    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16] */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix2[0];
        pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

        tv = (vector unsigned char *) &pix2[1];
        pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

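/* Same idea as above, but against the vertical half-pel interpolation:
 * roughly s += abs(pix1[x] - ((pix2[x] + pix2[x + line_size] + 1) >> 1)),
 * with the pix3 row of one iteration reused as the pix2 row of the next. */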
int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    DECLARE_ALIGNED_16(int, s);
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);

    /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read each
       time around the loop.
       Read unaligned pixels into our vector. The vector is as follows:
       pix2v: pix2[0]-pix2[15] */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15] */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2v = pix3v;
        pix3 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;
}

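/* SAD against the 2D (x+y) half-pel interpolation of pix2. The reference
 * value per pixel is roughly
 *     (pix2[x] + pix2[x+1] + pix3[x] + pix3[x+1] + 2) >> 2
 * with pix3 = pix2 + line_size, i.e. a four-pixel average with +2 rounding. */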
int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    DECLARE_ALIGNED_16(int, s);
    uint8_t *pix3 = pix2 + line_size;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2);
    vector unsigned char *tv, avgv, t5;
    vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
    vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv;
    vector unsigned short t1, t2, t3, t4;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    s = 0;

    /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, as well
       as some splitting, and vector addition each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]  pix2iv: pix2[1]-pix2[16]
       Split the pixel vectors into shorts */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    tv = (vector unsigned char *) &pix2[1];
    pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

    pix2hv  = (vector unsigned short) vec_mergeh(zero, pix2v);
    pix2lv  = (vector unsigned short) vec_mergel(zero, pix2v);
    pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
    pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
    t1 = vec_add(pix2hv, pix2ihv);
    t2 = vec_add(pix2lv, pix2ilv);

    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]      pix3iv: pix3[1]-pix3[16] */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        tv = (vector unsigned char *) &pix3[1];
        pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));

        /* Note that AltiVec does have vec_avg, but this works on vector pairs
           and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
           would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
           Instead, we have to split the pixel vectors into vectors of shorts,
           and do the averaging by hand. */
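        /* Worked example of the rounding problem mentioned above:
           avg(avg(3,0), avg(0,1)) = avg(2,1) = 2, whereas
           (3 + 0 + 0 + 1 + 2) >> 2 = 1, which is the value we need. */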

        /* Split the pixel vectors into shorts */
        pix3hv  = (vector unsigned short) vec_mergeh(zero, pix3v);
        pix3lv  = (vector unsigned short) vec_mergel(zero, pix3v);
        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);

        /* Do the averaging on them */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);

        /* Pack the shorts back into a result */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix3 += line_size;
        /* Transfer the calculated values for pix3 into pix2 */
        t1 = t3;
        t2 = t4;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

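/* Plain 16xh SAD: roughly s += abs(pix1[x] - pix2[x]) over each line.
 * abs() is done as max - min on unsigned chars, then vec_sum4s()
 * accumulates the byte differences into four 32-bit partial sums. */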
int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    DECLARE_ALIGNED_16(int, s);
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    DECLARE_ALIGNED_16(int, s);
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

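/* Sum of squares of a 16x16 block: roughly s += pix[x] * pix[x] per pixel.
 * vec_msum() multiplies the bytes with themselves and accumulates the
 * products into four 32-bit partial sums in a single instruction. */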
int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    DECLARE_ALIGNED_16(int, s);
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char *tv;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;

    sv = (vector unsigned int)vec_splat_u32(0);

    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        tv = (vector unsigned char *) pix;
        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}

/**
 * Sum of Squared Errors for an 8x8 block.
 * AltiVec-enhanced.
 * It's the sad8_altivec code above with squaring added.
 */
int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    DECLARE_ALIGNED_16(int, s);
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2. */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

/**
 * Sum of Squared Errors for a 16x16 block.
 * AltiVec-enhanced.
 * It's the sad16_altivec code above with squaring added.
 */
int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    DECLARE_ALIGNED_16(int, s);
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /* Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2. */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

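/* Sum of all 256 pixels of a 16x16 block, accumulated with vec_sum4s()
 * and folded into a single integer with vec_sums(). */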
int pix_sum_altivec(uint8_t * pix, int line_size)
{
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm, *pixv;
    vector unsigned char t1;
    vector unsigned int sad;
    vector signed int sumdiffs;

    int i;
    DECLARE_ALIGNED_16(int, s);

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1 */
        perm = vec_lvsl(0, pix);
        pixv = (vector unsigned char *) pix;
        t1 = vec_perm(pixv[0], pixv[1], perm);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

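/* Copy an 8x8 block of unsigned bytes into a DCTELEM (16-bit) block,
 * i.e. roughly block[i*8 + x] = pixels[x] for each row; the widening is
 * done by merging the bytes with a zero vector. */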
void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector signed short shorts;

    for (i = 0; i < 8; i++) {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, pixels);
        pixv = (vector unsigned char *) pixels;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts = (vector signed short)vec_mergeh(zero, bytes);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts, i*16, (vector signed short*)block);

        pixels += line_size;
    }
}

void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
        const uint8_t *s2, int stride)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector signed short shorts1, shorts2;

    for (i = 0; i < 4; i++) {
        // Read potentially unaligned pixels
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;

        // The code below is a copy of the code above... This is a manual
        // unroll.

        // Read potentially unaligned pixels
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;
    }
}


static void clear_block_altivec(DCTELEM *block) {
    LOAD_ZERO;
    vec_st(zero_s16v,   0, block);
    vec_st(zero_s16v,  16, block);
    vec_st(zero_s16v,  32, block);
    vec_st(zero_s16v,  48, block);
    vec_st(zero_s16v,  64, block);
    vec_st(zero_s16v,  80, block);
    vec_st(zero_s16v,  96, block);
    vec_st(zero_s16v, 112, block);
}


void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
    register int i;
    register vector unsigned char vdst, vsrc;

    /* dst and src are 16-byte aligned (guaranteed) */
    for (i = 0; (i + 15) < w; i += 16) {
        vdst = vec_ld(i, (unsigned char*)dst);
        vsrc = vec_ld(i, (unsigned char*)src);
        vdst = vec_add(vsrc, vdst);
        vec_st(vdst, i, (unsigned char*)dst);
    }
    /* if w is not a multiple of 16, add the remaining bytes one by one */
    for (; i < w; i++) {
        dst[i] = dst[i] + src[i];
    }
}

/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char pixelsv1B, pixelsv2B;
    register vector unsigned char pixelsv1C, pixelsv2C;
    register vector unsigned char pixelsv1D, pixelsv2D;

    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;
    register int line_size_2 = line_size << 1;
    register int line_size_3 = line_size + line_size_2;
    register int line_size_4 = line_size << 2;

POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
// hand-unrolling the loop by 4 gains about 15%
// minimum execution time goes from 74 to 60 cycles
// it's faster than -funroll-loops, but using
// -funroll-loops w/ this is bad - 74 cycles again.
// all this is on a 7450, tuning for the 7450
#if 0
    for (i = 0; i < h; i++) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        pixels+=line_size;
        block +=line_size;
    }
#else
    for (i = 0; i < h; i += 4) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(15, (unsigned char*)pixels);
        pixelsv1B = vec_ld(line_size, (unsigned char*)pixels);
        pixelsv2B = vec_ld(15 + line_size, (unsigned char*)pixels);
        pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels);
        pixelsv2C = vec_ld(15 + line_size_2, (unsigned char*)pixels);
        pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels);
        pixelsv2D = vec_ld(15 + line_size_3, (unsigned char*)pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
               line_size, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
               line_size_2, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
               line_size_3, (unsigned char*)block);
        pixels+=line_size_4;
        block +=line_size_4;
    }
#endif
POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
}

/* next one assumes that ((line_size % 16) == 0) */
#define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
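/* op_avg above is the usual SWAR rounding-up byte average:
 * (a | b) - (((a ^ b) & 0xFEFEFEFE) >> 1) == (a + b + 1) >> 1 per byte,
 * with the mask keeping the shift from leaking bits between bytes.
 * The AltiVec version below uses vec_avg() instead. */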
void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;

POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);

    for (i = 0; i < h; i++) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        blockv = vec_ld(0, block);
        pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
        blockv = vec_avg(blockv, pixelsv);
        vec_st(blockv, 0, (unsigned char*)block);
        pixels+=line_size;
        block +=line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
}

/* next one assumes that ((line_size % 8) == 0) */
void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    int i;

POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);

   for (i = 0; i < h; i++) {
       /* block is 8 bytes-aligned, so we're either in the
          left block (16 bytes-aligned) or in the right block (not) */
       int rightside = ((unsigned long)block & 0x0000000F);

       blockv = vec_ld(0, block);
       pixelsv1 = vec_ld(0, (unsigned char*)pixels);
       pixelsv2 = vec_ld(16, (unsigned char*)pixels);
       pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

       if (rightside) {
           pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
       } else {
           pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
       }

       blockv = vec_avg(blockv, pixelsv);

       vec_st(blockv, 0, block);

       pixels += line_size;
       block += line_size;
   }

POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
}

/* next one assumes that ((line_size % 8) == 0) */
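/* put_pixels8_xy2: each output byte is roughly
 *     (p[x] + p[x+1] + p[x+line_size] + p[x+line_size+1] + 2) >> 2,
 * computed on 16-bit lanes; the bottom-row pair sums of one line are
 * reused as the top-row pair sums of the next line. */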
void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
}

/* next one assumes that ((line_size % 8) == 0) */
void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vcone);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
}

/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short temp3, temp4,
        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vctwo);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
}

/* next one assumes that ((line_size % 16) == 0) */
void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short temp3, temp4,
        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vcone);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vcone);
        pixelssum1 = vec_add(pixelssum2, vcone);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
}

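/* SATD of an 8x8 block: the sum of absolute values of the 2D Hadamard
 * (butterfly) transform of (src - dst). ONEITERBUTTERFLY below performs the
 * horizontal transform of one row; the vertical butterflies and the final
 * sum of absolute values are done afterwards on the eight row vectors. */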
int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1);
    int sum;
    register const vector unsigned char vzero =
                            (const vector unsigned char)vec_splat_u8(0);
    register vector signed short temp0, temp1, temp2, temp3, temp4,
                                 temp5, temp6, temp7;
POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
    {
    register const vector signed short vprod1 =(const vector signed short)
                                               { 1,-1, 1,-1, 1,-1, 1,-1 };
    register const vector signed short vprod2 =(const vector signed short)
                                               { 1, 1,-1,-1, 1, 1,-1,-1 };
    register const vector signed short vprod3 =(const vector signed short)
                                               { 1, 1, 1, 1,-1,-1,-1,-1 };
    register const vector unsigned char perm1 = (const vector unsigned char)
        {0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
         0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D};
    register const vector unsigned char perm2 = (const vector unsigned char)
        {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
         0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B};
    register const vector unsigned char perm3 = (const vector unsigned char)
        {0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07};

#define ONEITERBUTTERFLY(i, res)                                          \
    {                                                                     \
    register vector unsigned char src1, src2, srcO;                   \
    register vector unsigned char dst1, dst2, dstO;                   \
    register vector signed short srcV, dstV;                          \
    register vector signed short but0, but1, but2, op1, op2, op3;     \
    src1 = vec_ld(stride * i, src);                                   \
    src2 = vec_ld((stride * i) + 15, src);                            \
    srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src));           \
    dst1 = vec_ld(stride * i, dst);                                   \
    dst2 = vec_ld((stride * i) + 15, dst);                            \
    dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));           \
    /* promote the unsigned chars to signed shorts */                 \
    /* we're in the 8x8 function, we only care for the first 8 */     \
    srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \
           (vector signed char)srcO);                                 \
    dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \
           (vector signed char)dstO);                                 \
    /* subtractions inside the first butterfly */                     \
    but0 = vec_sub(srcV, dstV);                                       \
    op1  = vec_perm(but0, but0, perm1);                               \
    but1 = vec_mladd(but0, vprod1, op1);                              \
    op2  = vec_perm(but1, but1, perm2);                               \
    but2 = vec_mladd(but1, vprod2, op2);                              \
    op3  = vec_perm(but2, but2, perm3);                               \
    res  = vec_mladd(but2, vprod3, op3);                              \
    }
    ONEITERBUTTERFLY(0, temp0);
    ONEITERBUTTERFLY(1, temp1);
    ONEITERBUTTERFLY(2, temp2);
    ONEITERBUTTERFLY(3, temp3);
    ONEITERBUTTERFLY(4, temp4);
    ONEITERBUTTERFLY(5, temp5);
    ONEITERBUTTERFLY(6, temp6);
    ONEITERBUTTERFLY(7, temp7);
    }
#undef ONEITERBUTTERFLY
    {
    register vector signed int vsum;
    register vector signed short line0 = vec_add(temp0, temp1);
    register vector signed short line1 = vec_sub(temp0, temp1);
    register vector signed short line2 = vec_add(temp2, temp3);
    register vector signed short line3 = vec_sub(temp2, temp3);
    register vector signed short line4 = vec_add(temp4, temp5);
    register vector signed short line5 = vec_sub(temp4, temp5);
    register vector signed short line6 = vec_add(temp6, temp7);
    register vector signed short line7 = vec_sub(temp6, temp7);

    register vector signed short line0B = vec_add(line0, line2);
    register vector signed short line2B = vec_sub(line0, line2);
    register vector signed short line1B = vec_add(line1, line3);
    register vector signed short line3B = vec_sub(line1, line3);
    register vector signed short line4B = vec_add(line4, line6);
    register vector signed short line6B = vec_sub(line4, line6);
    register vector signed short line5B = vec_add(line5, line7);
    register vector signed short line7B = vec_sub(line5, line7);

    register vector signed short line0C = vec_add(line0B, line4B);
    register vector signed short line4C = vec_sub(line0B, line4B);
    register vector signed short line1C = vec_add(line1B, line5B);
    register vector signed short line5C = vec_sub(line1B, line5B);
    register vector signed short line2C = vec_add(line2B, line6B);
    register vector signed short line6C = vec_sub(line2B, line6B);
    register vector signed short line3C = vec_add(line3B, line7B);
    register vector signed short line7C = vec_sub(line3B, line7B);

    vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
    vsum = vec_sum4s(vec_abs(line1C), vsum);
    vsum = vec_sum4s(vec_abs(line2C), vsum);
    vsum = vec_sum4s(vec_abs(line3C), vsum);
    vsum = vec_sum4s(vec_abs(line4C), vsum);
    vsum = vec_sum4s(vec_abs(line5C), vsum);
    vsum = vec_sum4s(vec_abs(line6C), vsum);
    vsum = vec_sum4s(vec_abs(line7C), vsum);
    vsum = vec_sums(vsum, (vector signed int)vzero);
    vsum = vec_splat(vsum, 3);
    vec_ste(vsum, 0, &sum);
    }
POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1);
    return sum;
}

/*
16x8 works with 16 elements; it avoids replicating loads and gives the
compiler more room for scheduling. It is only used from inside
hadamard8_diff16_altivec.

Unfortunately, gcc-3.3 is a bit dumb here: the compiled code contains a LOT of
spill code, because gcc (unlike xlc) cannot keep everything in registers by
itself. The following code therefore includes hand-made register allocation.
It is not clean, but on a 7450 the resulting code is much faster (the best
case falls from 700+ cycles to 550).

xlc does not add spill code, but it does not know how to schedule for the
7450, and its code is not much faster than gcc-3.3's on the 7450 (though it
uses 25% fewer instructions...).

On the 970, the hand-made register allocation is still a win (around 690 vs.
around 780 cycles), but xlc gets down to around 660 on the plain C code...
*/

static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) {
    int sum;
    register vector signed short
        temp0 REG_v(v0),
        temp1 REG_v(v1),
        temp2 REG_v(v2),
        temp3 REG_v(v3),
        temp4 REG_v(v4),
        temp5 REG_v(v5),
        temp6 REG_v(v6),
        temp7 REG_v(v7);
    register vector signed short
        temp0S REG_v(v8),
        temp1S REG_v(v9),
        temp2S REG_v(v10),
        temp3S REG_v(v11),
        temp4S REG_v(v12),
        temp5S REG_v(v13),
        temp6S REG_v(v14),
        temp7S REG_v(v15);
    register const vector unsigned char vzero REG_v(v31)=
        (const vector unsigned char)vec_splat_u8(0);
    {
    register const vector signed short vprod1 REG_v(v16)=
        (const vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1 };
    register const vector signed short vprod2 REG_v(v17)=
        (const vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1 };
    register const vector signed short vprod3 REG_v(v18)=
        (const vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1 };
    register const vector unsigned char perm1 REG_v(v19)=
        (const vector unsigned char)
        {0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
         0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D};
    register const vector unsigned char perm2 REG_v(v20)=
        (const vector unsigned char)
        {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
         0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B};
    register const vector unsigned char perm3 REG_v(v21)=
        (const vector unsigned char)
        {0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07};

#define ONEITERBUTTERFLY(i, res1, res2)                                   \
    {                                                                     \
    register vector unsigned char src1 REG_v(v22),                    \
                                  src2 REG_v(v23),                    \
                                  dst1 REG_v(v24),                    \
                                  dst2 REG_v(v25),                    \
                                  srcO REG_v(v22),                    \
                                  dstO REG_v(v23);                    \
                                                                      \
    register vector signed short  srcV REG_v(v24),                    \
                                  dstV REG_v(v25),                    \
                                  srcW REG_v(v26),                    \
                                  dstW REG_v(v27),                    \
                                  but0 REG_v(v28),                    \
                                  but0S REG_v(v29),                   \
                                  op1 REG_v(v30),                     \
                                  but1 REG_v(v22),                    \
                                  op1S REG_v(v23),                    \
                                  but1S REG_v(v24),                   \
                                  op2 REG_v(v25),                     \
                                  but2 REG_v(v26),                    \
                                  op2S REG_v(v27),                    \
                                  but2S REG_v(v28),                   \
                                  op3 REG_v(v29),                     \
                                  op3S REG_v(v30);                    \
                                                                      \
    src1 = vec_ld(stride * i, src);                                   \
    src2 = vec_ld((stride * i) + 16, src);                            \
    srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src));           \
    dst1 = vec_ld(stride * i, dst);                                   \
    dst2 = vec_ld((stride * i) + 16, dst);                            \
    dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));           \
    /* promote the unsigned chars to signed shorts */                 \
    srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \
           (vector signed char)srcO);                                 \
    dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \
           (vector signed char)dstO);                                 \
    srcW = (vector signed short)vec_mergel((vector signed char)vzero, \
           (vector signed char)srcO);                                 \
    dstW = (vector signed short)vec_mergel((vector signed char)vzero, \
           (vector signed char)dstO);                                 \
    /* subtractions inside the first butterfly */                     \
    but0 = vec_sub(srcV, dstV);                                       \
    but0S = vec_sub(srcW, dstW);                                      \
    op1 = vec_perm(but0, but0, perm1);                                \
    but1 = vec_mladd(but0, vprod1, op1);                              \
    op1S = vec_perm(but0S, but0S, perm1);                             \
    but1S = vec_mladd(but0S, vprod1, op1S);                           \
    op2 = vec_perm(but1, but1, perm2);                                \
    but2 = vec_mladd(but1, vprod2, op2);                              \
    op2S = vec_perm(but1S, but1S, perm2);                             \
    but2S = vec_mladd(but1S, vprod2, op2S);                           \
    op3 = vec_perm(but2, but2, perm3);                                \
    res1 = vec_mladd(but2, vprod3, op3);                              \
    op3S = vec_perm(but2S, but2S, perm3);                             \
    res2 = vec_mladd(but2S, vprod3, op3S);                            \
    }
    ONEITERBUTTERFLY(0, temp0, temp0S);
    ONEITERBUTTERFLY(1, temp1, temp1S);
    ONEITERBUTTERFLY(2, temp2, temp2S);
    ONEITERBUTTERFLY(3, temp3, temp3S);
    ONEITERBUTTERFLY(4, temp4, temp4S);
    ONEITERBUTTERFLY(5, temp5, temp5S);
    ONEITERBUTTERFLY(6, temp6, temp6S);
    ONEITERBUTTERFLY(7, temp7, temp7S);
    }
#undef ONEITERBUTTERFLY
    {
    register vector signed int vsum;
    register vector signed short line0S, line1S, line2S, line3S, line4S,
                                 line5S, line6S, line7S, line0BS,line2BS,
                                 line1BS,line3BS,line4BS,line6BS,line5BS,
                                 line7BS,line0CS,line4CS,line1CS,line5CS,
                                 line2CS,line6CS,line3CS,line7CS;

    register vector signed short line0 = vec_add(temp0, temp1);
    register vector signed short line1 = vec_sub(temp0, temp1);
    register vector signed short line2 = vec_add(temp2, temp3);
    register vector signed short line3 = vec_sub(temp2, temp3);
    register vector signed short line4 = vec_add(temp4, temp5);
    register vector signed short line5 = vec_sub(temp4, temp5);
    register vector signed short line6 = vec_add(temp6, temp7);
    register vector signed short line7 = vec_sub(temp6, temp7);

    register vector signed short line0B = vec_add(line0, line2);
    register vector signed short line2B = vec_sub(line0, line2);
    register vector signed short line1B = vec_add(line1, line3);
    register vector signed short line3B = vec_sub(line1, line3);
    register vector signed short line4B = vec_add(line4, line6);
    register vector signed short line6B = vec_sub(line4, line6);
    register vector signed short line5B = vec_add(line5, line7);
    register vector signed short line7B = vec_sub(line5, line7);

    register vector signed short line0C = vec_add(line0B, line4B);
    register vector signed short line4C = vec_sub(line0B, line4B);
    register vector signed short line1C = vec_add(line1B, line5B);
    register vector signed short line5C = vec_sub(line1B, line5B);
    register vector signed short line2C = vec_add(line2B, line6B);
    register vector signed short line6C = vec_sub(line2B, line6B);
    register vector signed short line3C = vec_add(line3B, line7B);
    register vector signed short line7C = vec_sub(line3B, line7B);

    vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
    vsum = vec_sum4s(vec_abs(line1C), vsum);
    vsum = vec_sum4s(vec_abs(line2C), vsum);
    vsum = vec_sum4s(vec_abs(line3C), vsum);
    vsum = vec_sum4s(vec_abs(line4C), vsum);
    vsum = vec_sum4s(vec_abs(line5C), vsum);
    vsum = vec_sum4s(vec_abs(line6C), vsum);
    vsum = vec_sum4s(vec_abs(line7C), vsum);

    line0S = vec_add(temp0S, temp1S);
    line1S = vec_sub(temp0S, temp1S);
    line2S = vec_add(temp2S, temp3S);
    line3S = vec_sub(temp2S, temp3S);
    line4S = vec_add(temp4S, temp5S);
    line5S = vec_sub(temp4S, temp5S);
    line6S = vec_add(temp6S, temp7S);
    line7S = vec_sub(temp6S, temp7S);

    line0BS = vec_add(line0S, line2S);
    line2BS = vec_sub(line0S, line2S);
    line1BS = vec_add(line1S, line3S);
    line3BS = vec_sub(line1S, line3S);
    line4BS = vec_add(line4S, line6S);
    line6BS = vec_sub(line4S, line6S);
    line5BS = vec_add(line5S, line7S);
    line7BS = vec_sub(line5S, line7S);

    line0CS = vec_add(line0BS, line4BS);
    line4CS = vec_sub(line0BS, line4BS);
    line1CS = vec_add(line1BS, line5BS);
    line5CS = vec_sub(line1BS, line5BS);
    line2CS = vec_add(line2BS, line6BS);
    line6CS = vec_sub(line2BS, line6BS);
    line3CS = vec_add(line3BS, line7BS);
    line7CS = vec_sub(line3BS, line7BS);

    vsum = vec_sum4s(vec_abs(line0CS), vsum);
    vsum = vec_sum4s(vec_abs(line1CS), vsum);
    vsum = vec_sum4s(vec_abs(line2CS), vsum);
    vsum = vec_sum4s(vec_abs(line3CS), vsum);
    vsum = vec_sum4s(vec_abs(line4CS), vsum);
    vsum = vec_sum4s(vec_abs(line5CS), vsum);
    vsum = vec_sum4s(vec_abs(line6CS), vsum);
    vsum = vec_sum4s(vec_abs(line7CS), vsum);
    vsum = vec_sums(vsum, (vector signed int)vzero);
    vsum = vec_splat(vsum, 3);
    vec_ste(vsum, 0, &sum);
    }
    return sum;
}

int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
POWERPC_PERF_DECLARE(altivec_hadamard8_diff16_num, 1);
    int score;
POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1);
    score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
    if (h==16) {
        dst += 8*stride;
        src += 8*stride;
        score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
    }
POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1);
    return score;
}

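/* Vorbis magnitude/angle inverse coupling; the scalar behaviour is roughly:
 *     if (mag > 0) { if (ang > 0) ang = mag - ang;
 *                    else         { t = ang; ang = mag; mag += t; } }
 *     else         { if (ang > 0) ang += mag;
 *                    else         { t = ang; ang = mag; mag -= t; } }
 * The vector code below does the same branch-free, using sign flips and
 * compare masks. */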
static void vorbis_inverse_coupling_altivec(float *mag, float *ang,
                                            int blocksize)
{
    int i;
    vector float m, a;
    vector bool int t0, t1;
    const vector unsigned int v_31 = //XXX
        vec_add(vec_add(vec_splat_u32(15),vec_splat_u32(15)),vec_splat_u32(1));
    for (i = 0; i < blocksize; i += 4) {
        m = vec_ld(0, mag+i);
        a = vec_ld(0, ang+i);
        t0 = vec_cmple(m, (vector float)vec_splat_u32(0));
        t1 = vec_cmple(a, (vector float)vec_splat_u32(0));
        a = vec_xor(a, (vector float) vec_sl((vector unsigned int)t0, v_31));
        t0 = (vector bool int)vec_and(a, t1);
        t1 = (vector bool int)vec_andc(a, t1);
        a = vec_sub(m, (vector float)t1);
        m = vec_add(m, (vector float)t0);
        vec_stl(a, 0, ang+i);
        vec_stl(m, 0, mag+i);
    }
}

/* next one assumes that ((line_size % 8) == 0) */
void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1);
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2, blocktemp;
    register vector unsigned short pixelssum1, pixelssum2, temp3;

    register const vector unsigned char vczero = (const vector unsigned char)
                                        vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)
                                        vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        blockv = vec_avg(blocktemp, blockv);
        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);
}

void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx)
{
    c->pix_abs[0][1] = sad16_x2_altivec;
    c->pix_abs[0][2] = sad16_y2_altivec;
    c->pix_abs[0][3] = sad16_xy2_altivec;
    c->pix_abs[0][0] = sad16_altivec;
    c->pix_abs[1][0] = sad8_altivec;
    c->sad[0]= sad16_altivec;
    c->sad[1]= sad8_altivec;
    c->pix_norm1 = pix_norm1_altivec;
    c->sse[1]= sse8_altivec;
    c->sse[0]= sse16_altivec;
    c->pix_sum = pix_sum_altivec;
    c->diff_pixels = diff_pixels_altivec;
    c->get_pixels = get_pixels_altivec;
    c->clear_block = clear_block_altivec;
    c->add_bytes= add_bytes_altivec;
    c->put_pixels_tab[0][0] = put_pixels16_altivec;
    /* the two functions do the same thing, so use the same code */
    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
    c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
    c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
    c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
    c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
    c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;

    c->hadamard8_diff[0] = hadamard8_diff16_altivec;
    c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
    if (CONFIG_VORBIS_DECODER)
        c->vorbis_inverse_coupling = vorbis_inverse_coupling_altivec;
}