1/*
2 * Copyright (c) 2002 Brian Foley
3 * Copyright (c) 2002 Dieter Shirley
4 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23#include "config.h"
24
25#if HAVE_ALTIVEC_H
26#include <altivec.h>
27#endif
28
29#include "libavutil/attributes.h"
30#include "libavutil/cpu.h"
31#include "libavutil/ppc/cpu.h"
32#include "libavutil/ppc/types_altivec.h"
33#include "libavutil/ppc/util_altivec.h"
34#include "libavcodec/hpeldsp.h"
35#include "hpeldsp_altivec.h"
36
37#if HAVE_ALTIVEC
38/* next one assumes that ((line_size % 16) == 0) */
39void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
40{
41    register vector unsigned char pixelsv1, pixelsv2;
42    register vector unsigned char pixelsv1B, pixelsv2B;
43    register vector unsigned char pixelsv1C, pixelsv2C;
44    register vector unsigned char pixelsv1D, pixelsv2D;
45
46    register vector unsigned char perm = vec_lvsl(0, pixels);
47    int i;
48    register ptrdiff_t line_size_2 = line_size << 1;
49    register ptrdiff_t line_size_3 = line_size + line_size_2;
50    register ptrdiff_t line_size_4 = line_size << 2;
51
52// hand-unrolling the loop by 4 gains about 15%
53// mininum execution time goes from 74 to 60 cycles
54// it's faster than -funroll-loops, but using
55// -funroll-loops w/ this is bad - 74 cycles again.
56// all this is on a 7450, tuning for the 7450
57    for (i = 0; i < h; i += 4) {
58        pixelsv1  = vec_ld( 0, pixels);
59        pixelsv2  = vec_ld(15, pixels);
60        pixelsv1B = vec_ld(line_size, pixels);
61        pixelsv2B = vec_ld(15 + line_size, pixels);
62        pixelsv1C = vec_ld(line_size_2, pixels);
63        pixelsv2C = vec_ld(15 + line_size_2, pixels);
64        pixelsv1D = vec_ld(line_size_3, pixels);
65        pixelsv2D = vec_ld(15 + line_size_3, pixels);
66        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
67               0, (unsigned char*)block);
68        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
69               line_size, (unsigned char*)block);
70        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
71               line_size_2, (unsigned char*)block);
72        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
73               line_size_3, (unsigned char*)block);
74        pixels+=line_size_4;
75        block +=line_size_4;
76    }
77}
78
79/* next one assumes that ((line_size % 16) == 0) */
80#define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
81void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
82{
83    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
84    register vector unsigned char perm = vec_lvsl(0, pixels);
85    int i;
86
87    for (i = 0; i < h; i++) {
88        pixelsv1 = vec_ld( 0, pixels);
89        pixelsv2 = vec_ld(16,pixels);
90        blockv = vec_ld(0, block);
91        pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
92        blockv = vec_avg(blockv,pixelsv);
93        vec_st(blockv, 0, (unsigned char*)block);
94        pixels+=line_size;
95        block +=line_size;
96    }
97}
98
99/* next one assumes that ((line_size % 8) == 0) */
100static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
101{
102    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
103    int i;
104
105   for (i = 0; i < h; i++) {
106       /* block is 8 bytes-aligned, so we're either in the
107          left block (16 bytes-aligned) or in the right block (not) */
108       int rightside = ((unsigned long)block & 0x0000000F);
109
110       blockv = vec_ld(0, block);
111       pixelsv1 = vec_ld( 0, pixels);
112       pixelsv2 = vec_ld(16, pixels);
113       pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
114
115       if (rightside) {
116           pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
117       } else {
118           pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
119       }
120
121       blockv = vec_avg(blockv, pixelsv);
122
123       vec_st(blockv, 0, block);
124
125       pixels += line_size;
126       block += line_size;
127   }
128}
129
130/* next one assumes that ((line_size % 8) == 0) */
static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    /* 8xh half-pel interpolation in both x and y, with rounding:
     *   dst[x] = (p[x] + p[x+1] + p[x+ls] + p[x+ls+1] + 2) >> 2
     * Each source row's horizontal pair-sum is widened to u16 and
     * carried into the next iteration (pixelssum1), so every source
     * row is loaded and widened exactly once. */
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    /* Prime the pipeline: unaligned loads of row 0 at offsets 0 and +1. */
    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
        /* pixels+1 is 16-byte aligned: vec_lvsl(1, pixels) would wrap
           to shift 0 and wrongly select temp1, so the row shifted by
           one byte is exactly the second aligned vector. */
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    /* Zero-extend the 8 leading bytes of each vector to u16. */
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    /* Fold the +2 rounding bias into the carried pair-sum. */
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        /* block is only 8-byte aligned: nonzero means this row sits in
           the high (right) half of its containing 16-byte vector. */
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        /* Load the next source row, again at offsets 0 and +1. */
        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
            /* Same aligned special case as above, for pixels+ls+1. */
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        /* (previous pair-sum + 2) + current pair-sum, then >> 2. */
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        /* Carry the current pair-sum (with rounding bias) forward. */
        pixelssum1 = vec_add(pixelssum2, vctwo);
        /* Narrow back to bytes; only the low 8 results matter. */
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        /* Splice the 8 result bytes into the correct half of the
           destination vector, leaving the other half untouched. */
        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
188
189/* next one assumes that ((line_size % 8) == 0) */
static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    /* 8xh half-pel interpolation in both x and y, no-rounding variant:
     *   dst[x] = (p[x] + p[x+1] + p[x+ls] + p[x+ls+1] + 1) >> 2
     * Identical structure to put_pixels8_xy2_altivec, but the bias
     * folded into the carried pair-sum is 1 (vcone) instead of 2. */
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    /* Prime the pipeline: unaligned loads of row 0 at offsets 0 and +1. */
    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
        /* pixels+1 is 16-byte aligned: vec_lvsl(1, pixels) would wrap
           to shift 0, so the shifted row is exactly temp2. */
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    /* Zero-extend the 8 leading bytes of each vector to u16. */
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    /* Fold the +1 (truncating) bias into the carried pair-sum. */
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h ; i++) {
        /* Nonzero when this row sits in the high half of its vector. */
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        /* Load the next source row at offsets 0 and +1. */
        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        /* (previous pair-sum + 1) + current pair-sum, then >> 2. */
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        /* Carry the current pair-sum (with bias) forward. */
        pixelssum1 = vec_add(pixelssum2, vcone);
        /* Narrow back to bytes; only the low 8 results matter. */
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        /* Splice the result into the correct half of the destination. */
        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
248
249/* next one assumes that ((line_size % 16) == 0) */
static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
{
    /* 16xh half-pel interpolation in both x and y, with rounding:
     *   dst[x] = (p[x] + p[x+1] + p[x+ls] + p[x+ls+1] + 2) >> 2
     * The 16 pixels are processed as two u16 half-vectors: mergeh
     * widens bytes 0-7 (tracked in pixelssum1), mergel widens bytes
     * 8-15 (tracked in pixelssum3). Each row's pair-sums are carried
     * into the next iteration so every row is loaded only once. */
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short temp3, temp4,
        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    /* Prime the pipeline: unaligned loads of row 0 at offsets 0 and +1. */
    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
        /* pixels+1 is 16-byte aligned: vec_lvsl(1, pixels) would wrap
           to shift 0, so the shifted row is exactly temp2. */
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    /* Widen high half (bytes 8-15) and low half (bytes 0-7) to u16. */
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    /* Fold the +2 rounding bias into both carried pair-sums. */
    pixelssum3 = vec_add(pixelssum3, vctwo);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        /* Load the next source row at offsets 0 and +1. */
        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        /* (previous pair-sum + 2) + current pair-sum, >> 2, per half. */
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        /* Carry the current pair-sums (with rounding bias) forward. */
        pixelssum3 = vec_add(pixelssum4, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);

        /* Pack the two u16 halves back into 16 result bytes. */
        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
316
317/* next one assumes that ((line_size % 16) == 0) */
static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
{
    /* 16xh half-pel interpolation in both x and y, no-rounding variant:
     *   dst[x] = (p[x] + p[x+1] + p[x+ls] + p[x+ls+1] + 1) >> 2
     * Identical structure to put_pixels16_xy2_altivec, but the bias
     * folded into the carried pair-sums is 1 (vcone) instead of 2. */
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short temp3, temp4,
        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    /* Prime the pipeline: unaligned loads of row 0 at offsets 0 and +1. */
    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
        /* pixels+1 is 16-byte aligned: vec_lvsl(1, pixels) would wrap
           to shift 0, so the shifted row is exactly temp2. */
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    /* Widen high half (bytes 8-15) and low half (bytes 0-7) to u16. */
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    /* Fold the +1 (truncating) bias into both carried pair-sums. */
    pixelssum3 = vec_add(pixelssum3, vcone);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        /* Load the next source row at offsets 0 and +1. */
        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        /* (previous pair-sum + 1) + current pair-sum, >> 2, per half. */
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        /* Carry the current pair-sums (with bias) forward. */
        pixelssum3 = vec_add(pixelssum4, vcone);
        pixelssum1 = vec_add(pixelssum2, vcone);

        /* Pack the two u16 halves back into 16 result bytes. */
        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
385
386/* next one assumes that ((line_size % 8) == 0) */
static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    /* 8xh half-pel interpolation in both x and y, averaged into the
     * destination: first compute the rounded 4-tap value
     *   t = (p[x] + p[x+1] + p[x+ls] + p[x+ls+1] + 2) >> 2
     * exactly as put_pixels8_xy2_altivec does, then combine it with
     * the existing destination via vec_avg: dst = (dst + t + 1) >> 1. */
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2, blocktemp;
    register vector unsigned short pixelssum1, pixelssum2, temp3;

    register const vector unsigned char vczero = (const vector unsigned char)
                                        vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)
                                        vec_splat_u16(2);

    /* Prime the pipeline: unaligned loads of row 0 at offsets 0 and +1. */
    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
        /* pixels+1 is 16-byte aligned: vec_lvsl(1, pixels) would wrap
           to shift 0, so the shifted row is exactly temp2. */
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    /* Zero-extend the 8 leading bytes of each vector to u16. */
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    /* Fold the +2 rounding bias into the carried pair-sum. */
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        /* Nonzero when this row sits in the high half of its vector. */
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        /* Load the next source row at offsets 0 and +1. */
        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        /* (previous pair-sum + 2) + current pair-sum, then >> 2. */
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        /* Carry the current pair-sum (with rounding bias) forward. */
        pixelssum1 = vec_add(pixelssum2, vctwo);
        /* Narrow back to bytes; only the low 8 results matter. */
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        /* Splice the interpolated bytes into the half of the vector
           that holds this row; the other half is taken from blockv so
           the subsequent vec_avg leaves it unchanged. */
        if (rightside) {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        blockv = vec_avg(blocktemp, blockv);
        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
448#endif /* HAVE_ALTIVEC */
449
450av_cold void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags)
451{
452#if HAVE_ALTIVEC
453    if (!PPC_ALTIVEC(av_get_cpu_flags()))
454        return;
455
456    c->avg_pixels_tab[0][0]        = ff_avg_pixels16_altivec;
457    c->avg_pixels_tab[1][0]        = avg_pixels8_altivec;
458    c->avg_pixels_tab[1][3]        = avg_pixels8_xy2_altivec;
459
460    c->put_pixels_tab[0][0]        = ff_put_pixels16_altivec;
461    c->put_pixels_tab[1][3]        = put_pixels8_xy2_altivec;
462    c->put_pixels_tab[0][3]        = put_pixels16_xy2_altivec;
463
464    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_altivec;
465    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
466    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
467#endif /* HAVE_ALTIVEC */
468}
469