/*
 * AltiVec-optimized snow DSP utils
 * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/dsputil.h"
#include "libavcodec/snow.h"

#include "gcc_fixes.h"
#include "dsputil_altivec.h"

#undef NDEBUG
#include <assert.h>



//FIXME remove this replication
#define slice_buffer_get_line(slice_buf, line_num) ((slice_buf)->line[line_num] ? (slice_buf)->line[line_num] : slice_buffer_load_line((slice_buf), (line_num)))

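/* Duplicated from snow.c (see the FIXME above): return the buffer already
 * attached to the requested slice line, or pop a spare line off the data
 * stack and attach it. */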
static DWTELEM * slice_buffer_load_line(slice_buffer * buf, int line)
{
    int offset;
    DWTELEM * buffer;

//  av_log(NULL, AV_LOG_DEBUG, "Cache hit: %d\n", line);

    assert(buf->data_stack_top >= 0);
//  assert(!buf->line[line]);
    if (buf->line[line])
        return buf->line[line];

    offset = buf->line_width * line;
    buffer = buf->data_stack[buf->data_stack_top];
    buf->data_stack_top--;
    buf->line[line] = buffer;

//  av_log(NULL, AV_LOG_DEBUG, "slice_buffer_load_line: line: %d remaining: %d\n", line, buf->data_stack_top + 1);

    return buffer;
}


//altivec code

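/* Horizontal 9/7 inverse lifting. The whole body is disabled with #if 0:
 * it appears to be a partial conversion to the 16-bit IDWTELEM interface
 * and still mixes vector signed short variables with vector signed int
 * casts and 32-bit splats, so it cannot be enabled as is. */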
void ff_snow_horizontal_compose97i_altivec(IDWTELEM *b, int width)
{
#if 0
    const int w2= (width+1)>>1;
    DECLARE_ALIGNED_16(IDWTELEM, temp[(width>>1)]);
    const int w_l= (width>>1);
    const int w_r= w2 - 1;
    int i;
    vector signed short t1, t2, x, y, tmp1, tmp2;
    vector signed short *vbuf, *vtmp;
    vector unsigned char align;

    { // Lift 0
        IDWTELEM * const ref = b + w2 - 1;
        IDWTELEM b_0 = b[0];
        vector signed short v7 = vec_splat_s16(7);
        vbuf = (vector signed short *)b;

        tmp1 = vec_ld (0, ref);
        align = vec_lvsl (0, ref);
        tmp2 = vec_ld (15, ref);
        t1 = vec_perm(tmp1, tmp2, align);

        for (i=0; i<w_l-15; i+=16) {
#if 0
/*        b[i+0] = b[i+0] - ((3 * (ref[i+0] + ref[i+1]) + 4) >> 3);
        b[i+1] = b[i+1] - ((3 * (ref[i+1] + ref[i+2]) + 4) >> 3);
        b[i+2] = b[i+2] - ((3 * (ref[i+2] + ref[i+3]) + 4) >> 3);
        b[i+3] = b[i+3] - ((3 * (ref[i+3] + ref[i+4]) + 4) >> 3);*/
        b[i+0] = b[i+0] + ((7 * (ref[i+0] + ref[i+1])-1) >> 8);
#else

        tmp1 = vec_ld (0, ref+8+i);
        tmp2 = vec_ld (15, ref+8+i);

        t2 = vec_perm(tmp1, tmp2, align);

        y = vec_add(t1, vec_sld(t1,t2,2));
//        y = vec_add(vec_add(y,y),y);

        tmp1 = vec_ld (0, ref+12+i);

        y = vec_add(y, vec_splat_s32(4));
        y = vec_sra(y, vec_splat_u32(3));

        tmp2 = vec_ld (15, ref+12+i);

        *vbuf = vec_sub(*vbuf, y);

        t1 = t2;

        vbuf++;

        t2 = vec_perm(tmp1, tmp2, align);

        y = vec_add(t1,vec_sld(t1,t2,4));
        y = vec_add(vec_add(y,y),y);

        tmp1 = vec_ld (0, ref+12+i);

        y = vec_add(y, vec_splat_s32(4));
        y = vec_sra(y, vec_splat_u32(3));

        tmp2 = vec_ld (15, ref+12+i);

        *vbuf = vec_sub(*vbuf, y);

        t1=t2;

        vbuf++;

        t2 = vec_perm(tmp1, tmp2, align);

        y = vec_add(t1,vec_sld(t1,t2,4));
        y = vec_add(vec_add(y,y),y);

        tmp1 = vec_ld (0, ref+16+i);

        y = vec_add(y, vec_splat_s32(4));
        y = vec_sra(y, vec_splat_u32(3));

        tmp2 = vec_ld (15, ref+16+i);

        *vbuf = vec_sub(*vbuf, y);

        t1=t2;

        t2 = vec_perm(tmp1, tmp2, align);

        y = vec_add(t1,vec_sld(t1,t2,4));
        y = vec_add(vec_add(y,y),y);

        vbuf++;

        y = vec_add(y, vec_splat_s32(4));
        y = vec_sra(y, vec_splat_u32(3));
        *vbuf = vec_sub(*vbuf, y);

        t1=t2;

        vbuf++;

#endif

        }

        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
        b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
    }

    { // Lift 1
        DWTELEM * const dst = b+w2;

        i = 0;
        for(; (((long)&dst[i]) & 0xF) && i<w_r; i++){
            dst[i] = dst[i] - (b[i] + b[i + 1]);
        }

        align = vec_lvsl(0, b+i);
        tmp1 = vec_ld(0, b+i);
        vbuf = (vector signed int*) (dst + i);
        tmp2 = vec_ld(15, b+i);

        t1 = vec_perm(tmp1, tmp2, align);

        for (; i<w_r-3; i+=4) {

#if 0
            dst[i]   = dst[i]   - (b[i]   + b[i + 1]);
            dst[i+1] = dst[i+1] - (b[i+1] + b[i + 2]);
            dst[i+2] = dst[i+2] - (b[i+2] + b[i + 3]);
            dst[i+3] = dst[i+3] - (b[i+3] + b[i + 4]);
#else

        tmp1 = vec_ld(0, b+4+i);
        tmp2 = vec_ld(15, b+4+i);

        t2 = vec_perm(tmp1, tmp2, align);

        y = vec_add(t1, vec_sld(t1,t2,4));
        *vbuf = vec_sub (*vbuf, y);

        vbuf++;

        t1 = t2;

#endif

        }

        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
    }

    { // Lift 2
        DWTELEM * const ref = b+w2 - 1;
        DWTELEM b_0 = b[0];
        vbuf= (vector signed int *) b;

        tmp1 = vec_ld (0, ref);
        align = vec_lvsl (0, ref);
        tmp2 = vec_ld (15, ref);
        t1= vec_perm(tmp1, tmp2, align);

        i = 0;
        for (; i<w_l-15; i+=16) {
#if 0
            b[i]   = b[i]   - (((8 -(ref[i]   + ref[i+1])) - (b[i]  <<2)) >> 4);
            b[i+1] = b[i+1] - (((8 -(ref[i+1] + ref[i+2])) - (b[i+1]<<2)) >> 4);
            b[i+2] = b[i+2] - (((8 -(ref[i+2] + ref[i+3])) - (b[i+2]<<2)) >> 4);
            b[i+3] = b[i+3] - (((8 -(ref[i+3] + ref[i+4])) - (b[i+3]<<2)) >> 4);
#else
            tmp1 = vec_ld (0, ref+4+i);
            tmp2 = vec_ld (15, ref+4+i);

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_sub(vec_splat_s32(8),y);

            tmp1 = vec_ld (0, ref+8+i);

            x = vec_sl(*vbuf,vec_splat_u32(2));
            y = vec_sra(vec_sub(y,x),vec_splat_u32(4));

            tmp2 = vec_ld (15, ref+8+i);

            *vbuf = vec_sub( *vbuf, y);

            t1 = t2;

            vbuf++;

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_sub(vec_splat_s32(8),y);

            tmp1 = vec_ld (0, ref+12+i);

            x = vec_sl(*vbuf,vec_splat_u32(2));
            y = vec_sra(vec_sub(y,x),vec_splat_u32(4));

            tmp2 = vec_ld (15, ref+12+i);

            *vbuf = vec_sub( *vbuf, y);

            t1 = t2;

            vbuf++;

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_sub(vec_splat_s32(8),y);

            tmp1 = vec_ld (0, ref+16+i);

            x = vec_sl(*vbuf,vec_splat_u32(2));
            y = vec_sra(vec_sub(y,x),vec_splat_u32(4));

            tmp2 = vec_ld (15, ref+16+i);

            *vbuf = vec_sub( *vbuf, y);

            t1 = t2;

            vbuf++;

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_sub(vec_splat_s32(8),y);

            t1 = t2;

            x = vec_sl(*vbuf,vec_splat_u32(2));
            y = vec_sra(vec_sub(y,x),vec_splat_u32(4));
            *vbuf = vec_sub( *vbuf, y);

            vbuf++;

#endif
        }

        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
        b[0] = b_0 - (((-2 * ref[1] + W_BO) - 4 * b_0) >> W_BS);
    }

    { // Lift 3
        DWTELEM * const src = b+w2;

        vbuf = (vector signed int *)b;
        vtmp = (vector signed int *)temp;

        i = 0;
        align = vec_lvsl(0, src);

        for (; i<w_r-3; i+=4) {
#if 0
            temp[i] = src[i] - ((-3*(b[i] + b[i+1]))>>1);
            temp[i+1] = src[i+1] - ((-3*(b[i+1] + b[i+2]))>>1);
            temp[i+2] = src[i+2] - ((-3*(b[i+2] + b[i+3]))>>1);
            temp[i+3] = src[i+3] - ((-3*(b[i+3] + b[i+4]))>>1);
#else
            tmp1 = vec_ld(0,src+i);
            t1 = vec_add(vbuf[0],vec_sld(vbuf[0],vbuf[1],4));
            tmp2 = vec_ld(15,src+i);
            t1 = vec_sub(vec_splat_s32(0),t1); //bad!
            t1 = vec_add(t1,vec_add(t1,t1));
            t2 = vec_perm(tmp1 ,tmp2 ,align);
            t1 = vec_sra(t1,vec_splat_u32(1));
            vbuf++;
            *vtmp = vec_sub(t2,t1);
            vtmp++;

#endif

        }

        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -3, 0, 1);
    }

    {
    //Interleave
        int a;
        vector signed int *t = (vector signed int *)temp,
                          *v = (vector signed int *)b;

        snow_interleave_line_header(&i, width, b, temp);

        for (; (i & 0xE) != 0xE; i-=2){
            b[i+1] = temp[i>>1];
            b[i] = b[i>>1];
        }
        for (i-=14; i>=0; i-=16){
           a=i/4;

           v[a+3]=vec_mergel(v[(a>>1)+1],t[(a>>1)+1]);
           v[a+2]=vec_mergeh(v[(a>>1)+1],t[(a>>1)+1]);
           v[a+1]=vec_mergel(v[a>>1],t[a>>1]);
           v[a]=vec_mergeh(v[a>>1],t[a>>1]);

        }

    }
#endif
}

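/* Vertical 9/7 inverse lifting: each iteration applies the four lifting
 * steps to four 32-bit coefficients at once (the #if 0 block inside the
 * loop shows the per-sample reference); the scalar loop afterwards handles
 * the remaining width % 4 samples. */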
void ff_snow_vertical_compose97i_altivec(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width)
{
    int i, w4 = width/4;
    vector signed int *v0, *v1,*v2,*v3,*v4,*v5;
    vector signed int t1, t2;

    v0=(vector signed int *)b0;
    v1=(vector signed int *)b1;
    v2=(vector signed int *)b2;
    v3=(vector signed int *)b3;
    v4=(vector signed int *)b4;
    v5=(vector signed int *)b5;

    for (i=0; i< w4;i++) {

    #if 0
        b4[i] -= (3*(b3[i] + b5[i])+4)>>3;
        b3[i] -= ((b2[i] + b4[i]));
        b2[i] += ((b1[i] + b3[i])+4*b2[i]+8)>>4;
        b1[i] += (3*(b0[i] + b2[i]))>>1;
    #else
        t1 = vec_add(v3[i], v5[i]);
        t2 = vec_add(t1, vec_add(t1,t1));
        t1 = vec_add(t2, vec_splat_s32(4));
        v4[i] = vec_sub(v4[i], vec_sra(t1,vec_splat_u32(3)));

        v3[i] = vec_sub(v3[i], vec_add(v2[i], v4[i]));

        t1 = vec_add(vec_splat_s32(8), vec_add(v1[i], v3[i]));
        t2 = vec_sl(v2[i], vec_splat_u32(2));
        v2[i] = vec_add(v2[i], vec_sra(vec_add(t1,t2),vec_splat_u32(4)));
        t1 = vec_add(v0[i], v2[i]);
        t2 = vec_add(t1, vec_add(t1,t1));
        v1[i] = vec_add(v1[i], vec_sra(t2,vec_splat_u32(1)));

    #endif
    }

    for(i*=4; i < width; i++)
    {
        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
    }
}

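/* Load one row of each of the four prediction blocks (block[3]..block[0])
 * with the vec_ld/vec_lvsl/vec_perm idiom for unaligned sources. */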
#define LOAD_BLOCKS \
            tmp1 = vec_ld(0, &block[3][y*src_stride]);\
            align = vec_lvsl(0, &block[3][y*src_stride]);\
            tmp2 = vec_ld(15, &block[3][y*src_stride]);\
\
            b3 = vec_perm(tmp1,tmp2,align);\
\
            tmp1 = vec_ld(0, &block[2][y*src_stride]);\
            align = vec_lvsl(0, &block[2][y*src_stride]);\
            tmp2 = vec_ld(15, &block[2][y*src_stride]);\
\
            b2 = vec_perm(tmp1,tmp2,align);\
\
            tmp1 = vec_ld(0, &block[1][y*src_stride]);\
            align = vec_lvsl(0, &block[1][y*src_stride]);\
            tmp2 = vec_ld(15, &block[1][y*src_stride]);\
\
            b1 = vec_perm(tmp1,tmp2,align);\
\
            tmp1 = vec_ld(0, &block[0][y*src_stride]);\
            align = vec_lvsl(0, &block[0][y*src_stride]);\
            tmp2 = vec_ld(15, &block[0][y*src_stride]);\
\
            b0 = vec_perm(tmp1,tmp2,align);

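/* Load the four OBMC weight rows (obmc1..obmc4) with the same
 * unaligned-load idiom. */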
#define LOAD_OBMCS \
            tmp1 = vec_ld(0, obmc1);\
            align = vec_lvsl(0, obmc1);\
            tmp2 = vec_ld(15, obmc1);\
\
            ob1 = vec_perm(tmp1,tmp2,align);\
\
            tmp1 = vec_ld(0, obmc2);\
            align = vec_lvsl(0, obmc2);\
            tmp2 = vec_ld(15, obmc2);\
\
            ob2 = vec_perm(tmp1,tmp2,align);\
\
            tmp1 = vec_ld(0, obmc3);\
            align = vec_lvsl(0, obmc3);\
            tmp2 = vec_ld(15, obmc3);\
\
            ob3 = vec_perm(tmp1,tmp2,align);\
\
            tmp1 = vec_ld(0, obmc4);\
            align = vec_lvsl(0, obmc4);\
            tmp2 = vec_ld(15, obmc4);\
\
            ob4 = vec_perm(tmp1,tmp2,align);

/* interleave logic
 * h1 <- [ a,b,a,b, a,b,a,b, a,b,a,b, a,b,a,b ]
 * h2 <- [ c,d,c,d, c,d,c,d, c,d,c,d, c,d,c,d ]
 * h  <- [ a,b,c,d, a,b,c,d, a,b,c,d, a,b,c,d ]
 */
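/* After these merges, each group of four bytes in ih/il holds the four OBMC
 * weights and the four block samples that belong to one output pixel, so
 * each 32-bit lane produced by vec_msum in STEPS_0_1/STEPS_2_3 is the dot
 * product
 *   obmc1[x]*block[3][...] + obmc2[x]*block[2][...]
 * + obmc3[x]*block[1][...] + obmc4[x]*block[0][...],
 * i.e. the weighted sum computed per pixel by the generic C
 * ff_snow_inner_add_yblock. */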

#define STEPS_0_1\
            h1 = (vector unsigned short)\
                 vec_mergeh(ob1, ob2);\
\
            h2 = (vector unsigned short)\
                 vec_mergeh(ob3, ob4);\
\
            ih = (vector unsigned char)\
                 vec_mergeh(h1,h2);\
\
            l1 = (vector unsigned short) vec_mergeh(b3, b2);\
\
            ih1 = (vector unsigned char) vec_mergel(h1, h2);\
\
            l2 = (vector unsigned short) vec_mergeh(b1, b0);\
\
            il = (vector unsigned char) vec_mergeh(l1, l2);\
\
            v[0] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));\
\
            il1 = (vector unsigned char) vec_mergel(l1, l2);\
\
            v[1] = (vector signed int) vec_msum(ih1, il1, vec_splat_u32(0));

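/* Scalar final step, used when the destination slice line is not suitably
 * aligned for vector stores: either add the weighted sums into the line,
 * round by FRAC_BITS, clamp to 0..255 and store to dst8, or subtract the
 * sums from the line when add is 0. */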
#define FINAL_STEP_SCALAR\
        for(x=0; x<b_w; x++)\
            if(add){\
                vbuf[x] += dst[x + src_x];\
                vbuf[x] = (vbuf[x] + (1<<(FRAC_BITS-1))) >> FRAC_BITS;\
                if(vbuf[x]&(~255)) vbuf[x]= ~(vbuf[x]>>31);\
                dst8[x + y*src_stride] = vbuf[x];\
            }else{\
                dst[x + src_x] -= vbuf[x];\
            }

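/* 8-pixel-wide blocks, 16-wide OBMC window, scalar final step (used for the
 * unaligned src_x case, see ff_snow_inner_add_yblock_altivec). */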
static void inner_add_yblock_bw_8_obmc_16_altivec(uint8_t *obmc,
                                             const int obmc_stride,
                                             uint8_t * * block, int b_w,
                                             int b_h, int src_x, int src_y,
                                             int src_stride, slice_buffer * sb,
                                             int add, uint8_t * dst8)
{
    int y, x;
    DWTELEM * dst;
    vector unsigned short h1, h2, l1, l2;
    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
    vector unsigned char b0,b1,b2,b3;
    vector unsigned char ob1,ob2,ob3,ob4;

    DECLARE_ALIGNED_16(int, vbuf[16]);
    vector signed int *v = (vector signed int *)vbuf, *d;

    for(y=0; y<b_h; y++){
        //FIXME ugly misuse of obmc_stride

        uint8_t *obmc1= obmc + y*obmc_stride;
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);

        dst = slice_buffer_get_line(sb, src_y + y);
        d = (vector signed int *)(dst + src_x);

//FIXME i could avoid some loads!

        // load blocks
        LOAD_BLOCKS

        // load obmcs
        LOAD_OBMCS

        // steps 0 1
        STEPS_0_1

        FINAL_STEP_SCALAR

    }
}

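/* vec_mergel counterpart of STEPS_0_1: handles pixels 8..15 of the row and
 * fills v[2] and v[3], so it is only needed by the 16-pixel-wide variants. */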
#define STEPS_2_3\
            h1 = (vector unsigned short) vec_mergel(ob1, ob2);\
\
            h2 = (vector unsigned short) vec_mergel(ob3, ob4);\
\
            ih = (vector unsigned char) vec_mergeh(h1,h2);\
\
            l1 = (vector unsigned short) vec_mergel(b3, b2);\
\
            l2 = (vector unsigned short) vec_mergel(b1, b0);\
\
            ih1 = (vector unsigned char) vec_mergel(h1,h2);\
\
            il = (vector unsigned char) vec_mergeh(l1,l2);\
\
            v[2] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));\
\
            il1 = (vector unsigned char) vec_mergel(l1,l2);\
\
            v[3] = (vector signed int) vec_msum(ih1, il1, vec_splat_u32(0));


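/* 16-pixel-wide blocks, 32-wide OBMC window, scalar final step. */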
static void inner_add_yblock_bw_16_obmc_32_altivec(uint8_t *obmc,
                                             const int obmc_stride,
                                             uint8_t * * block, int b_w,
                                             int b_h, int src_x, int src_y,
                                             int src_stride, slice_buffer * sb,
                                             int add, uint8_t * dst8)
{
    int y, x;
    DWTELEM * dst;
    vector unsigned short h1, h2, l1, l2;
    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
    vector unsigned char b0,b1,b2,b3;
    vector unsigned char ob1,ob2,ob3,ob4;
    DECLARE_ALIGNED_16(int, vbuf[b_w]);
    vector signed int *v = (vector signed int *)vbuf, *d;

    for(y=0; y<b_h; y++){
        //FIXME ugly misuse of obmc_stride

        uint8_t *obmc1= obmc + y*obmc_stride;
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);

        dst = slice_buffer_get_line(sb, src_y + y);
        d = (vector signed int *)(dst + src_x);

        // load blocks
        LOAD_BLOCKS

        // load obmcs
        LOAD_OBMCS

        // steps 0 1 2 3
        STEPS_0_1

        STEPS_2_3

        FINAL_STEP_SCALAR

    }
}

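/* Vectorized final step for the aligned variants below: add the weighted
 * sums to the slice line, round and shift right by 8 (the FRAC_BITS shift of
 * the scalar version), build a mask for out-of-range values and clamp them
 * through vec_sel before storing to dst8; without add, just subtract the
 * sums from the line. */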
#define FINAL_STEP_VEC \
\
    if(add)\
        {\
            for(x=0; x<b_w/4; x++)\
            {\
                v[x] = vec_add(v[x], d[x]);\
                v[x] = vec_sra(vec_add(v[x],\
                                       vec_sl( vec_splat_s32(1),\
                                               vec_splat_u32(7))),\
                               vec_splat_u32(8));\
\
                mask = (vector bool int) vec_sl((vector signed int)\
                        vec_cmpeq(v[x],v[x]),vec_splat_u32(8));\
                mask = (vector bool int) vec_and(v[x],vec_nor(mask,mask));\
\
                mask = (vector bool int)\
                        vec_cmpeq((vector signed int)mask,\
                                  (vector signed int)vec_splat_u32(0));\
\
                /* cumulative 8+8+15 = 31 bit shift extracts the sign, */\
                /* matching ~(vbuf[x]>>31) in FINAL_STEP_SCALAR        */\
                vs = vec_sra(v[x],vec_splat_u32(8));\
                vs = vec_sra(vs,vec_splat_u32(8));\
                vs = vec_sra(vs,vec_splat_u32(15));\
\
                vs = vec_nor(vs,vs);\
\
                v[x]= vec_sel(v[x],vs,mask);\
            }\
\
            for(x=0; x<b_w; x++)\
                dst8[x + y*src_stride] = vbuf[x];\
\
        }\
    else\
            for(x=0; x<b_w/4; x++)\
                d[x] = vec_sub(d[x], v[x]);

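/* 8-pixel-wide blocks, 16-wide OBMC window; same as
 * inner_add_yblock_bw_8_obmc_16_altivec but with the vector final step,
 * which requires an aligned destination. */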
static void inner_add_yblock_a_bw_8_obmc_16_altivec(uint8_t *obmc,
                                             const int obmc_stride,
                                             uint8_t * * block, int b_w,
                                             int b_h, int src_x, int src_y,
                                             int src_stride, slice_buffer * sb,
                                             int add, uint8_t * dst8)
{
    int y, x;
    DWTELEM * dst;
    vector bool int mask;
    vector signed int vs;
    vector unsigned short h1, h2, l1, l2;
    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
    vector unsigned char b0,b1,b2,b3;
    vector unsigned char ob1,ob2,ob3,ob4;

    DECLARE_ALIGNED_16(int, vbuf[16]);
    vector signed int *v = (vector signed int *)vbuf, *d;

    for(y=0; y<b_h; y++){
        //FIXME ugly misuse of obmc_stride

        uint8_t *obmc1= obmc + y*obmc_stride;
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);

        dst = slice_buffer_get_line(sb, src_y + y);
        d = (vector signed int *)(dst + src_x);

//FIXME i could avoid some loads!

        // load blocks
        LOAD_BLOCKS

        // load obmcs
        LOAD_OBMCS

        // steps 0 1
        STEPS_0_1

        FINAL_STEP_VEC

    }
}

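/* 16-pixel-wide blocks, 32-wide OBMC window, vector final step. */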
static void inner_add_yblock_a_bw_16_obmc_32_altivec(uint8_t *obmc,
                                             const int obmc_stride,
                                             uint8_t * * block, int b_w,
                                             int b_h, int src_x, int src_y,
                                             int src_stride, slice_buffer * sb,
                                             int add, uint8_t * dst8)
{
    int y, x;
    DWTELEM * dst;
    vector bool int mask;
    vector signed int vs;
    vector unsigned short h1, h2, l1, l2;
    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
    vector unsigned char b0,b1,b2,b3;
    vector unsigned char ob1,ob2,ob3,ob4;
    DECLARE_ALIGNED_16(int, vbuf[b_w]);
    vector signed int *v = (vector signed int *)vbuf, *d;

    for(y=0; y<b_h; y++){
        //FIXME ugly misuse of obmc_stride

        uint8_t *obmc1= obmc + y*obmc_stride;
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);

        dst = slice_buffer_get_line(sb, src_y + y);
        d = (vector signed int *)(dst + src_x);

        // load blocks
        LOAD_BLOCKS

        // load obmcs
        LOAD_OBMCS

        // steps 0 1 2 3
        STEPS_0_1

        STEPS_2_3

        FINAL_STEP_VEC

    }
}


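/* Entry point: dispatch on block width and on src_x alignment (when
 * src_x & 15 is non-zero the variants with the scalar final step are used);
 * widths other than 8 and 16 fall back to the generic C
 * ff_snow_inner_add_yblock. */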
void ff_snow_inner_add_yblock_altivec(uint8_t *obmc, const int obmc_stride,
                                      uint8_t * * block, int b_w, int b_h,
                                      int src_x, int src_y, int src_stride,
                                      slice_buffer * sb, int add,
                                      uint8_t * dst8)
{
    if (src_x&15) {
        if (b_w == 16)
            inner_add_yblock_bw_16_obmc_32_altivec(obmc, obmc_stride, block,
                                                   b_w, b_h, src_x, src_y,
                                                   src_stride, sb, add, dst8);
        else if (b_w == 8)
            inner_add_yblock_bw_8_obmc_16_altivec(obmc, obmc_stride, block,
                                                  b_w, b_h, src_x, src_y,
                                                  src_stride, sb, add, dst8);
        else
            ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,
                                     src_y, src_stride, sb, add, dst8);
    } else {
        if (b_w == 16)
            inner_add_yblock_a_bw_16_obmc_32_altivec(obmc, obmc_stride, block,
                                                     b_w, b_h, src_x, src_y,
                                                     src_stride, sb, add, dst8);
        else if (b_w == 8)
            inner_add_yblock_a_bw_8_obmc_16_altivec(obmc, obmc_stride, block,
                                                    b_w, b_h, src_x, src_y,
                                                    src_stride, sb, add, dst8);
        else
            ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,
                                     src_y, src_stride, sb, add, dst8);
    }
}


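/* Note: none of the function pointers are set at the moment (#if 0),
 * presumably until the routines above are brought back in line with the
 * current snow DWT element types. */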
void snow_init_altivec(DSPContext* c, AVCodecContext *avctx)
{
#if 0
    c->horizontal_compose97i = ff_snow_horizontal_compose97i_altivec;
    c->vertical_compose97i = ff_snow_vertical_compose97i_altivec;
    c->inner_add_yblock = ff_snow_inner_add_yblock_altivec;
#endif
}
