1/*
2 * DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7 *
8 * This file is part of Libav.
9 *
10 * Libav is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
14 *
15 * Libav is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18 * Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with Libav; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25/**
26 * @file
27 * DSP utils
28 */
29
30#include "libavutil/imgutils.h"
31#include "avcodec.h"
32#include "dsputil.h"
33#include "simple_idct.h"
34#include "faandct.h"
35#include "faanidct.h"
36#include "mathops.h"
37#include "mpegvideo.h"
38#include "config.h"
39#include "ac3dec.h"
40#include "vorbis.h"
41#include "png.h"
42
43uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
44uint32_t ff_squareTbl[512] = {0, };
45
46#define BIT_DEPTH 9
47#include "dsputil_template.c"
48#undef BIT_DEPTH
49
50#define BIT_DEPTH 10
51#include "dsputil_template.c"
52#undef BIT_DEPTH
53
54#define BIT_DEPTH 8
55#include "dsputil_template.c"
56
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f, depending on the width of unsigned long
58#define pb_7f (~0UL/255 * 0x7f)
59#define pb_80 (~0UL/255 * 0x80)
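/* ~0UL/255 evaluates to 0x0101...01 for the width of unsigned long
 * (0x01010101 on 32-bit, 0x0101010101010101 on 64-bit), so pb_7f and pb_80
 * replicate the bytes 0x7f resp. 0x80 across a whole word.  They are the
 * usual SWAR masks for handling several packed bytes per word-sized load.
 * Illustrative identity only (not a quote of code in this file): a byte-wise
 * add with no carries leaking between lanes can be written as
 *
 *     sum = ((a & pb_7f) + (b & pb_7f)) ^ ((a ^ b) & pb_80);
 *
 * which is the kind of expression these masks are intended for. */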
60
61const uint8_t ff_zigzag_direct[64] = {
62    0,   1,  8, 16,  9,  2,  3, 10,
63    17, 24, 32, 25, 18, 11,  4,  5,
64    12, 19, 26, 33, 40, 48, 41, 34,
65    27, 20, 13,  6,  7, 14, 21, 28,
66    35, 42, 49, 56, 57, 50, 43, 36,
67    29, 22, 15, 23, 30, 37, 44, 51,
68    58, 59, 52, 45, 38, 31, 39, 46,
69    53, 60, 61, 54, 47, 55, 62, 63
70};
71
72/* Specific zigzag scan for 248 idct. NOTE that unlike the
73   specification, we interleave the fields */
74const uint8_t ff_zigzag248_direct[64] = {
75     0,  8,  1,  9, 16, 24,  2, 10,
76    17, 25, 32, 40, 48, 56, 33, 41,
77    18, 26,  3, 11,  4, 12, 19, 27,
78    34, 42, 49, 57, 50, 58, 35, 43,
79    20, 28,  5, 13,  6, 14, 21, 29,
80    36, 44, 51, 59, 52, 60, 37, 45,
81    22, 30,  7, 15, 23, 31, 38, 46,
82    53, 61, 54, 62, 39, 47, 55, 63,
83};
84
/* not permuted inverse zigzag_direct + 1 for the MMX quantizer */
86DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
87
88const uint8_t ff_alternate_horizontal_scan[64] = {
89    0,  1,   2,  3,  8,  9, 16, 17,
90    10, 11,  4,  5,  6,  7, 15, 14,
91    13, 12, 19, 18, 24, 25, 32, 33,
92    26, 27, 20, 21, 22, 23, 28, 29,
93    30, 31, 34, 35, 40, 41, 48, 49,
94    42, 43, 36, 37, 38, 39, 44, 45,
95    46, 47, 50, 51, 56, 57, 58, 59,
96    52, 53, 54, 55, 60, 61, 62, 63,
97};
98
99const uint8_t ff_alternate_vertical_scan[64] = {
100    0,  8,  16, 24,  1,  9,  2, 10,
101    17, 25, 32, 40, 48, 56, 57, 49,
102    41, 33, 26, 18,  3, 11,  4, 12,
103    19, 27, 34, 42, 50, 58, 35, 43,
104    51, 59, 20, 28,  5, 13,  6, 14,
105    21, 29, 36, 44, 52, 60, 37, 45,
106    53, 61, 22, 30,  7, 15, 23, 31,
107    38, 46, 54, 62, 39, 47, 55, 63,
108};
109
110/* Input permutation for the simple_idct_mmx */
111static const uint8_t simple_mmx_permutation[64]={
112        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
113        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
114        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
115        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
116        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
117        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
118        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
119        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
120};
121
122static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
123
124void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
125    int i;
126    int end;
127
128    st->scantable= src_scantable;
129
130    for(i=0; i<64; i++){
131        int j;
132        j = src_scantable[i];
133        st->permutated[i] = permutation[j];
134#if ARCH_PPC
135        st->inverse[j] = i;
136#endif
137    }
138
139    end=-1;
140    for(i=0; i<64; i++){
141        int j;
142        j = st->permutated[i];
143        if(j>end) end=j;
144        st->raster_end[i]= end;
145    }
146}
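/* st->permutated[i] is the scan order rewritten in terms of the (possibly
 * permuted) coefficient layout the active IDCT expects, and st->raster_end[i]
 * is the highest permuted index among the first i+1 scan positions, which
 * tells callers how much of the block is populated once i coefficients have
 * been decoded.  Illustrative initialization as done by a typical decoder
 * (exact context struct and call site vary):
 *
 *     ff_init_scantable(s->dsp.idct_permutation, &s->intra_scantable,
 *                       ff_zigzag_direct);
 */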
147
148void ff_init_scantable_permutation(uint8_t *idct_permutation,
149                                   int idct_permutation_type)
150{
151    int i;
152
153    switch(idct_permutation_type){
154    case FF_NO_IDCT_PERM:
155        for(i=0; i<64; i++)
156            idct_permutation[i]= i;
157        break;
158    case FF_LIBMPEG2_IDCT_PERM:
159        for(i=0; i<64; i++)
160            idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
161        break;
162    case FF_SIMPLE_IDCT_PERM:
163        for(i=0; i<64; i++)
164            idct_permutation[i]= simple_mmx_permutation[i];
165        break;
166    case FF_TRANSPOSE_IDCT_PERM:
167        for(i=0; i<64; i++)
168            idct_permutation[i]= ((i&7)<<3) | (i>>3);
169        break;
170    case FF_PARTTRANS_IDCT_PERM:
171        for(i=0; i<64; i++)
172            idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
173        break;
174    case FF_SSE2_IDCT_PERM:
175        for(i=0; i<64; i++)
176            idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
177        break;
178    default:
179        av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
180    }
181}
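/* Example of how one of these permutations acts (illustrative): with
 * FF_TRANSPOSE_IDCT_PERM, ((i & 7) << 3) | (i >> 3) swaps the row and column
 * of coefficient i = 8*row + col, i.e. the block is stored transposed so that
 * an IDCT which prefers the other orientation can read it linearly.  Decoders
 * only ever go through idct_permutation[]; they never need to know which
 * scheme the selected IDCT implementation picked. */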
182
183static int pix_sum_c(uint8_t * pix, int line_size)
184{
185    int s, i, j;
186
187    s = 0;
188    for (i = 0; i < 16; i++) {
189        for (j = 0; j < 16; j += 8) {
190            s += pix[0];
191            s += pix[1];
192            s += pix[2];
193            s += pix[3];
194            s += pix[4];
195            s += pix[5];
196            s += pix[6];
197            s += pix[7];
198            pix += 8;
199        }
200        pix += line_size - 16;
201    }
202    return s;
203}
204
205static int pix_norm1_c(uint8_t * pix, int line_size)
206{
207    int s, i, j;
208    uint32_t *sq = ff_squareTbl + 256;
209
210    s = 0;
211    for (i = 0; i < 16; i++) {
212        for (j = 0; j < 16; j += 8) {
213#if 0
214            s += sq[pix[0]];
215            s += sq[pix[1]];
216            s += sq[pix[2]];
217            s += sq[pix[3]];
218            s += sq[pix[4]];
219            s += sq[pix[5]];
220            s += sq[pix[6]];
221            s += sq[pix[7]];
222#else
223#if HAVE_FAST_64BIT
224            register uint64_t x=*(uint64_t*)pix;
225            s += sq[x&0xff];
226            s += sq[(x>>8)&0xff];
227            s += sq[(x>>16)&0xff];
228            s += sq[(x>>24)&0xff];
229            s += sq[(x>>32)&0xff];
230            s += sq[(x>>40)&0xff];
231            s += sq[(x>>48)&0xff];
232            s += sq[(x>>56)&0xff];
233#else
234            register uint32_t x=*(uint32_t*)pix;
235            s += sq[x&0xff];
236            s += sq[(x>>8)&0xff];
237            s += sq[(x>>16)&0xff];
238            s += sq[(x>>24)&0xff];
239            x=*(uint32_t*)(pix+4);
240            s += sq[x&0xff];
241            s += sq[(x>>8)&0xff];
242            s += sq[(x>>16)&0xff];
243            s += sq[(x>>24)&0xff];
244#endif
245#endif
246            pix += 8;
247        }
248        pix += line_size - 16;
249    }
250    return s;
251}
252
253static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
254    int i;
255
256    for(i=0; i+8<=w; i+=8){
257        dst[i+0]= av_bswap32(src[i+0]);
258        dst[i+1]= av_bswap32(src[i+1]);
259        dst[i+2]= av_bswap32(src[i+2]);
260        dst[i+3]= av_bswap32(src[i+3]);
261        dst[i+4]= av_bswap32(src[i+4]);
262        dst[i+5]= av_bswap32(src[i+5]);
263        dst[i+6]= av_bswap32(src[i+6]);
264        dst[i+7]= av_bswap32(src[i+7]);
265    }
266    for(;i<w; i++){
267        dst[i+0]= av_bswap32(src[i+0]);
268    }
269}
270
271static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
272{
273    while (len--)
274        *dst++ = av_bswap16(*src++);
275}
276
277static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
278{
279    int s, i;
280    uint32_t *sq = ff_squareTbl + 256;
281
282    s = 0;
283    for (i = 0; i < h; i++) {
284        s += sq[pix1[0] - pix2[0]];
285        s += sq[pix1[1] - pix2[1]];
286        s += sq[pix1[2] - pix2[2]];
287        s += sq[pix1[3] - pix2[3]];
288        pix1 += line_size;
289        pix2 += line_size;
290    }
291    return s;
292}
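/* ff_squareTbl is accessed through sq = ff_squareTbl + 256 so that the signed
 * difference pix1[i] - pix2[i], which lies in [-255, 255], can be used as an
 * index directly; the DSP init code fills the table so that sq[d] == d*d.
 * The wider SSE variants below rely on the same trick. */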
293
294static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
295{
296    int s, i;
297    uint32_t *sq = ff_squareTbl + 256;
298
299    s = 0;
300    for (i = 0; i < h; i++) {
301        s += sq[pix1[0] - pix2[0]];
302        s += sq[pix1[1] - pix2[1]];
303        s += sq[pix1[2] - pix2[2]];
304        s += sq[pix1[3] - pix2[3]];
305        s += sq[pix1[4] - pix2[4]];
306        s += sq[pix1[5] - pix2[5]];
307        s += sq[pix1[6] - pix2[6]];
308        s += sq[pix1[7] - pix2[7]];
309        pix1 += line_size;
310        pix2 += line_size;
311    }
312    return s;
313}
314
315static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
316{
317    int s, i;
318    uint32_t *sq = ff_squareTbl + 256;
319
320    s = 0;
321    for (i = 0; i < h; i++) {
322        s += sq[pix1[ 0] - pix2[ 0]];
323        s += sq[pix1[ 1] - pix2[ 1]];
324        s += sq[pix1[ 2] - pix2[ 2]];
325        s += sq[pix1[ 3] - pix2[ 3]];
326        s += sq[pix1[ 4] - pix2[ 4]];
327        s += sq[pix1[ 5] - pix2[ 5]];
328        s += sq[pix1[ 6] - pix2[ 6]];
329        s += sq[pix1[ 7] - pix2[ 7]];
330        s += sq[pix1[ 8] - pix2[ 8]];
331        s += sq[pix1[ 9] - pix2[ 9]];
332        s += sq[pix1[10] - pix2[10]];
333        s += sq[pix1[11] - pix2[11]];
334        s += sq[pix1[12] - pix2[12]];
335        s += sq[pix1[13] - pix2[13]];
336        s += sq[pix1[14] - pix2[14]];
337        s += sq[pix1[15] - pix2[15]];
338
339        pix1 += line_size;
340        pix2 += line_size;
341    }
342    return s;
343}
344
345static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
346                          const uint8_t *s2, int stride){
347    int i;
348
349    /* read the pixels */
350    for(i=0;i<8;i++) {
351        block[0] = s1[0] - s2[0];
352        block[1] = s1[1] - s2[1];
353        block[2] = s1[2] - s2[2];
354        block[3] = s1[3] - s2[3];
355        block[4] = s1[4] - s2[4];
356        block[5] = s1[5] - s2[5];
357        block[6] = s1[6] - s2[6];
358        block[7] = s1[7] - s2[7];
359        s1 += stride;
360        s2 += stride;
361        block += 8;
362    }
363}
364
365
366void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
367                             int line_size)
368{
369    int i;
370
    /* write the clamped block coefficients to the pixel buffer */
372    for(i=0;i<8;i++) {
373        pixels[0] = av_clip_uint8(block[0]);
374        pixels[1] = av_clip_uint8(block[1]);
375        pixels[2] = av_clip_uint8(block[2]);
376        pixels[3] = av_clip_uint8(block[3]);
377        pixels[4] = av_clip_uint8(block[4]);
378        pixels[5] = av_clip_uint8(block[5]);
379        pixels[6] = av_clip_uint8(block[6]);
380        pixels[7] = av_clip_uint8(block[7]);
381
382        pixels += line_size;
383        block += 8;
384    }
385}
386
387static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
388                                 int line_size)
389{
390    int i;
391
    /* write the clamped block coefficients to the pixel buffer */
393    for(i=0;i<4;i++) {
394        pixels[0] = av_clip_uint8(block[0]);
395        pixels[1] = av_clip_uint8(block[1]);
396        pixels[2] = av_clip_uint8(block[2]);
397        pixels[3] = av_clip_uint8(block[3]);
398
399        pixels += line_size;
400        block += 8;
401    }
402}
403
404static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
405                                 int line_size)
406{
407    int i;
408
    /* write the clamped block coefficients to the pixel buffer */
410    for(i=0;i<2;i++) {
411        pixels[0] = av_clip_uint8(block[0]);
412        pixels[1] = av_clip_uint8(block[1]);
413
414        pixels += line_size;
415        block += 8;
416    }
417}
418
419void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
420                                    uint8_t *restrict pixels,
421                                    int line_size)
422{
423    int i, j;
424
425    for (i = 0; i < 8; i++) {
426        for (j = 0; j < 8; j++) {
427            if (*block < -128)
428                *pixels = 0;
429            else if (*block > 127)
430                *pixels = 255;
431            else
432                *pixels = (uint8_t)(*block + 128);
433            block++;
434            pixels++;
435        }
436        pixels += (line_size - 8);
437    }
438}
439
440void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
441                             int line_size)
442{
443    int i;
444
    /* add the clamped block coefficients to the existing pixels */
446    for(i=0;i<8;i++) {
447        pixels[0] = av_clip_uint8(pixels[0] + block[0]);
448        pixels[1] = av_clip_uint8(pixels[1] + block[1]);
449        pixels[2] = av_clip_uint8(pixels[2] + block[2]);
450        pixels[3] = av_clip_uint8(pixels[3] + block[3]);
451        pixels[4] = av_clip_uint8(pixels[4] + block[4]);
452        pixels[5] = av_clip_uint8(pixels[5] + block[5]);
453        pixels[6] = av_clip_uint8(pixels[6] + block[6]);
454        pixels[7] = av_clip_uint8(pixels[7] + block[7]);
455        pixels += line_size;
456        block += 8;
457    }
458}
459
460static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
461                          int line_size)
462{
463    int i;
464
    /* add the clamped block coefficients to the existing pixels */
466    for(i=0;i<4;i++) {
467        pixels[0] = av_clip_uint8(pixels[0] + block[0]);
468        pixels[1] = av_clip_uint8(pixels[1] + block[1]);
469        pixels[2] = av_clip_uint8(pixels[2] + block[2]);
470        pixels[3] = av_clip_uint8(pixels[3] + block[3]);
471        pixels += line_size;
472        block += 8;
473    }
474}
475
476static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
477                          int line_size)
478{
479    int i;
480
    /* add the clamped block coefficients to the existing pixels */
482    for(i=0;i<2;i++) {
483        pixels[0] = av_clip_uint8(pixels[0] + block[0]);
484        pixels[1] = av_clip_uint8(pixels[1] + block[1]);
485        pixels += line_size;
486        block += 8;
487    }
488}
489
490static int sum_abs_dctelem_c(DCTELEM *block)
491{
492    int sum=0, i;
493    for(i=0; i<64; i++)
494        sum+= FFABS(block[i]);
495    return sum;
496}
497
498static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
499{
500    int i;
501
502    for (i = 0; i < h; i++) {
503        memset(block, value, 16);
504        block += line_size;
505    }
506}
507
508static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
509{
510    int i;
511
512    for (i = 0; i < h; i++) {
513        memset(block, value, 8);
514        block += line_size;
515    }
516}
517
#define avg2(a, b)       (((a) + (b) + 1) >> 1)
#define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)
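/* avg2() is the usual half-pel average rounding halves up; avg4() averages
 * four samples with a +2 bias, matching the rounding used for MPEG-style
 * half-pel interpolation.  They are used by the pix_abs*_x2/_y2/_xy2 SAD
 * routines further down in this file. */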
520
521static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
522{
523    const int A=(16-x16)*(16-y16);
524    const int B=(   x16)*(16-y16);
525    const int C=(16-x16)*(   y16);
526    const int D=(   x16)*(   y16);
527    int i;
528
529    for(i=0; i<h; i++)
530    {
531        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
532        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
533        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
534        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
535        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
536        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
537        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
538        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
539        dst+= stride;
540        src+= stride;
541    }
542}
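/* gmc1_c() performs bilinear motion compensation with 1/16-pel accuracy for
 * the one-warp-point GMC case: x16/y16 are the fractional offsets in
 * sixteenths, so the weights satisfy A + B + C + D = 16*16 = 256 and adding
 * the rounder followed by >> 8 normalizes the blend.  The caller supplies the
 * rounding constant (128 gives round-to-nearest; a smaller value is used for
 * the "no rounding" mode). */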
543
544void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
545                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
546{
547    int y, vx, vy;
548    const int s= 1<<shift;
549
550    width--;
551    height--;
552
553    for(y=0; y<h; y++){
554        int x;
555
556        vx= ox;
557        vy= oy;
558        for(x=0; x<8; x++){ //XXX FIXME optimize
559            int src_x, src_y, frac_x, frac_y, index;
560
561            src_x= vx>>16;
562            src_y= vy>>16;
563            frac_x= src_x&(s-1);
564            frac_y= src_y&(s-1);
565            src_x>>=shift;
566            src_y>>=shift;
567
568            if((unsigned)src_x < width){
569                if((unsigned)src_y < height){
570                    index= src_x + src_y*stride;
571                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
572                                           + src[index       +1]*   frac_x )*(s-frac_y)
573                                        + (  src[index+stride  ]*(s-frac_x)
574                                           + src[index+stride+1]*   frac_x )*   frac_y
575                                        + r)>>(shift*2);
576                }else{
577                    index= src_x + av_clip(src_y, 0, height)*stride;
578                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
579                                          + src[index       +1]*   frac_x )*s
580                                        + r)>>(shift*2);
581                }
582            }else{
583                if((unsigned)src_y < height){
584                    index= av_clip(src_x, 0, width) + src_y*stride;
585                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
586                                           + src[index+stride  ]*   frac_y )*s
587                                        + r)>>(shift*2);
588                }else{
589                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
590                    dst[y*stride + x]=    src[index         ];
591                }
592            }
593
594            vx+= dxx;
595            vy+= dyx;
596        }
597        ox += dxy;
598        oy += dyy;
599    }
600}
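/* ff_gmc_c() is the general (affine) global motion compensation path: the
 * source coordinate of each destination pixel is tracked in fixed point, with
 * (dxx, dyx) added per output column and (dxy, dyy) per output row.
 * vx >> 16 gives the position on the 1/(1 << shift) sub-pel grid, whose low
 * bits provide the bilinear fractions frac_x/frac_y; coordinates outside the
 * picture are clamped to the nearest edge sample, and r together with the
 * final >> (shift*2) performs the rounding and normalization of the bilinear
 * blend. */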
601
602static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
603    switch(width){
604    case 2: put_pixels2_8_c (dst, src, stride, height); break;
605    case 4: put_pixels4_8_c (dst, src, stride, height); break;
606    case 8: put_pixels8_8_c (dst, src, stride, height); break;
607    case 16:put_pixels16_8_c(dst, src, stride, height); break;
608    }
609}
610
611static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
612    int i,j;
613    for (i=0; i < height; i++) {
614      for (j=0; j < width; j++) {
615        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
616      }
617      src += stride;
618      dst += stride;
619    }
620}
621
622static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
623    int i,j;
624    for (i=0; i < height; i++) {
625      for (j=0; j < width; j++) {
626        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
627      }
628      src += stride;
629      dst += stride;
630    }
631}
632
633static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
634    int i,j;
635    for (i=0; i < height; i++) {
636      for (j=0; j < width; j++) {
637        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
638      }
639      src += stride;
640      dst += stride;
641    }
642}
643
644static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
645    int i,j;
646    for (i=0; i < height; i++) {
647      for (j=0; j < width; j++) {
648        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
649      }
650      src += stride;
651      dst += stride;
652    }
653}
654
655static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
656    int i,j;
657    for (i=0; i < height; i++) {
658      for (j=0; j < width; j++) {
659        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
660      }
661      src += stride;
662      dst += stride;
663    }
664}
665
666static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
667    int i,j;
668    for (i=0; i < height; i++) {
669      for (j=0; j < width; j++) {
670        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
671      }
672      src += stride;
673      dst += stride;
674    }
675}
676
677static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
678    int i,j;
679    for (i=0; i < height; i++) {
680      for (j=0; j < width; j++) {
681        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
682      }
683      src += stride;
684      dst += stride;
685    }
686}
687
688static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
689    int i,j;
690    for (i=0; i < height; i++) {
691      for (j=0; j < width; j++) {
692        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
693      }
694      src += stride;
695      dst += stride;
696    }
697}
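/* The tpel ("third-pel") helpers above implement the 1/3-pel interpolation
 * used by SVQ3.  The magic constants are fixed-point reciprocals:
 * 683 ~= 2^11 / 3, so (683 * (2*a + b + 1)) >> 11 approximates (2*a + b) / 3,
 * and 2731 ~= 2^15 / 12 for the two-dimensional cases, whose weights sum to
 * 12.  The avg_ variants below combine the same prediction with the existing
 * destination pixels, rounding up. */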
698
699static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
700    switch(width){
701    case 2: avg_pixels2_8_c (dst, src, stride, height); break;
702    case 4: avg_pixels4_8_c (dst, src, stride, height); break;
703    case 8: avg_pixels8_8_c (dst, src, stride, height); break;
704    case 16:avg_pixels16_8_c(dst, src, stride, height); break;
705    }
706}
707
708static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
709    int i,j;
710    for (i=0; i < height; i++) {
711      for (j=0; j < width; j++) {
712        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
713      }
714      src += stride;
715      dst += stride;
716    }
717}
718
719static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
720    int i,j;
721    for (i=0; i < height; i++) {
722      for (j=0; j < width; j++) {
723        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
724      }
725      src += stride;
726      dst += stride;
727    }
728}
729
730static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
731    int i,j;
732    for (i=0; i < height; i++) {
733      for (j=0; j < width; j++) {
734        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
735      }
736      src += stride;
737      dst += stride;
738    }
739}
740
741static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
742    int i,j;
743    for (i=0; i < height; i++) {
744      for (j=0; j < width; j++) {
745        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
746      }
747      src += stride;
748      dst += stride;
749    }
750}
751
752static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
753    int i,j;
754    for (i=0; i < height; i++) {
755      for (j=0; j < width; j++) {
756        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
757      }
758      src += stride;
759      dst += stride;
760    }
761}
762
763static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
764    int i,j;
765    for (i=0; i < height; i++) {
766      for (j=0; j < width; j++) {
767        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
768      }
769      src += stride;
770      dst += stride;
771    }
772}
773
774static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
775    int i,j;
776    for (i=0; i < height; i++) {
777      for (j=0; j < width; j++) {
778        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
779      }
780      src += stride;
781      dst += stride;
782    }
783}
784
785static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
786    int i,j;
787    for (i=0; i < height; i++) {
788      for (j=0; j < width; j++) {
789        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
790      }
791      src += stride;
792      dst += stride;
793    }
794}
795
796#define QPEL_MC(r, OPNAME, RND, OP) \
797static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
798    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
799    int i;\
800    for(i=0; i<h; i++)\
801    {\
802        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
803        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
804        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
805        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
806        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
807        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
808        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
809        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
810        dst+=dstStride;\
811        src+=srcStride;\
812    }\
813}\
814\
815static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
816    const int w=8;\
817    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
818    int i;\
819    for(i=0; i<w; i++)\
820    {\
821        const int src0= src[0*srcStride];\
822        const int src1= src[1*srcStride];\
823        const int src2= src[2*srcStride];\
824        const int src3= src[3*srcStride];\
825        const int src4= src[4*srcStride];\
826        const int src5= src[5*srcStride];\
827        const int src6= src[6*srcStride];\
828        const int src7= src[7*srcStride];\
829        const int src8= src[8*srcStride];\
830        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
831        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
832        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
833        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
834        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
835        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
836        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
837        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
838        dst++;\
839        src++;\
840    }\
841}\
842\
843static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
844    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
845    int i;\
846    \
847    for(i=0; i<h; i++)\
848    {\
849        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
850        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
851        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
852        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
853        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
854        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
855        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
856        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
857        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
858        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
859        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
860        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
861        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
862        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
863        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
864        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
865        dst+=dstStride;\
866        src+=srcStride;\
867    }\
868}\
869\
870static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
871    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
872    int i;\
873    const int w=16;\
874    for(i=0; i<w; i++)\
875    {\
876        const int src0= src[0*srcStride];\
877        const int src1= src[1*srcStride];\
878        const int src2= src[2*srcStride];\
879        const int src3= src[3*srcStride];\
880        const int src4= src[4*srcStride];\
881        const int src5= src[5*srcStride];\
882        const int src6= src[6*srcStride];\
883        const int src7= src[7*srcStride];\
884        const int src8= src[8*srcStride];\
885        const int src9= src[9*srcStride];\
886        const int src10= src[10*srcStride];\
887        const int src11= src[11*srcStride];\
888        const int src12= src[12*srcStride];\
889        const int src13= src[13*srcStride];\
890        const int src14= src[14*srcStride];\
891        const int src15= src[15*srcStride];\
892        const int src16= src[16*srcStride];\
893        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
894        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
895        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
896        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
897        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
898        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
899        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
900        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
901        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
902        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
903        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
904        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
905        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
906        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
907        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
908        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
909        dst++;\
910        src++;\
911    }\
912}\
913\
914static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
915    uint8_t half[64];\
916    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
917    OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
918}\
919\
920static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
921    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
922}\
923\
924static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
925    uint8_t half[64];\
926    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
927    OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
928}\
929\
930static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
931    uint8_t full[16*9];\
932    uint8_t half[64];\
933    copy_block9(full, src, 16, stride, 9);\
934    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
935    OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
936}\
937\
938static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
939    uint8_t full[16*9];\
940    copy_block9(full, src, 16, stride, 9);\
941    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
942}\
943\
944static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
945    uint8_t full[16*9];\
946    uint8_t half[64];\
947    copy_block9(full, src, 16, stride, 9);\
948    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
949    OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
950}\
951void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
952    uint8_t full[16*9];\
953    uint8_t halfH[72];\
954    uint8_t halfV[64];\
955    uint8_t halfHV[64];\
956    copy_block9(full, src, 16, stride, 9);\
957    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
958    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
959    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
960    OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
961}\
962static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
963    uint8_t full[16*9];\
964    uint8_t halfH[72];\
965    uint8_t halfHV[64];\
966    copy_block9(full, src, 16, stride, 9);\
967    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
968    put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
969    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
970    OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
971}\
972void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
973    uint8_t full[16*9];\
974    uint8_t halfH[72];\
975    uint8_t halfV[64];\
976    uint8_t halfHV[64];\
977    copy_block9(full, src, 16, stride, 9);\
978    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
979    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
980    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
981    OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
982}\
983static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
984    uint8_t full[16*9];\
985    uint8_t halfH[72];\
986    uint8_t halfHV[64];\
987    copy_block9(full, src, 16, stride, 9);\
988    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
989    put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
990    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
991    OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
992}\
993void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
994    uint8_t full[16*9];\
995    uint8_t halfH[72];\
996    uint8_t halfV[64];\
997    uint8_t halfHV[64];\
998    copy_block9(full, src, 16, stride, 9);\
999    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1000    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1001    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1002    OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1003}\
1004static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1005    uint8_t full[16*9];\
1006    uint8_t halfH[72];\
1007    uint8_t halfHV[64];\
1008    copy_block9(full, src, 16, stride, 9);\
1009    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1010    put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1011    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1012    OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1013}\
1014void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1015    uint8_t full[16*9];\
1016    uint8_t halfH[72];\
1017    uint8_t halfV[64];\
1018    uint8_t halfHV[64];\
1019    copy_block9(full, src, 16, stride, 9);\
1020    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1021    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1022    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1023    OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1024}\
1025static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1026    uint8_t full[16*9];\
1027    uint8_t halfH[72];\
1028    uint8_t halfHV[64];\
1029    copy_block9(full, src, 16, stride, 9);\
1030    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1031    put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1032    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1033    OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1034}\
1035static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1036    uint8_t halfH[72];\
1037    uint8_t halfHV[64];\
1038    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1039    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1040    OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1041}\
1042static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1043    uint8_t halfH[72];\
1044    uint8_t halfHV[64];\
1045    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1046    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1047    OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1048}\
1049void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1050    uint8_t full[16*9];\
1051    uint8_t halfH[72];\
1052    uint8_t halfV[64];\
1053    uint8_t halfHV[64];\
1054    copy_block9(full, src, 16, stride, 9);\
1055    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1056    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1057    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1058    OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1059}\
1060static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1061    uint8_t full[16*9];\
1062    uint8_t halfH[72];\
1063    copy_block9(full, src, 16, stride, 9);\
1064    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1065    put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1066    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1067}\
1068void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1069    uint8_t full[16*9];\
1070    uint8_t halfH[72];\
1071    uint8_t halfV[64];\
1072    uint8_t halfHV[64];\
1073    copy_block9(full, src, 16, stride, 9);\
1074    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1075    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1076    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1077    OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1078}\
1079static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1080    uint8_t full[16*9];\
1081    uint8_t halfH[72];\
1082    copy_block9(full, src, 16, stride, 9);\
1083    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1084    put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1085    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1086}\
1087static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1088    uint8_t halfH[72];\
1089    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1090    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1091}\
1092\
1093static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1094    uint8_t half[256];\
1095    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1096    OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1097}\
1098\
1099static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1100    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1101}\
1102\
1103static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1104    uint8_t half[256];\
1105    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1106    OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1107}\
1108\
1109static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1110    uint8_t full[24*17];\
1111    uint8_t half[256];\
1112    copy_block17(full, src, 24, stride, 17);\
1113    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1114    OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1115}\
1116\
1117static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1118    uint8_t full[24*17];\
1119    copy_block17(full, src, 24, stride, 17);\
1120    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1121}\
1122\
1123static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1124    uint8_t full[24*17];\
1125    uint8_t half[256];\
1126    copy_block17(full, src, 24, stride, 17);\
1127    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1128    OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1129}\
1130void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1131    uint8_t full[24*17];\
1132    uint8_t halfH[272];\
1133    uint8_t halfV[256];\
1134    uint8_t halfHV[256];\
1135    copy_block17(full, src, 24, stride, 17);\
1136    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1137    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1138    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1139    OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1140}\
1141static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1142    uint8_t full[24*17];\
1143    uint8_t halfH[272];\
1144    uint8_t halfHV[256];\
1145    copy_block17(full, src, 24, stride, 17);\
1146    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1147    put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1148    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1149    OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1150}\
1151void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1152    uint8_t full[24*17];\
1153    uint8_t halfH[272];\
1154    uint8_t halfV[256];\
1155    uint8_t halfHV[256];\
1156    copy_block17(full, src, 24, stride, 17);\
1157    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1158    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1159    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1160    OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1161}\
1162static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1163    uint8_t full[24*17];\
1164    uint8_t halfH[272];\
1165    uint8_t halfHV[256];\
1166    copy_block17(full, src, 24, stride, 17);\
1167    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1168    put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1169    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1170    OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1171}\
1172void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1173    uint8_t full[24*17];\
1174    uint8_t halfH[272];\
1175    uint8_t halfV[256];\
1176    uint8_t halfHV[256];\
1177    copy_block17(full, src, 24, stride, 17);\
1178    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1179    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1180    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1181    OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1182}\
1183static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1184    uint8_t full[24*17];\
1185    uint8_t halfH[272];\
1186    uint8_t halfHV[256];\
1187    copy_block17(full, src, 24, stride, 17);\
1188    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1189    put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1190    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1191    OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1192}\
1193void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1194    uint8_t full[24*17];\
1195    uint8_t halfH[272];\
1196    uint8_t halfV[256];\
1197    uint8_t halfHV[256];\
1198    copy_block17(full, src, 24, stride, 17);\
1199    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1200    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1201    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1202    OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1203}\
1204static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1205    uint8_t full[24*17];\
1206    uint8_t halfH[272];\
1207    uint8_t halfHV[256];\
1208    copy_block17(full, src, 24, stride, 17);\
1209    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1210    put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1211    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1212    OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1213}\
1214static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1215    uint8_t halfH[272];\
1216    uint8_t halfHV[256];\
1217    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1218    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1219    OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1220}\
1221static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1222    uint8_t halfH[272];\
1223    uint8_t halfHV[256];\
1224    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1225    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1226    OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1227}\
1228void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1229    uint8_t full[24*17];\
1230    uint8_t halfH[272];\
1231    uint8_t halfV[256];\
1232    uint8_t halfHV[256];\
1233    copy_block17(full, src, 24, stride, 17);\
1234    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1235    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1236    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1237    OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1238}\
1239static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1240    uint8_t full[24*17];\
1241    uint8_t halfH[272];\
1242    copy_block17(full, src, 24, stride, 17);\
1243    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1244    put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1245    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1246}\
1247void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1248    uint8_t full[24*17];\
1249    uint8_t halfH[272];\
1250    uint8_t halfV[256];\
1251    uint8_t halfHV[256];\
1252    copy_block17(full, src, 24, stride, 17);\
1253    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1254    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1255    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1256    OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1257}\
1258static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1259    uint8_t full[24*17];\
1260    uint8_t halfH[272];\
1261    copy_block17(full, src, 24, stride, 17);\
1262    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1263    put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1264    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1265}\
1266static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1267    uint8_t halfH[272];\
1268    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1269    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1270}
1271
1272#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1273#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1274#define op_put(a, b) a = cm[((b) + 16)>>5]
1275#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
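/* The qpel lowpass kernels above use the 8-tap MPEG-4 filter
 * {-1, 3, -6, 20, 20, -6, 3, -1}, whose coefficients sum to 32.  The OP
 * macros therefore add 16 (or 15 for the "no rounding" variants) and shift
 * right by 5 to normalize, then clamp through ff_cropTbl; cm points into the
 * middle of that table so negative intermediate values are legal indices. */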
1276
1277QPEL_MC(0, put_       , _       , op_put)
1278QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1279QPEL_MC(0, avg_       , _       , op_avg)
1280//QPEL_MC(1, avg_no_rnd , _       , op_avg)
1281#undef op_avg
1282#undef op_avg_no_rnd
1283#undef op_put
1284#undef op_put_no_rnd
1285
1286#define put_qpel8_mc00_c  ff_put_pixels8x8_c
1287#define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
1288#define put_qpel16_mc00_c ff_put_pixels16x16_c
1289#define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1290#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
1291#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_8_c
1292
1293static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1294    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1295    int i;
1296
1297    for(i=0; i<h; i++){
1298        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1299        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1300        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1301        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1302        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1303        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1304        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1305        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1306        dst+=dstStride;
1307        src+=srcStride;
1308    }
1309}
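/* The wmv2_mspel filters use the 4-tap kernel {-1, 9, 9, -1} with a +8 bias
 * and >> 4, i.e. a 1/16-normalized half-sample interpolator, again clamped
 * through ff_cropTbl.  The vertical counterpart below applies the same kernel
 * down each column. */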
1310
1311#if CONFIG_RV40_DECODER
1312void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1313    put_pixels16_xy2_8_c(dst, src, stride, 16);
1314}
1315void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1316    avg_pixels16_xy2_8_c(dst, src, stride, 16);
1317}
1318void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1319    put_pixels8_xy2_8_c(dst, src, stride, 8);
1320}
1321void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1322    avg_pixels8_xy2_8_c(dst, src, stride, 8);
1323}
1324#endif /* CONFIG_RV40_DECODER */
1325
1326static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1327    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1328    int i;
1329
1330    for(i=0; i<w; i++){
1331        const int src_1= src[ -srcStride];
1332        const int src0 = src[0          ];
1333        const int src1 = src[  srcStride];
1334        const int src2 = src[2*srcStride];
1335        const int src3 = src[3*srcStride];
1336        const int src4 = src[4*srcStride];
1337        const int src5 = src[5*srcStride];
1338        const int src6 = src[6*srcStride];
1339        const int src7 = src[7*srcStride];
1340        const int src8 = src[8*srcStride];
1341        const int src9 = src[9*srcStride];
1342        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1343        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
1344        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
1345        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
1346        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
1347        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
1348        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
1349        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
1350        src++;
1351        dst++;
1352    }
1353}
1354
1355static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
1356    uint8_t half[64];
1357    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1358    put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
1359}
1360
1361static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
1362    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
1363}
1364
1365static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
1366    uint8_t half[64];
1367    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1368    put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
1369}
1370
1371static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
1372    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
1373}
1374
1375static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
1376    uint8_t halfH[88];
1377    uint8_t halfV[64];
1378    uint8_t halfHV[64];
1379    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1380    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
1381    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1382    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1383}
1384static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
1385    uint8_t halfH[88];
1386    uint8_t halfV[64];
1387    uint8_t halfHV[64];
1388    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1389    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
1390    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1391    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1392}
1393static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
1394    uint8_t halfH[88];
1395    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1396    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
1397}
1398
1399static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1400    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1401    int x;
1402    const int strength= ff_h263_loop_filter_strength[qscale];
1403
1404    for(x=0; x<8; x++){
1405        int d1, d2, ad1;
1406        int p0= src[x-2*stride];
1407        int p1= src[x-1*stride];
1408        int p2= src[x+0*stride];
1409        int p3= src[x+1*stride];
1410        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1411
1412        if     (d<-2*strength) d1= 0;
1413        else if(d<-  strength) d1=-2*strength - d;
1414        else if(d<   strength) d1= d;
1415        else if(d< 2*strength) d1= 2*strength - d;
1416        else                   d1= 0;
1417
1418        p1 += d1;
1419        p2 -= d1;
1420        if(p1&256) p1= ~(p1>>31);
1421        if(p2&256) p2= ~(p2>>31);
1422
1423        src[x-1*stride] = p1;
1424        src[x+0*stride] = p2;
1425
1426        ad1= FFABS(d1)>>1;
1427
1428        d2= av_clip((p0-p3)/4, -ad1, ad1);
1429
1430        src[x-2*stride] = p0 - d2;
1431        src[x+  stride] = p3 + d2;
1432    }
1433    }
1434}
1435
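/* Same H.263 deblocking filter, applied across the vertical block edge to
 * the left of *src. */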
1436static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1437    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1438    int y;
1439    const int strength= ff_h263_loop_filter_strength[qscale];
1440
1441    for(y=0; y<8; y++){
1442        int d1, d2, ad1;
1443        int p0= src[y*stride-2];
1444        int p1= src[y*stride-1];
1445        int p2= src[y*stride+0];
1446        int p3= src[y*stride+1];
1447        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1448
1449        if     (d<-2*strength) d1= 0;
1450        else if(d<-  strength) d1=-2*strength - d;
1451        else if(d<   strength) d1= d;
1452        else if(d< 2*strength) d1= 2*strength - d;
1453        else                   d1= 0;
1454
1455        p1 += d1;
1456        p2 -= d1;
1457        if(p1&256) p1= ~(p1>>31);
1458        if(p2&256) p2= ~(p2>>31);
1459
1460        src[y*stride-1] = p1;
1461        src[y*stride+0] = p2;
1462
1463        ad1= FFABS(d1)>>1;
1464
1465        d2= av_clip((p0-p3)/4, -ad1, ad1);
1466
1467        src[y*stride-2] = p0 - d2;
1468        src[y*stride+1] = p3 + d2;
1469    }
1470    }
1471}
1472
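/* H.261 in-loop filter: a separable [1 2 1]/4 smoothing applied vertically
 * and then horizontally inside the 8x8 block; border rows and columns are
 * passed through unfiltered in the corresponding direction, with matching
 * scaling and rounding. */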
1473static void h261_loop_filter_c(uint8_t *src, int stride){
1474    int x,y,xy,yz;
1475    int temp[64];
1476
1477    for(x=0; x<8; x++){
1478        temp[x      ] = 4*src[x           ];
1479        temp[x + 7*8] = 4*src[x + 7*stride];
1480    }
1481    for(y=1; y<7; y++){
1482        for(x=0; x<8; x++){
1483            xy = y * stride + x;
1484            yz = y * 8 + x;
1485            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
1486        }
1487    }
1488
1489    for(y=0; y<8; y++){
1490        src[  y*stride] = (temp[  y*8] + 2)>>2;
1491        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
1492        for(x=1; x<7; x++){
1493            xy = y * stride + x;
1494            yz = y * 8 + x;
1495            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
1496        }
1497    }
1498}
1499
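/* Sum of absolute differences (SAD) over blocks 16 pixels wide; the _x2,
 * _y2 and _xy2 variants compare pix1 against the rounded half-pel average
 * of pix2 in the horizontal, vertical and diagonal direction. The 8-pixel
 * wide versions follow below. */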
1500static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1501{
1502    int s, i;
1503
1504    s = 0;
1505    for(i=0;i<h;i++) {
1506        s += abs(pix1[0] - pix2[0]);
1507        s += abs(pix1[1] - pix2[1]);
1508        s += abs(pix1[2] - pix2[2]);
1509        s += abs(pix1[3] - pix2[3]);
1510        s += abs(pix1[4] - pix2[4]);
1511        s += abs(pix1[5] - pix2[5]);
1512        s += abs(pix1[6] - pix2[6]);
1513        s += abs(pix1[7] - pix2[7]);
1514        s += abs(pix1[8] - pix2[8]);
1515        s += abs(pix1[9] - pix2[9]);
1516        s += abs(pix1[10] - pix2[10]);
1517        s += abs(pix1[11] - pix2[11]);
1518        s += abs(pix1[12] - pix2[12]);
1519        s += abs(pix1[13] - pix2[13]);
1520        s += abs(pix1[14] - pix2[14]);
1521        s += abs(pix1[15] - pix2[15]);
1522        pix1 += line_size;
1523        pix2 += line_size;
1524    }
1525    return s;
1526}
1527
1528static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1529{
1530    int s, i;
1531
1532    s = 0;
1533    for(i=0;i<h;i++) {
1534        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1535        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1536        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1537        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1538        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1539        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1540        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1541        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1542        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1543        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1544        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1545        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1546        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1547        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1548        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1549        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1550        pix1 += line_size;
1551        pix2 += line_size;
1552    }
1553    return s;
1554}
1555
1556static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1557{
1558    int s, i;
1559    uint8_t *pix3 = pix2 + line_size;
1560
1561    s = 0;
1562    for(i=0;i<h;i++) {
1563        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1564        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1565        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1566        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1567        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1568        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1569        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1570        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1571        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1572        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1573        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1574        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1575        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1576        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1577        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1578        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1579        pix1 += line_size;
1580        pix2 += line_size;
1581        pix3 += line_size;
1582    }
1583    return s;
1584}
1585
1586static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1587{
1588    int s, i;
1589    uint8_t *pix3 = pix2 + line_size;
1590
1591    s = 0;
1592    for(i=0;i<h;i++) {
1593        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1594        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1595        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1596        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1597        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1598        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1599        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1600        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1601        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1602        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1603        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1604        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1605        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1606        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1607        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1608        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1609        pix1 += line_size;
1610        pix2 += line_size;
1611        pix3 += line_size;
1612    }
1613    return s;
1614}
1615
1616static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1617{
1618    int s, i;
1619
1620    s = 0;
1621    for(i=0;i<h;i++) {
1622        s += abs(pix1[0] - pix2[0]);
1623        s += abs(pix1[1] - pix2[1]);
1624        s += abs(pix1[2] - pix2[2]);
1625        s += abs(pix1[3] - pix2[3]);
1626        s += abs(pix1[4] - pix2[4]);
1627        s += abs(pix1[5] - pix2[5]);
1628        s += abs(pix1[6] - pix2[6]);
1629        s += abs(pix1[7] - pix2[7]);
1630        pix1 += line_size;
1631        pix2 += line_size;
1632    }
1633    return s;
1634}
1635
1636static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1637{
1638    int s, i;
1639
1640    s = 0;
1641    for(i=0;i<h;i++) {
1642        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1643        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1644        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1645        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1646        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1647        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1648        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1649        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1650        pix1 += line_size;
1651        pix2 += line_size;
1652    }
1653    return s;
1654}
1655
1656static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1657{
1658    int s, i;
1659    uint8_t *pix3 = pix2 + line_size;
1660
1661    s = 0;
1662    for(i=0;i<h;i++) {
1663        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1664        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1665        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1666        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1667        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1668        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1669        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1670        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1671        pix1 += line_size;
1672        pix2 += line_size;
1673        pix3 += line_size;
1674    }
1675    return s;
1676}
1677
1678static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1679{
1680    int s, i;
1681    uint8_t *pix3 = pix2 + line_size;
1682
1683    s = 0;
1684    for(i=0;i<h;i++) {
1685        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1686        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1687        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1688        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1689        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1690        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1691        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1692        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1693        pix1 += line_size;
1694        pix2 += line_size;
1695        pix3 += line_size;
1696    }
1697    return s;
1698}
1699
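/* Noise-preserving SSE: score1 is the plain squared pixel error, score2
 * measures how much the local 2x2 texture (second differences) changes
 * between the two blocks; |score2| is weighted by avctx->nsse_weight
 * (8 if no context is available). */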
1700static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1701    MpegEncContext *c = v;
1702    int score1=0;
1703    int score2=0;
1704    int x,y;
1705
1706    for(y=0; y<h; y++){
1707        for(x=0; x<16; x++){
1708            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1709        }
1710        if(y+1<h){
1711            for(x=0; x<15; x++){
1712                score2+= FFABS(  s1[x  ] - s1[x  +stride]
1713                             - s1[x+1] + s1[x+1+stride])
1714                        -FFABS(  s2[x  ] - s2[x  +stride]
1715                             - s2[x+1] + s2[x+1+stride]);
1716            }
1717        }
1718        s1+= stride;
1719        s2+= stride;
1720    }
1721
1722    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1723    else  return score1 + FFABS(score2)*8;
1724}
1725
1726static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1727    MpegEncContext *c = v;
1728    int score1=0;
1729    int score2=0;
1730    int x,y;
1731
1732    for(y=0; y<h; y++){
1733        for(x=0; x<8; x++){
1734            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1735        }
1736        if(y+1<h){
1737            for(x=0; x<7; x++){
1738                score2+= FFABS(  s1[x  ] - s1[x  +stride]
1739                             - s1[x+1] + s1[x+1+stride])
1740                        -FFABS(  s2[x  ] - s2[x  +stride]
1741                             - s2[x+1] + s2[x+1+stride]);
1742            }
1743        }
1744        s1+= stride;
1745        s2+= stride;
1746    }
1747
1748    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1749    else  return score1 + FFABS(score2)*8;
1750}
1751
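/* Helpers for the MPEG video encoder's quantization refinement:
 * try_8x8basis() returns the weighted squared error that would remain if
 * `scale` times a basis function were added to the residual `rem`, and
 * add_8x8basis() actually applies that update. */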
1752static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1753    int i;
1754    unsigned int sum=0;
1755
1756    for(i=0; i<8*8; i++){
1757        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1758        int w= weight[i];
1759        b>>= RECON_SHIFT;
1760        assert(-512<b && b<512);
1761
1762        sum += (w*b)*(w*b)>>4;
1763    }
1764    return sum>>2;
1765}
1766
1767static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1768    int i;
1769
1770    for(i=0; i<8*8; i++){
1771        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1772    }
1773}
1774
/**
 * Permute an 8x8 block.
 * @param block the block which will be permuted according to the given permutation vector
 * @param permutation the permutation vector
 * @param last the last non-zero coefficient in scantable order, used to speed up the permutation
 * @param scantable the used scantable; it is only used to speed up the permutation, the block is
 *                  not (inverse-)permuted to scantable order!
 */
1783void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1784{
1785    int i;
1786    DCTELEM temp[64];
1787
1788    if(last<=0) return;
    //if(permutation[1]==1) return; //FIXME: works, but it is not clean and might fail for some permutations
1790
1791    for(i=0; i<=last; i++){
1792        const int j= scantable[i];
1793        temp[j]= block[j];
1794        block[j]=0;
1795    }
1796
1797    for(i=0; i<=last; i++){
1798        const int j= scantable[i];
1799        const int perm_j= permutation[j];
1800        block[perm_j]= temp[j];
1801    }
1802}
1803
1804static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
1805    return 0;
1806}
1807
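/* Fill the 6 entries of a comparison-function table (one per block size and
 * variant used by motion estimation and mode decision) according to the
 * FF_CMP_* type; only the low byte of `type` selects the metric. */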
1808void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1809    int i;
1810
1811    memset(cmp, 0, sizeof(void*)*6);
1812
1813    for(i=0; i<6; i++){
1814        switch(type&0xFF){
1815        case FF_CMP_SAD:
1816            cmp[i]= c->sad[i];
1817            break;
1818        case FF_CMP_SATD:
1819            cmp[i]= c->hadamard8_diff[i];
1820            break;
1821        case FF_CMP_SSE:
1822            cmp[i]= c->sse[i];
1823            break;
1824        case FF_CMP_DCT:
1825            cmp[i]= c->dct_sad[i];
1826            break;
1827        case FF_CMP_DCT264:
1828            cmp[i]= c->dct264_sad[i];
1829            break;
1830        case FF_CMP_DCTMAX:
1831            cmp[i]= c->dct_max[i];
1832            break;
1833        case FF_CMP_PSNR:
1834            cmp[i]= c->quant_psnr[i];
1835            break;
1836        case FF_CMP_BIT:
1837            cmp[i]= c->bit[i];
1838            break;
1839        case FF_CMP_RD:
1840            cmp[i]= c->rd[i];
1841            break;
1842        case FF_CMP_VSAD:
1843            cmp[i]= c->vsad[i];
1844            break;
1845        case FF_CMP_VSSE:
1846            cmp[i]= c->vsse[i];
1847            break;
1848        case FF_CMP_ZERO:
1849            cmp[i]= zero_cmp;
1850            break;
1851        case FF_CMP_NSSE:
1852            cmp[i]= c->nsse[i];
1853            break;
1854#if CONFIG_DWT
1855        case FF_CMP_W53:
1856            cmp[i]= c->w53[i];
1857            break;
1858        case FF_CMP_W97:
1859            cmp[i]= c->w97[i];
1860            break;
1861#endif
1862        default:
1863            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
1864        }
1865    }
1866}
1867
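/* Byte-wise addition without inter-byte carries, sizeof(long) bytes at a
 * time: the pb_7f/pb_80 masks add the low 7 bits of each byte and restore
 * the MSBs with an XOR (a SWAR trick); the remainder is handled per byte. */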
1868static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1869    long i;
1870    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1871        long a = *(long*)(src+i);
1872        long b = *(long*)(dst+i);
1873        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1874    }
1875    for(; i<w; i++)
1876        dst[i+0] += src[i+0];
1877}
1878
1879static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1880    long i;
1881    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1882        long a = *(long*)(src1+i);
1883        long b = *(long*)(src2+i);
1884        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1885    }
1886    for(; i<w; i++)
1887        dst[i] = src1[i]+src2[i];
1888}
1889
1890static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1891    long i;
1892#if !HAVE_FAST_UNALIGNED
1893    if((long)src2 & (sizeof(long)-1)){
1894        for(i=0; i+7<w; i+=8){
1895            dst[i+0] = src1[i+0]-src2[i+0];
1896            dst[i+1] = src1[i+1]-src2[i+1];
1897            dst[i+2] = src1[i+2]-src2[i+2];
1898            dst[i+3] = src1[i+3]-src2[i+3];
1899            dst[i+4] = src1[i+4]-src2[i+4];
1900            dst[i+5] = src1[i+5]-src2[i+5];
1901            dst[i+6] = src1[i+6]-src2[i+6];
1902            dst[i+7] = src1[i+7]-src2[i+7];
1903        }
1904    }else
1905#endif
1906    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1907        long a = *(long*)(src1+i);
1908        long b = *(long*)(src2+i);
1909        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1910    }
1911    for(; i<w; i++)
1912        dst[i+0] = src1[i+0]-src2[i+0];
1913}
1914
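/* HuffYUV median prediction: each sample is predicted from the median of
 * left, top and left+top-topleft (src1 is the previous line); the add_
 * variant reconstructs samples from the residual, the sub_ variant
 * produces the residual. */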
1915static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
1916    int i;
1917    uint8_t l, lt;
1918
1919    l= *left;
1920    lt= *left_top;
1921
1922    for(i=0; i<w; i++){
1923        l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
1924        lt= src1[i];
1925        dst[i]= l;
1926    }
1927
1928    *left= l;
1929    *left_top= lt;
1930}
1931
1932static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
1933    int i;
1934    uint8_t l, lt;
1935
1936    l= *left;
1937    lt= *left_top;
1938
1939    for(i=0; i<w; i++){
1940        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
1941        lt= src1[i];
1942        l= src2[i];
1943        dst[i]= l - pred;
1944    }
1945
1946    *left= l;
1947    *left_top= lt;
1948}
1949
1950static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
1951    int i;
1952
1953    for(i=0; i<w-1; i++){
1954        acc+= src[i];
1955        dst[i]= acc;
1956        i++;
1957        acc+= src[i];
1958        dst[i]= acc;
1959    }
1960
1961    for(; i<w; i++){
1962        acc+= src[i];
1963        dst[i]= acc;
1964    }
1965
1966    return acc;
1967}
1968
1969#if HAVE_BIGENDIAN
1970#define B 3
1971#define G 2
1972#define R 1
1973#define A 0
1974#else
1975#define B 0
1976#define G 1
1977#define R 2
1978#define A 3
1979#endif
1980static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
1981    int i;
1982    int r,g,b,a;
1983    r= *red;
1984    g= *green;
1985    b= *blue;
1986    a= *alpha;
1987
1988    for(i=0; i<w; i++){
1989        b+= src[4*i+B];
1990        g+= src[4*i+G];
1991        r+= src[4*i+R];
1992        a+= src[4*i+A];
1993
1994        dst[4*i+B]= b;
1995        dst[4*i+G]= g;
1996        dst[4*i+R]= r;
1997        dst[4*i+A]= a;
1998    }
1999
2000    *red= r;
2001    *green= g;
2002    *blue= b;
2003    *alpha= a;
2004}
2005#undef B
2006#undef G
2007#undef R
2008#undef A
2009
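/* SATD via an 8x8 Hadamard transform: BUTTERFLY2/BUTTERFLY1 are single
 * butterfly stages, BUTTERFLYA folds the final stage into a sum of absolute
 * values. hadamard8_diff transforms src - dst, while the _intra variant
 * transforms src directly and subtracts the DC term from the score. */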
2010#define BUTTERFLY2(o1,o2,i1,i2) \
2011o1= (i1)+(i2);\
2012o2= (i1)-(i2);
2013
2014#define BUTTERFLY1(x,y) \
2015{\
2016    int a,b;\
2017    a= x;\
2018    b= y;\
2019    x= a+b;\
2020    y= a-b;\
2021}
2022
2023#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
2024
2025static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
2026    int i;
2027    int temp[64];
2028    int sum=0;
2029
2030    assert(h==8);
2031
2032    for(i=0; i<8; i++){
2033        //FIXME try pointer walks
2034        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
2035        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
2036        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
2037        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
2038
2039        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2040        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2041        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2042        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2043
2044        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2045        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2046        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2047        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2048    }
2049
2050    for(i=0; i<8; i++){
2051        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2052        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2053        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2054        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2055
2056        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2057        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2058        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2059        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2060
2061        sum +=
2062             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2063            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2064            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2065            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2066    }
2067    return sum;
2068}
2069
2070static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
2071    int i;
2072    int temp[64];
2073    int sum=0;
2074
2075    assert(h==8);
2076
2077    for(i=0; i<8; i++){
2078        //FIXME try pointer walks
2079        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
2080        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
2081        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
2082        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
2083
2084        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2085        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2086        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2087        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2088
2089        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2090        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2091        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2092        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2093    }
2094
2095    for(i=0; i<8; i++){
2096        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2097        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2098        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2099        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2100
2101        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2102        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2103        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2104        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2105
2106        sum +=
2107             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2108            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2109            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2110            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2111    }
2112
2113    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
2114
2115    return sum;
2116}
2117
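/* DCT-domain SAD: forward-transform the pixel difference and sum the
 * absolute values of the coefficients. */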
2118static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2119    MpegEncContext * const s= (MpegEncContext *)c;
2120    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2121
2122    assert(h==8);
2123
2124    s->dsp.diff_pixels(temp, src1, src2, stride);
2125    s->dsp.fdct(temp);
2126    return s->dsp.sum_abs_dctelem(temp);
2127}
2128
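/* 1-D 8-point integer transform as used by H.264's 8x8 transform;
 * dct264_sad applies it to the rows and then the columns of the difference
 * block and sums the absolute outputs. */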
2129#if CONFIG_GPL
2130#define DCT8_1D {\
2131    const int s07 = SRC(0) + SRC(7);\
2132    const int s16 = SRC(1) + SRC(6);\
2133    const int s25 = SRC(2) + SRC(5);\
2134    const int s34 = SRC(3) + SRC(4);\
2135    const int a0 = s07 + s34;\
2136    const int a1 = s16 + s25;\
2137    const int a2 = s07 - s34;\
2138    const int a3 = s16 - s25;\
2139    const int d07 = SRC(0) - SRC(7);\
2140    const int d16 = SRC(1) - SRC(6);\
2141    const int d25 = SRC(2) - SRC(5);\
2142    const int d34 = SRC(3) - SRC(4);\
2143    const int a4 = d16 + d25 + (d07 + (d07>>1));\
2144    const int a5 = d07 - d34 - (d25 + (d25>>1));\
2145    const int a6 = d07 + d34 - (d16 + (d16>>1));\
2146    const int a7 = d16 - d25 + (d34 + (d34>>1));\
2147    DST(0,  a0 + a1     ) ;\
2148    DST(1,  a4 + (a7>>2)) ;\
2149    DST(2,  a2 + (a3>>1)) ;\
2150    DST(3,  a5 + (a6>>2)) ;\
2151    DST(4,  a0 - a1     ) ;\
2152    DST(5,  a6 - (a5>>2)) ;\
2153    DST(6, (a2>>1) - a3 ) ;\
2154    DST(7, (a4>>2) - a7 ) ;\
2155}
2156
2157static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2158    MpegEncContext * const s= (MpegEncContext *)c;
2159    DCTELEM dct[8][8];
2160    int i;
2161    int sum=0;
2162
2163    s->dsp.diff_pixels(dct[0], src1, src2, stride);
2164
2165#define SRC(x) dct[i][x]
2166#define DST(x,v) dct[i][x]= v
2167    for( i = 0; i < 8; i++ )
2168        DCT8_1D
2169#undef SRC
2170#undef DST
2171
2172#define SRC(x) dct[x][i]
2173#define DST(x,v) sum += FFABS(v)
2174    for( i = 0; i < 8; i++ )
2175        DCT8_1D
2176#undef SRC
2177#undef DST
2178    return sum;
2179}
2180#endif
2181
2182static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2183    MpegEncContext * const s= (MpegEncContext *)c;
2184    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2185    int sum=0, i;
2186
2187    assert(h==8);
2188
2189    s->dsp.diff_pixels(temp, src1, src2, stride);
2190    s->dsp.fdct(temp);
2191
2192    for(i=0; i<64; i++)
2193        sum= FFMAX(sum, FFABS(temp[i]));
2194
2195    return sum;
2196}
2197
2198static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2199    MpegEncContext * const s= (MpegEncContext *)c;
2200    LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
2201    DCTELEM * const bak = temp+64;
2202    int sum=0, i;
2203
2204    assert(h==8);
2205    s->mb_intra=0;
2206
2207    s->dsp.diff_pixels(temp, src1, src2, stride);
2208
2209    memcpy(bak, temp, 64*sizeof(DCTELEM));
2210
2211    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2212    s->dct_unquantize_inter(s, temp, 0, s->qscale);
2213    ff_simple_idct_8(temp); //FIXME
2214
2215    for(i=0; i<64; i++)
2216        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2217
2218    return sum;
2219}
2220
2221static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2222    MpegEncContext * const s= (MpegEncContext *)c;
2223    const uint8_t *scantable= s->intra_scantable.permutated;
2224    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2225    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2226    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2227    int i, last, run, bits, level, distortion, start_i;
2228    const int esc_length= s->ac_esc_length;
2229    uint8_t * length;
2230    uint8_t * last_length;
2231
2232    assert(h==8);
2233
2234    copy_block8(lsrc1, src1, 8, stride, 8);
2235    copy_block8(lsrc2, src2, 8, stride, 8);
2236
2237    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2238
2239    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2240
2241    bits=0;
2242
2243    if (s->mb_intra) {
2244        start_i = 1;
2245        length     = s->intra_ac_vlc_length;
2246        last_length= s->intra_ac_vlc_last_length;
2247        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2248    } else {
2249        start_i = 0;
2250        length     = s->inter_ac_vlc_length;
2251        last_length= s->inter_ac_vlc_last_length;
2252    }
2253
2254    if(last>=start_i){
2255        run=0;
2256        for(i=start_i; i<last; i++){
2257            int j= scantable[i];
2258            level= temp[j];
2259
2260            if(level){
2261                level+=64;
2262                if((level&(~127)) == 0){
2263                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
2264                }else
2265                    bits+= esc_length;
2266                run=0;
2267            }else
2268                run++;
2269        }
2270        i= scantable[last];
2271
2272        level= temp[i] + 64;
2273
2274        assert(level - 64);
2275
2276        if((level&(~127)) == 0){
2277            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2278        }else
2279            bits+= esc_length;
2280
2281    }
2282
2283    if(last>=0){
2284        if(s->mb_intra)
2285            s->dct_unquantize_intra(s, temp, 0, s->qscale);
2286        else
2287            s->dct_unquantize_inter(s, temp, 0, s->qscale);
2288    }
2289
2290    s->dsp.idct_add(lsrc2, 8, temp);
2291
2292    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2293
2294    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
2295}
2296
2297static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2298    MpegEncContext * const s= (MpegEncContext *)c;
2299    const uint8_t *scantable= s->intra_scantable.permutated;
2300    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2301    int i, last, run, bits, level, start_i;
2302    const int esc_length= s->ac_esc_length;
2303    uint8_t * length;
2304    uint8_t * last_length;
2305
2306    assert(h==8);
2307
2308    s->dsp.diff_pixels(temp, src1, src2, stride);
2309
2310    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2311
2312    bits=0;
2313
2314    if (s->mb_intra) {
2315        start_i = 1;
2316        length     = s->intra_ac_vlc_length;
2317        last_length= s->intra_ac_vlc_last_length;
2318        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2319    } else {
2320        start_i = 0;
2321        length     = s->inter_ac_vlc_length;
2322        last_length= s->inter_ac_vlc_last_length;
2323    }
2324
2325    if(last>=start_i){
2326        run=0;
2327        for(i=start_i; i<last; i++){
2328            int j= scantable[i];
2329            level= temp[j];
2330
2331            if(level){
2332                level+=64;
2333                if((level&(~127)) == 0){
2334                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
2335                }else
2336                    bits+= esc_length;
2337                run=0;
2338            }else
2339                run++;
2340        }
2341        i= scantable[last];
2342
2343        level= temp[i] + 64;
2344
2345        assert(level - 64);
2346
2347        if((level&(~127)) == 0){
2348            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2349        }else
2350            bits+= esc_length;
2351    }
2352
2353    return bits;
2354}
2355
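/* Vertical activity metrics: the *_intra variants sum the absolute (VSAD)
 * or squared (VSSE) differences between vertically neighbouring samples of
 * a single block; vsad16/vsse16 apply the same measure to the difference
 * between two blocks. */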
2356#define VSAD_INTRA(size) \
2357static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2358    int score=0;                                                                                            \
2359    int x,y;                                                                                                \
2360                                                                                                            \
2361    for(y=1; y<h; y++){                                                                                     \
2362        for(x=0; x<size; x+=4){                                                                             \
2363            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])                           \
2364                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);                          \
2365        }                                                                                                   \
2366        s+= stride;                                                                                         \
2367    }                                                                                                       \
2368                                                                                                            \
2369    return score;                                                                                           \
2370}
2371VSAD_INTRA(8)
2372VSAD_INTRA(16)
2373
2374static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2375    int score=0;
2376    int x,y;
2377
2378    for(y=1; y<h; y++){
2379        for(x=0; x<16; x++){
2380            score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
2381        }
2382        s1+= stride;
2383        s2+= stride;
2384    }
2385
2386    return score;
2387}
2388
2389#define SQ(a) ((a)*(a))
2390#define VSSE_INTRA(size) \
2391static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2392    int score=0;                                                                                            \
2393    int x,y;                                                                                                \
2394                                                                                                            \
2395    for(y=1; y<h; y++){                                                                                     \
2396        for(x=0; x<size; x+=4){                                                                               \
2397            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])                                 \
2398                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);                                \
2399        }                                                                                                   \
2400        s+= stride;                                                                                         \
2401    }                                                                                                       \
2402                                                                                                            \
2403    return score;                                                                                           \
2404}
2405VSSE_INTRA(8)
2406VSSE_INTRA(16)
2407
2408static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2409    int score=0;
2410    int x,y;
2411
2412    for(y=1; y<h; y++){
2413        for(x=0; x<16; x++){
2414            score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
2415        }
2416        s1+= stride;
2417        s2+= stride;
2418    }
2419
2420    return score;
2421}
2422
2423static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
2424                               int size){
2425    int score=0;
2426    int i;
2427    for(i=0; i<size; i++)
2428        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
2429    return score;
2430}
2431
2432WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2433WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2434WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2435#if CONFIG_GPL
2436WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2437#endif
2438WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2439WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2440WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2441WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2442
2443static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
2444    int i;
2445    for(i=0; i<len; i++)
2446        dst[i] = src0[i] * src1[i];
2447}
2448
2449static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
2450    int i;
2451    src1 += len-1;
2452    for(i=0; i<len; i++)
2453        dst[i] = src0[i] * src1[-i];
2454}
2455
2456static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
2457    int i;
2458    for(i=0; i<len; i++)
2459        dst[i] = src0[i] * src1[i] + src2[i];
2460}
2461
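/* Windowed overlap-add as used by MDCT-based audio codecs: writes 2*len
 * samples, combining src0 with the time-reversed src1 under the two
 * symmetric halves of the window. */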
2462static void vector_fmul_window_c(float *dst, const float *src0,
2463                                 const float *src1, const float *win, int len)
2464{
2465    int i,j;
2466    dst += len;
2467    win += len;
2468    src0+= len;
2469    for(i=-len, j=len-1; i<0; i++, j--) {
2470        float s0 = src0[i];
2471        float s1 = src1[j];
2472        float wi = win[i];
2473        float wj = win[j];
2474        dst[i] = s0*wj - s1*wi;
2475        dst[j] = s0*wi + s1*wj;
2476    }
2477}
2478
2479static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
2480                                 int len)
2481{
2482    int i;
2483    for (i = 0; i < len; i++)
2484        dst[i] = src[i] * mul;
2485}
2486
2487static void vector_fmac_scalar_c(float *dst, const float *src, float mul,
2488                                 int len)
2489{
2490    int i;
2491    for (i = 0; i < len; i++)
2492        dst[i] += src[i] * mul;
2493}
2494
2495static void butterflies_float_c(float *restrict v1, float *restrict v2,
2496                                int len)
2497{
2498    int i;
2499    for (i = 0; i < len; i++) {
2500        float t = v1[i] - v2[i];
2501        v1[i] += v2[i];
2502        v2[i] = t;
2503    }
2504}
2505
2506static void butterflies_float_interleave_c(float *dst, const float *src0,
2507                                           const float *src1, int len)
2508{
2509    int i;
2510    for (i = 0; i < len; i++) {
2511        float f1 = src0[i];
2512        float f2 = src1[i];
2513        dst[2*i    ] = f1 + f2;
2514        dst[2*i + 1] = f1 - f2;
2515    }
2516}
2517
2518static float scalarproduct_float_c(const float *v1, const float *v2, int len)
2519{
2520    float p = 0.0;
2521    int i;
2522
2523    for (i = 0; i < len; i++)
2524        p += v1[i] * v2[i];
2525
2526    return p;
2527}
2528
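/* Clip a float by comparing its IEEE-754 bit pattern as an unsigned
 * integer; this shortcut is only valid when min < 0 < max, which
 * vector_clipf_c() checks before taking this path. */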
2529static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
2530                   uint32_t maxi, uint32_t maxisign)
{
2533    if(a > mini) return mini;
2534    else if((a^(1U<<31)) > maxisign) return maxi;
2535    else return a;
2536}
2537
2538static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
2539    int i;
2540    uint32_t mini = *(uint32_t*)min;
2541    uint32_t maxi = *(uint32_t*)max;
2542    uint32_t maxisign = maxi ^ (1U<<31);
2543    uint32_t *dsti = (uint32_t*)dst;
2544    const uint32_t *srci = (const uint32_t*)src;
2545    for(i=0; i<len; i+=8) {
2546        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
2547        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
2548        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
2549        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
2550        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
2551        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
2552        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
2553        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
2554    }
2555}
2556static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
2557    int i;
2558    if(min < 0 && max > 0) {
2559        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
2560    } else {
2561        for(i=0; i < len; i+=8) {
2562            dst[i    ] = av_clipf(src[i    ], min, max);
2563            dst[i + 1] = av_clipf(src[i + 1], min, max);
2564            dst[i + 2] = av_clipf(src[i + 2], min, max);
2565            dst[i + 3] = av_clipf(src[i + 3], min, max);
2566            dst[i + 4] = av_clipf(src[i + 4], min, max);
2567            dst[i + 5] = av_clipf(src[i + 5], min, max);
2568            dst[i + 6] = av_clipf(src[i + 6], min, max);
2569            dst[i + 7] = av_clipf(src[i + 7], min, max);
2570        }
2571    }
2572}
2573
2574static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
2575{
2576    int res = 0;
2577
2578    while (order--)
2579        res += (*v1++ * *v2++) >> shift;
2580
2581    return res;
2582}
2583
2584static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
2585{
2586    int res = 0;
2587    while (order--) {
2588        res   += *v1 * *v2++;
2589        *v1++ += mul * *v3++;
2590    }
2591    return res;
2592}
2593
2594static void apply_window_int16_c(int16_t *output, const int16_t *input,
2595                                 const int16_t *window, unsigned int len)
2596{
2597    int i;
2598    int len2 = len >> 1;
2599
2600    for (i = 0; i < len2; i++) {
2601        int16_t w       = window[i];
2602        output[i]       = (MUL16(input[i],       w) + (1 << 14)) >> 15;
2603        output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
2604    }
2605}
2606
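/* Clip a buffer of int32 samples to [min, max]; len is assumed to be a
 * non-zero multiple of 8 (the loop is unrolled by 8). */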
2607static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
2608                                int32_t max, unsigned int len)
2609{
2610    do {
2611        *dst++ = av_clip(*src++, min, max);
2612        *dst++ = av_clip(*src++, min, max);
2613        *dst++ = av_clip(*src++, min, max);
2614        *dst++ = av_clip(*src++, min, max);
2615        *dst++ = av_clip(*src++, min, max);
2616        *dst++ = av_clip(*src++, min, max);
2617        *dst++ = av_clip(*src++, min, max);
2618        *dst++ = av_clip(*src++, min, max);
2619        len -= 8;
2620    } while (len > 0);
2621}
2622
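/* IDCT used by WMV2: W1..W7 are round(2048*sqrt(2)*cos(n*pi/16)) fixed-point
 * constants; the rows are transformed with 8-bit rounding, the column pass
 * keeps 3 extra bits of precision before the final >>14. */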
2623#define W0 2048
2624#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
2625#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
2626#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
2627#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
2628#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
2629#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
2630#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
2631
2632static void wmv2_idct_row(short * b)
2633{
2634    int s1,s2;
2635    int a0,a1,a2,a3,a4,a5,a6,a7;
2636    /*step 1*/
2637    a1 = W1*b[1]+W7*b[7];
2638    a7 = W7*b[1]-W1*b[7];
2639    a5 = W5*b[5]+W3*b[3];
2640    a3 = W3*b[5]-W5*b[3];
2641    a2 = W2*b[2]+W6*b[6];
2642    a6 = W6*b[2]-W2*b[6];
2643    a0 = W0*b[0]+W0*b[4];
2644    a4 = W0*b[0]-W0*b[4];
2645    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8; // combines the odd coefficients 1,3,5,7
2647    s2 = (181*(a1-a5-a7+a3)+128)>>8;
2648    /*step 3*/
2649    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
2650    b[1] = (a4+a6 +s1   + (1<<7))>>8;
2651    b[2] = (a4-a6 +s2   + (1<<7))>>8;
2652    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
2653    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
2654    b[5] = (a4-a6 -s2   + (1<<7))>>8;
2655    b[6] = (a4+a6 -s1   + (1<<7))>>8;
2656    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
2657}
2658static void wmv2_idct_col(short * b)
2659{
2660    int s1,s2;
2661    int a0,a1,a2,a3,a4,a5,a6,a7;
2662    /*step 1, with extended precision*/
2663    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
2664    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
2665    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
2666    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
2667    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
2668    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
2669    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
2670    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
2671    /*step 2*/
2672    s1 = (181*(a1-a5+a7-a3)+128)>>8;
2673    s2 = (181*(a1-a5-a7+a3)+128)>>8;
2674    /*step 3*/
2675    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
2676    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
2677    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
2678    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
2679
2680    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
2681    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
2682    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
2683    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
2684}
2685void ff_wmv2_idct_c(short * block){
2686    int i;
2687
2688    for(i=0;i<64;i+=8){
2689        wmv2_idct_row(block+i);
2690    }
2691    for(i=0;i<8;i++){
2692        wmv2_idct_col(block+i);
2693    }
2694}
/* XXX: these wrapper functions should be removed as soon as all IDCTs are
   converted */
2697static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
2698{
2699    ff_wmv2_idct_c(block);
2700    ff_put_pixels_clamped_c(block, dest, line_size);
2701}
2702static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
2703{
2704    ff_wmv2_idct_c(block);
2705    ff_add_pixels_clamped_c(block, dest, line_size);
2706}
2707static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2708{
2709    j_rev_dct (block);
2710    ff_put_pixels_clamped_c(block, dest, line_size);
2711}
2712static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2713{
2714    j_rev_dct (block);
2715    ff_add_pixels_clamped_c(block, dest, line_size);
2716}
2717
2718static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
2719{
2720    j_rev_dct4 (block);
2721    put_pixels_clamped4_c(block, dest, line_size);
2722}
2723static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
2724{
2725    j_rev_dct4 (block);
2726    add_pixels_clamped4_c(block, dest, line_size);
2727}
2728
2729static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
2730{
2731    j_rev_dct2 (block);
2732    put_pixels_clamped2_c(block, dest, line_size);
2733}
2734static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
2735{
2736    j_rev_dct2 (block);
2737    add_pixels_clamped2_c(block, dest, line_size);
2738}
2739
2740static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
2741{
2742    dest[0] = av_clip_uint8((block[0] + 4)>>3);
2743}
2744static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
2745{
2746    dest[0] = av_clip_uint8(dest[0] + ((block[0] + 4)>>3));
2747}
2748
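/* Intentional no-op, used as the default prefetch() implementation. */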
2749static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
2750
2751/* init static data */
2752av_cold void dsputil_static_init(void)
2753{
2754    int i;
2755
2756    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2757    for(i=0;i<MAX_NEG_CROP;i++) {
2758        ff_cropTbl[i] = 0;
2759        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2760    }
2761
2762    for(i=0;i<512;i++) {
2763        ff_squareTbl[i] = (i - 256) * (i - 256);
2764    }
2765
2766    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2767}
2768
2769int ff_check_alignment(void){
2770    static int did_fail=0;
2771    LOCAL_ALIGNED_16(int, aligned, [4]);
2772
2773    if((intptr_t)aligned & 15){
2774        if(!did_fail){
2775#if HAVE_MMX || HAVE_ALTIVEC
2776            av_log(NULL, AV_LOG_ERROR,
2777                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2778                "and may be very slow or crash. This is not a bug in libavcodec,\n"
2779                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2780                "Do not report crashes to Libav developers.\n");
2781#endif
2782            did_fail=1;
2783        }
2784        return -1;
2785    }
2786    return 0;
2787}
2788
2789av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2790{
2791    int i, j;
2792
2793    ff_check_alignment();
2794
2795#if CONFIG_ENCODERS
2796    if (avctx->bits_per_raw_sample == 10) {
2797        c->fdct    = ff_jpeg_fdct_islow_10;
2798        c->fdct248 = ff_fdct248_islow_10;
2799    } else {
2800        if(avctx->dct_algo==FF_DCT_FASTINT) {
2801            c->fdct    = fdct_ifast;
2802            c->fdct248 = fdct_ifast248;
2803        }
2804        else if(avctx->dct_algo==FF_DCT_FAAN) {
2805            c->fdct    = ff_faandct;
2806            c->fdct248 = ff_faandct248;
2807        }
2808        else {
2809            c->fdct    = ff_jpeg_fdct_islow_8; //slow/accurate/default
2810            c->fdct248 = ff_fdct248_islow_8;
2811        }
2812    }
2813#endif //CONFIG_ENCODERS
2814
2815    if(avctx->lowres==1){
2816        c->idct_put= ff_jref_idct4_put;
2817        c->idct_add= ff_jref_idct4_add;
2818        c->idct    = j_rev_dct4;
2819        c->idct_permutation_type= FF_NO_IDCT_PERM;
2820    }else if(avctx->lowres==2){
2821        c->idct_put= ff_jref_idct2_put;
2822        c->idct_add= ff_jref_idct2_add;
2823        c->idct    = j_rev_dct2;
2824        c->idct_permutation_type= FF_NO_IDCT_PERM;
2825    }else if(avctx->lowres==3){
2826        c->idct_put= ff_jref_idct1_put;
2827        c->idct_add= ff_jref_idct1_add;
2828        c->idct    = j_rev_dct1;
2829        c->idct_permutation_type= FF_NO_IDCT_PERM;
2830    }else{
2831        if (avctx->bits_per_raw_sample == 10) {
2832            c->idct_put              = ff_simple_idct_put_10;
2833            c->idct_add              = ff_simple_idct_add_10;
2834            c->idct                  = ff_simple_idct_10;
2835            c->idct_permutation_type = FF_NO_IDCT_PERM;
2836        } else {
2837        if(avctx->idct_algo==FF_IDCT_INT){
2838            c->idct_put= ff_jref_idct_put;
2839            c->idct_add= ff_jref_idct_add;
2840            c->idct    = j_rev_dct;
2841            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2842        }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
2843                avctx->idct_algo==FF_IDCT_VP3){
2844            c->idct_put= ff_vp3_idct_put_c;
2845            c->idct_add= ff_vp3_idct_add_c;
2846            c->idct    = ff_vp3_idct_c;
2847            c->idct_permutation_type= FF_NO_IDCT_PERM;
2848        }else if(avctx->idct_algo==FF_IDCT_WMV2){
2849            c->idct_put= ff_wmv2_idct_put_c;
2850            c->idct_add= ff_wmv2_idct_add_c;
2851            c->idct    = ff_wmv2_idct_c;
2852            c->idct_permutation_type= FF_NO_IDCT_PERM;
2853        }else if(avctx->idct_algo==FF_IDCT_FAAN){
2854            c->idct_put= ff_faanidct_put;
2855            c->idct_add= ff_faanidct_add;
2856            c->idct    = ff_faanidct;
2857            c->idct_permutation_type= FF_NO_IDCT_PERM;
2858        }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
2859            c->idct_put= ff_ea_idct_put_c;
2860            c->idct_permutation_type= FF_NO_IDCT_PERM;
2861        }else{ //accurate/default
2862            c->idct_put = ff_simple_idct_put_8;
2863            c->idct_add = ff_simple_idct_add_8;
2864            c->idct     = ff_simple_idct_8;
2865            c->idct_permutation_type= FF_NO_IDCT_PERM;
2866        }
2867        }
2868    }
2869
2870    c->diff_pixels = diff_pixels_c;
2871    c->put_pixels_clamped = ff_put_pixels_clamped_c;
2872    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
2873    c->add_pixels_clamped = ff_add_pixels_clamped_c;
2874    c->sum_abs_dctelem = sum_abs_dctelem_c;
2875    c->gmc1 = gmc1_c;
2876    c->gmc = ff_gmc_c;
2877    c->pix_sum = pix_sum_c;
2878    c->pix_norm1 = pix_norm1_c;
2879
2880    c->fill_block_tab[0] = fill_block16_c;
2881    c->fill_block_tab[1] = fill_block8_c;
2882
    /* [0] = 16x16 blocks, [1] = 8x8 blocks */
2884    c->pix_abs[0][0] = pix_abs16_c;
2885    c->pix_abs[0][1] = pix_abs16_x2_c;
2886    c->pix_abs[0][2] = pix_abs16_y2_c;
2887    c->pix_abs[0][3] = pix_abs16_xy2_c;
2888    c->pix_abs[1][0] = pix_abs8_c;
2889    c->pix_abs[1][1] = pix_abs8_x2_c;
2890    c->pix_abs[1][2] = pix_abs8_y2_c;
2891    c->pix_abs[1][3] = pix_abs8_xy2_c;
2892
2893    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2894    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2895    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2896    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2897    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2898    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2899    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2900    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2901    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2902
2903    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2904    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2905    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2906    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2907    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2908    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2909    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2910    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2911    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2912
2913#define dspfunc(PFX, IDX, NUM) \
2914    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2915    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2916    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2917    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2918    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2919    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2920    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2921    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2922    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2923    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2924    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2925    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2926    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2927    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2928    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2929    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2930
2931    dspfunc(put_qpel, 0, 16);
2932    dspfunc(put_no_rnd_qpel, 0, 16);
2933
2934    dspfunc(avg_qpel, 0, 16);
2935    /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2936
2937    dspfunc(put_qpel, 1, 8);
2938    dspfunc(put_no_rnd_qpel, 1, 8);
2939
2940    dspfunc(avg_qpel, 1, 8);
2941    /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2942
2943#undef dspfunc
2944
2945#if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
2946    ff_mlp_init(c, avctx);
2947#endif
2948#if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
2949    ff_intrax8dsp_init(c,avctx);
2950#endif
2951
2952    c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2953    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2954    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2955    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2956    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2957    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2958    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2959    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
2960
2961#define SET_CMP_FUNC(name) \
2962    c->name[0]= name ## 16_c;\
2963    c->name[1]= name ## 8x8_c;
2964
    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    c->hadamard8_diff[5]= hadamard8_intra8x8_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
#if CONFIG_GPL
    SET_CMP_FUNC(dct264_sad)
#endif
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    c->sse[2]= sse4_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsad[5]= vsad_intra8_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->vsse[5]= vsse_intra8_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;
#if CONFIG_DWT
    ff_dsputil_init_dwt(c);
#endif

    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;

    c->add_bytes= add_bytes_c;
    c->add_bytes_l2= add_bytes_l2_c;
    c->diff_bytes= diff_bytes_c;
    c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
    c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
    c->bswap_buf= bswap_buf;
    c->bswap16_buf = bswap16_buf;
#if CONFIG_PNG_DECODER
    c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
#endif

    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_h_loop_filter= h263_h_loop_filter_c;
        c->h263_v_loop_filter= h263_v_loop_filter_c;
    }

    if (CONFIG_VP3_DECODER) {
        c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
        c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
        c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
    }

    c->h261_loop_filter= h261_loop_filter_c;

    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

#if CONFIG_VORBIS_DECODER
    c->vorbis_inverse_coupling = vorbis_inverse_coupling;
#endif
#if CONFIG_AC3_DECODER
    c->ac3_downmix = ff_ac3_downmix_c;
#endif
    c->vector_fmul = vector_fmul_c;
    c->vector_fmul_reverse = vector_fmul_reverse_c;
    c->vector_fmul_add = vector_fmul_add_c;
    c->vector_fmul_window = vector_fmul_window_c;
    c->vector_clipf = vector_clipf_c;
    c->scalarproduct_int16 = scalarproduct_int16_c;
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
    c->apply_window_int16 = apply_window_int16_c;
    c->vector_clip_int32 = vector_clip_int32_c;
    c->scalarproduct_float = scalarproduct_float_c;
    c->butterflies_float = butterflies_float_c;
    c->butterflies_float_interleave = butterflies_float_interleave_c;
    c->vector_fmul_scalar = vector_fmul_scalar_c;
    c->vector_fmac_scalar = vector_fmac_scalar_c;

    c->shrink[0]= av_image_copy_plane;
    c->shrink[1]= ff_shrink22;
    c->shrink[2]= ff_shrink44;
    c->shrink[3]= ff_shrink88;

    c->prefetch= just_return;

    memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
    memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));

#undef FUNC
#undef FUNCC
#define FUNC(f, depth) f ## _ ## depth
#define FUNCC(f, depth) f ## _ ## depth ## _c

#define dspfunc1(PFX, IDX, NUM, depth)\
    c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM        , depth);\
    c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
    c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
    c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)

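/* dspfunc1() binds the plain/x2/y2/xy2 pixel ops for one block size and bit
 * depth; with FUNCC() as defined above, dspfunc1(put, 0, 16, 8) expands to
 *     c->put_pixels_tab[0][0] = put_pixels16_8_c;
 *     c->put_pixels_tab[0][1] = put_pixels16_x2_8_c;
 *     c->put_pixels_tab[0][2] = put_pixels16_y2_8_c;
 *     c->put_pixels_tab[0][3] = put_pixels16_xy2_8_c;
 * dspfunc2() below does the same for all 16 H.264 quarter-pel positions
 * (_mc00 .. _mc33). */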
#define dspfunc2(PFX, IDX, NUM, depth)\
    c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
    c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
    c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
    c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
    c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
    c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
    c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
    c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
    c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
    c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
    c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
    c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
    c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
    c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
    c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
    c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)


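/* BIT_DEPTH_FUNCS() wires up the bit-depth-parameterized C routines (pixel
 * copy/average, edge handling, H.264 chroma and qpel MC) for one sample depth.
 * FUNCC() appends "_<depth>_c" to each base name, so BIT_DEPTH_FUNCS(8, _16)
 * resolves c->clear_block to clear_block_16_8_c, and so on; the dct argument
 * apparently selects the coefficient-width variant (see dct_bits below). */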
#define BIT_DEPTH_FUNCS(depth, dct)\
    c->get_pixels                    = FUNCC(get_pixels   ## dct   , depth);\
    c->draw_edges                    = FUNCC(draw_edges            , depth);\
    c->emulated_edge_mc              = FUNC (ff_emulated_edge_mc   , depth);\
    c->clear_block                   = FUNCC(clear_block  ## dct   , depth);\
    c->clear_blocks                  = FUNCC(clear_blocks ## dct   , depth);\
    c->add_pixels8                   = FUNCC(add_pixels8  ## dct   , depth);\
    c->add_pixels4                   = FUNCC(add_pixels4  ## dct   , depth);\
    c->put_no_rnd_pixels_l2[0]       = FUNCC(put_no_rnd_pixels16_l2, depth);\
    c->put_no_rnd_pixels_l2[1]       = FUNCC(put_no_rnd_pixels8_l2 , depth);\
\
    c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8   , depth);\
    c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4   , depth);\
    c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2   , depth);\
    c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8   , depth);\
    c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4   , depth);\
    c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2   , depth);\
\
    dspfunc1(put       , 0, 16, depth);\
    dspfunc1(put       , 1,  8, depth);\
    dspfunc1(put       , 2,  4, depth);\
    dspfunc1(put       , 3,  2, depth);\
    dspfunc1(put_no_rnd, 0, 16, depth);\
    dspfunc1(put_no_rnd, 1,  8, depth);\
    dspfunc1(avg       , 0, 16, depth);\
    dspfunc1(avg       , 1,  8, depth);\
    dspfunc1(avg       , 2,  4, depth);\
    dspfunc1(avg       , 3,  2, depth);\
    dspfunc1(avg_no_rnd, 0, 16, depth);\
    dspfunc1(avg_no_rnd, 1,  8, depth);\
\
    dspfunc2(put_h264_qpel, 0, 16, depth);\
    dspfunc2(put_h264_qpel, 1,  8, depth);\
    dspfunc2(put_h264_qpel, 2,  4, depth);\
    dspfunc2(put_h264_qpel, 3,  2, depth);\
    dspfunc2(avg_h264_qpel, 0, 16, depth);\
    dspfunc2(avg_h264_qpel, 1,  8, depth);\
    dspfunc2(avg_h264_qpel, 2,  4, depth);

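    /* Select the per-depth function set from the stream's bit depth. For 9-
     * and 10-bit content, dct_bits chooses between the 16- and 32-bit
     * coefficient variants; any other depth logs a debug message and falls
     * through to the 8-bit functions. */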
    switch (avctx->bits_per_raw_sample) {
    case 9:
        if (c->dct_bits == 32) {
            BIT_DEPTH_FUNCS(9, _32);
        } else {
            BIT_DEPTH_FUNCS(9, _16);
        }
        break;
    case 10:
        if (c->dct_bits == 32) {
            BIT_DEPTH_FUNCS(10, _32);
        } else {
            BIT_DEPTH_FUNCS(10, _16);
        }
        break;
    default:
        av_log(avctx, AV_LOG_DEBUG, "Unsupported bit depth: %d\n", avctx->bits_per_raw_sample);
    case 8:
        BIT_DEPTH_FUNCS(8, _16);
        break;
    }


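    /* Platform-specific initializers run after the C defaults so they can
     * override individual function pointers with optimized versions where
     * available. */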
    if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
    if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
    if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
    if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
    if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
    if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
    if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
    if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
    if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);

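    /* Any 2-tap qpel slots left unset by the platform code above fall back to
     * the corresponding H.264 qpel functions. */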
    for (i = 0; i < 4; i++) {
        for (j = 0; j < 16; j++) {
            if(!c->put_2tap_qpel_pixels_tab[i][j])
                c->put_2tap_qpel_pixels_tab[i][j] =
                    c->put_h264_qpel_pixels_tab[i][j];
            if(!c->avg_2tap_qpel_pixels_tab[i][j])
                c->avg_2tap_qpel_pixels_tab[i][j] =
                    c->avg_h264_qpel_pixels_tab[i][j];
        }
    }

    ff_init_scantable_permutation(c->idct_permutation,
                                  c->idct_permutation_type);
}
