1/*
2 * DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7 *
8 * This file is part of FFmpeg.
9 *
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
14 *
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18 * Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25/**
26 * @file libavcodec/dsputil.c
27 * DSP utils
28 */
29
30#include "avcodec.h"
31#include "dsputil.h"
32#include "simple_idct.h"
33#include "faandct.h"
34#include "faanidct.h"
35#include "mathops.h"
36#include "h263.h"
37#include "snow.h"
38
39/* snow.c */
40void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
41
42/* vorbis.c */
43void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
44
45/* ac3dec.c */
46void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
47
48/* flacenc.c */
49void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
50
51/* pngdec.c */
52void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
53
54/* eaidct.c */
55void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
56
/* Clipping LUT: (ff_cropTbl + MAX_NEG_CROP)[v] clamps v to 0..255.
   Zeroed here; presumably filled during dsputil initialization — confirm. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Squares LUT, used centered as (ff_squareTbl + 256)[v] for v in -255..255.
   Zeroed here; presumably filled during dsputil initialization — confirm. */
uint32_t ff_squareTbl[512] = {0, };

// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
63
/* Classic JPEG/MPEG zigzag scan order: maps scan position to raster index. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
74
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
87
/* not permutated inverse zigzag_direct + 1 for MMX quantizer.
   Zeroed here; presumably filled during dsputil initialization — confirm. */
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
90
/* Alternate horizontal scan order (scan position -> raster index). */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
101
/* Alternate vertical scan order (scan position -> raster index). */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
112
/* Reciprocal table for division by a small constant:
   a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255.
   Entries 0 and 1 are unusable placeholders (division by 0/1). */
const uint32_t ff_inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
148
/* Input permutation for the simple_idct_mmx: coefficient index -> permuted
   index expected by that IDCT (values given in hex, row in high nibble). */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
160
161static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
162
163void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
164    int i;
165    int end;
166
167    st->scantable= src_scantable;
168
169    for(i=0; i<64; i++){
170        int j;
171        j = src_scantable[i];
172        st->permutated[i] = permutation[j];
173#if ARCH_PPC
174        st->inverse[j] = i;
175#endif
176    }
177
178    end=-1;
179    for(i=0; i<64; i++){
180        int j;
181        j = st->permutated[i];
182        if(j>end) end=j;
183        st->raster_end[i]= end;
184    }
185}
186
/**
 * Sums all 256 samples of a 16x16 block.
 * @param pix       top-left sample of the block
 * @param line_size byte stride between rows
 * @return sum of the samples
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int sum = 0;
    int y, x;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            sum += pix[x];
        pix += line_size;
    }
    return sum;
}
208
/**
 * Sum of squares of all samples of a 16x16 block, via the squares LUT.
 * Reads 8 samples per iteration as one native machine word.
 * NOTE(review): the word-sized loads assume pix is suitably aligned and
 * bypass strict aliasing — preexisting behavior, kept as-is.
 */
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256; /* centered squares LUT */

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            /* 64-bit word: extract the 8 bytes by shifting */
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            /* 32-bit word: two loads of 4 bytes each */
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16; /* advance to the next row */
    }
    return s;
}
256
/**
 * Byte-swaps w 32-bit words from src into dst (may be the same buffer).
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    /* bulk: eight words per iteration */
    for (i = 0; i + 8 <= w; i += 8) {
        int k;
        for (k = 0; k < 8; k++)
            dst[i + k] = bswap_32(src[i + k]);
    }
    /* remaining 0..7 words */
    for (; i < w; i++)
        dst[i] = bswap_32(src[i]);
}
274
275static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
276{
277    int s, i;
278    uint32_t *sq = ff_squareTbl + 256;
279
280    s = 0;
281    for (i = 0; i < h; i++) {
282        s += sq[pix1[0] - pix2[0]];
283        s += sq[pix1[1] - pix2[1]];
284        s += sq[pix1[2] - pix2[2]];
285        s += sq[pix1[3] - pix2[3]];
286        pix1 += line_size;
287        pix2 += line_size;
288    }
289    return s;
290}
291
292static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
293{
294    int s, i;
295    uint32_t *sq = ff_squareTbl + 256;
296
297    s = 0;
298    for (i = 0; i < h; i++) {
299        s += sq[pix1[0] - pix2[0]];
300        s += sq[pix1[1] - pix2[1]];
301        s += sq[pix1[2] - pix2[2]];
302        s += sq[pix1[3] - pix2[3]];
303        s += sq[pix1[4] - pix2[4]];
304        s += sq[pix1[5] - pix2[5]];
305        s += sq[pix1[6] - pix2[6]];
306        s += sq[pix1[7] - pix2[7]];
307        pix1 += line_size;
308        pix2 += line_size;
309    }
310    return s;
311}
312
313static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
314{
315    int s, i;
316    uint32_t *sq = ff_squareTbl + 256;
317
318    s = 0;
319    for (i = 0; i < h; i++) {
320        s += sq[pix1[ 0] - pix2[ 0]];
321        s += sq[pix1[ 1] - pix2[ 1]];
322        s += sq[pix1[ 2] - pix2[ 2]];
323        s += sq[pix1[ 3] - pix2[ 3]];
324        s += sq[pix1[ 4] - pix2[ 4]];
325        s += sq[pix1[ 5] - pix2[ 5]];
326        s += sq[pix1[ 6] - pix2[ 6]];
327        s += sq[pix1[ 7] - pix2[ 7]];
328        s += sq[pix1[ 8] - pix2[ 8]];
329        s += sq[pix1[ 9] - pix2[ 9]];
330        s += sq[pix1[10] - pix2[10]];
331        s += sq[pix1[11] - pix2[11]];
332        s += sq[pix1[12] - pix2[12]];
333        s += sq[pix1[13] - pix2[13]];
334        s += sq[pix1[14] - pix2[14]];
335        s += sq[pix1[15] - pix2[15]];
336
337        pix1 += line_size;
338        pix2 += line_size;
339    }
340    return s;
341}
342
343
344#if CONFIG_SNOW_ENCODER //dwt is in snow.c
/**
 * Wavelet-domain difference metric used by the snow encoder: the pix1-pix2
 * difference is decomposed with ff_spatial_dwt() and the absolute subband
 * coefficients are summed with per-subband weights.
 * @param w    block width: 8 gives 3 decomposition levels, 16/32 give 4;
 *             must equal h (asserted below)
 * @param type wavelet selector, indexes the scale table: 0 = 9/7, 1 = 5/3
 * @return weighted sum of absolute wavelet coefficients, >>9
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4; /* decomposition levels by block size */
    int tmp[32*32]; /* difference block, stride 32 */
    int level, ori;
    /* subband weights, indexed [type][dec_count-3][level][orientation] */
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    /* build the scaled (<<4) difference block */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    /* in-place wavelet decomposition (implemented in snow.c) */
    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    /* accumulate weighted absolute coefficients per level and orientation;
       ori 0 = LL (level 0 only), 1 = HL, 2 = LH, 3 = HH */
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;           /* horizontal subband offset */
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;      /* vertical subband offset */

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}
413
/* 5/3 wavelet score on an 8-wide block (w_c type 1). */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 1);
}
417
/* 9/7 wavelet score on an 8-wide block (w_c type 0). */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 0);
}
421
/* 5/3 wavelet score on a 16-wide block (w_c type 1). */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}
425
/* 9/7 wavelet score on a 16-wide block (w_c type 0). */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}
429
/* 5/3 wavelet score on a 32-wide block (w_c type 1); exported. */
int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}
433
/* 9/7 wavelet score on a 32-wide block (w_c type 0); exported. */
int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
437#endif
438
/* draw the edges of width 'w' of an image of size width, height */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *row, *bottom;
    int i;

    /* replicate the first/last row into the top/bottom margins */
    bottom = buf + (height - 1) * wrap;
    for (i = 1; i <= w; i++) {
        memcpy(buf - i * wrap, buf, width);
        memcpy(bottom + i * wrap, bottom, width);
    }

    /* replicate the first/last sample of each row into the side margins */
    row = buf;
    for (i = 0; i < height; i++) {
        memset(row - w, row[0], w);
        memset(row + width, row[width - 1], w);
        row += wrap;
    }

    /* fill the four corner areas from the corner samples */
    for (i = 1; i <= w; i++) {
        memset(buf - i * wrap - w, buf[0], w);                   /* top left */
        memset(buf - i * wrap + width, buf[width - 1], w);       /* top right */
        memset(bottom + i * wrap - w, bottom[0], w);             /* bottom left */
        memset(bottom + i * wrap + width, bottom[width - 1], w); /* bottom right */
    }
}
467
468/**
469 * Copies a rectangular area of samples to a temporary buffer and replicates the boarder samples.
470 * @param buf destination buffer
471 * @param src source buffer
472 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
473 * @param block_w width of block
474 * @param block_h height of block
475 * @param src_x x coordinate of the top left sample of the block in the source buffer
476 * @param src_y y coordinate of the top left sample of the block in the source buffer
477 * @param w width of the source buffer
478 * @param h height of the source buffer
479 */
480void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
481                                    int src_x, int src_y, int w, int h){
482    int x, y;
483    int start_y, start_x, end_y, end_x;
484
485    if(src_y>= h){
486        src+= (h-1-src_y)*linesize;
487        src_y=h-1;
488    }else if(src_y<=-block_h){
489        src+= (1-block_h-src_y)*linesize;
490        src_y=1-block_h;
491    }
492    if(src_x>= w){
493        src+= (w-1-src_x);
494        src_x=w-1;
495    }else if(src_x<=-block_w){
496        src+= (1-block_w-src_x);
497        src_x=1-block_w;
498    }
499
500    start_y= FFMAX(0, -src_y);
501    start_x= FFMAX(0, -src_x);
502    end_y= FFMIN(block_h, h-src_y);
503    end_x= FFMIN(block_w, w-src_x);
504
505    // copy existing part
506    for(y=start_y; y<end_y; y++){
507        for(x=start_x; x<end_x; x++){
508            buf[x + y*linesize]= src[x + y*linesize];
509        }
510    }
511
512    //top
513    for(y=0; y<start_y; y++){
514        for(x=start_x; x<end_x; x++){
515            buf[x + y*linesize]= buf[x + start_y*linesize];
516        }
517    }
518
519    //bottom
520    for(y=end_y; y<block_h; y++){
521        for(x=start_x; x<end_x; x++){
522            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
523        }
524    }
525
526    for(y=0; y<block_h; y++){
527       //left
528        for(x=0; x<start_x; x++){
529            buf[x + y*linesize]= buf[start_x + y*linesize];
530        }
531
532       //right
533        for(x=end_x; x<block_w; x++){
534            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
535        }
536    }
537}
538
539static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
540{
541    int i;
542
543    /* read the pixels */
544    for(i=0;i<8;i++) {
545        block[0] = pixels[0];
546        block[1] = pixels[1];
547        block[2] = pixels[2];
548        block[3] = pixels[3];
549        block[4] = pixels[4];
550        block[5] = pixels[5];
551        block[6] = pixels[6];
552        block[7] = pixels[7];
553        pixels += line_size;
554        block += 8;
555    }
556}
557
558static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
559                          const uint8_t *s2, int stride){
560    int i;
561
562    /* read the pixels */
563    for(i=0;i<8;i++) {
564        block[0] = s1[0] - s2[0];
565        block[1] = s1[1] - s2[1];
566        block[2] = s1[2] - s2[2];
567        block[3] = s1[3] - s2[3];
568        block[4] = s1[4] - s2[4];
569        block[5] = s1[5] - s2[5];
570        block[6] = s1[6] - s2[6];
571        block[7] = s1[7] - s2[7];
572        s1 += stride;
573        s2 += stride;
574        block += 8;
575    }
576}
577
578
579static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
580                                 int line_size)
581{
582    int i;
583    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
584
585    /* read the pixels */
586    for(i=0;i<8;i++) {
587        pixels[0] = cm[block[0]];
588        pixels[1] = cm[block[1]];
589        pixels[2] = cm[block[2]];
590        pixels[3] = cm[block[3]];
591        pixels[4] = cm[block[4]];
592        pixels[5] = cm[block[5]];
593        pixels[6] = cm[block[6]];
594        pixels[7] = cm[block[7]];
595
596        pixels += line_size;
597        block += 8;
598    }
599}
600
601static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
602                                 int line_size)
603{
604    int i;
605    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
606
607    /* read the pixels */
608    for(i=0;i<4;i++) {
609        pixels[0] = cm[block[0]];
610        pixels[1] = cm[block[1]];
611        pixels[2] = cm[block[2]];
612        pixels[3] = cm[block[3]];
613
614        pixels += line_size;
615        block += 8;
616    }
617}
618
619static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
620                                 int line_size)
621{
622    int i;
623    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
624
625    /* read the pixels */
626    for(i=0;i<2;i++) {
627        pixels[0] = cm[block[0]];
628        pixels[1] = cm[block[1]];
629
630        pixels += line_size;
631        block += 8;
632    }
633}
634
635static void put_signed_pixels_clamped_c(const DCTELEM *block,
636                                        uint8_t *restrict pixels,
637                                        int line_size)
638{
639    int i, j;
640
641    for (i = 0; i < 8; i++) {
642        for (j = 0; j < 8; j++) {
643            if (*block < -128)
644                *pixels = 0;
645            else if (*block > 127)
646                *pixels = 255;
647            else
648                *pixels = (uint8_t)(*block + 128);
649            block++;
650            pixels++;
651        }
652        pixels += (line_size - 8);
653    }
654}
655
656static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
657                          int line_size)
658{
659    int i;
660    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
661
662    /* read the pixels */
663    for(i=0;i<8;i++) {
664        pixels[0] = cm[pixels[0] + block[0]];
665        pixels[1] = cm[pixels[1] + block[1]];
666        pixels[2] = cm[pixels[2] + block[2]];
667        pixels[3] = cm[pixels[3] + block[3]];
668        pixels[4] = cm[pixels[4] + block[4]];
669        pixels[5] = cm[pixels[5] + block[5]];
670        pixels[6] = cm[pixels[6] + block[6]];
671        pixels[7] = cm[pixels[7] + block[7]];
672        pixels += line_size;
673        block += 8;
674    }
675}
676
677static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
678                          int line_size)
679{
680    int i;
681    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
682
683    /* read the pixels */
684    for(i=0;i<4;i++) {
685        pixels[0] = cm[pixels[0] + block[0]];
686        pixels[1] = cm[pixels[1] + block[1]];
687        pixels[2] = cm[pixels[2] + block[2]];
688        pixels[3] = cm[pixels[3] + block[3]];
689        pixels += line_size;
690        block += 8;
691    }
692}
693
694static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
695                          int line_size)
696{
697    int i;
698    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
699
700    /* read the pixels */
701    for(i=0;i<2;i++) {
702        pixels[0] = cm[pixels[0] + block[0]];
703        pixels[1] = cm[pixels[1] + block[1]];
704        pixels += line_size;
705        block += 8;
706    }
707}
708
709static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
710{
711    int i;
712    for(i=0;i<8;i++) {
713        pixels[0] += block[0];
714        pixels[1] += block[1];
715        pixels[2] += block[2];
716        pixels[3] += block[3];
717        pixels[4] += block[4];
718        pixels[5] += block[5];
719        pixels[6] += block[6];
720        pixels[7] += block[7];
721        pixels += line_size;
722        block += 8;
723    }
724}
725
726static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
727{
728    int i;
729    for(i=0;i<4;i++) {
730        pixels[0] += block[0];
731        pixels[1] += block[1];
732        pixels[2] += block[2];
733        pixels[3] += block[3];
734        pixels += line_size;
735        block += 4;
736    }
737}
738
739static int sum_abs_dctelem_c(DCTELEM *block)
740{
741    int sum=0, i;
742    for(i=0; i<64; i++)
743        sum+= FFABS(block[i]);
744    return sum;
745}
746
747#if 0
748
749#define PIXOP2(OPNAME, OP) \
750static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
751{\
752    int i;\
753    for(i=0; i<h; i++){\
754        OP(*((uint64_t*)block), AV_RN64(pixels));\
755        pixels+=line_size;\
756        block +=line_size;\
757    }\
758}\
759\
760static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
761{\
762    int i;\
763    for(i=0; i<h; i++){\
764        const uint64_t a= AV_RN64(pixels  );\
765        const uint64_t b= AV_RN64(pixels+1);\
766        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
767        pixels+=line_size;\
768        block +=line_size;\
769    }\
770}\
771\
772static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
773{\
774    int i;\
775    for(i=0; i<h; i++){\
776        const uint64_t a= AV_RN64(pixels  );\
777        const uint64_t b= AV_RN64(pixels+1);\
778        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
779        pixels+=line_size;\
780        block +=line_size;\
781    }\
782}\
783\
784static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
785{\
786    int i;\
787    for(i=0; i<h; i++){\
788        const uint64_t a= AV_RN64(pixels          );\
789        const uint64_t b= AV_RN64(pixels+line_size);\
790        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
791        pixels+=line_size;\
792        block +=line_size;\
793    }\
794}\
795\
796static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
797{\
798    int i;\
799    for(i=0; i<h; i++){\
800        const uint64_t a= AV_RN64(pixels          );\
801        const uint64_t b= AV_RN64(pixels+line_size);\
802        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
803        pixels+=line_size;\
804        block +=line_size;\
805    }\
806}\
807\
808static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
809{\
810        int i;\
811        const uint64_t a= AV_RN64(pixels  );\
812        const uint64_t b= AV_RN64(pixels+1);\
813        uint64_t l0=  (a&0x0303030303030303ULL)\
814                    + (b&0x0303030303030303ULL)\
815                    + 0x0202020202020202ULL;\
816        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
817                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
818        uint64_t l1,h1;\
819\
820        pixels+=line_size;\
821        for(i=0; i<h; i+=2){\
822            uint64_t a= AV_RN64(pixels  );\
823            uint64_t b= AV_RN64(pixels+1);\
824            l1=  (a&0x0303030303030303ULL)\
825               + (b&0x0303030303030303ULL);\
826            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
827              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
828            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
829            pixels+=line_size;\
830            block +=line_size;\
831            a= AV_RN64(pixels  );\
832            b= AV_RN64(pixels+1);\
833            l0=  (a&0x0303030303030303ULL)\
834               + (b&0x0303030303030303ULL)\
835               + 0x0202020202020202ULL;\
836            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
837              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
838            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
839            pixels+=line_size;\
840            block +=line_size;\
841        }\
842}\
843\
844static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
845{\
846        int i;\
847        const uint64_t a= AV_RN64(pixels  );\
848        const uint64_t b= AV_RN64(pixels+1);\
849        uint64_t l0=  (a&0x0303030303030303ULL)\
850                    + (b&0x0303030303030303ULL)\
851                    + 0x0101010101010101ULL;\
852        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
853                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
854        uint64_t l1,h1;\
855\
856        pixels+=line_size;\
857        for(i=0; i<h; i+=2){\
858            uint64_t a= AV_RN64(pixels  );\
859            uint64_t b= AV_RN64(pixels+1);\
860            l1=  (a&0x0303030303030303ULL)\
861               + (b&0x0303030303030303ULL);\
862            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
863              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
864            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
865            pixels+=line_size;\
866            block +=line_size;\
867            a= AV_RN64(pixels  );\
868            b= AV_RN64(pixels+1);\
869            l0=  (a&0x0303030303030303ULL)\
870               + (b&0x0303030303030303ULL)\
871               + 0x0101010101010101ULL;\
872            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
873              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
874            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
875            pixels+=line_size;\
876            block +=line_size;\
877        }\
878}\
879\
880CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
881CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
882CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
883CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
884CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
885CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
886CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
887
888#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
889#else // 64 bit variant
890
/* PIXOP2(OPNAME, OP) — 32-bit-word variant: generates the whole family of
 * pixel-copy and half-pel averaging primitives for widths 2/4/8/16:
 * plain copies, x2 (horizontal half-pel), y2 (vertical half-pel), xy2
 * (diagonal half-pel), plus _l2 two-source and _l4 four-source averages.
 *
 * The averaging works on four packed bytes per 32-bit word (SWAR): each
 * byte is split into its low 2 bits (accumulated in l0/l1) and its high
 * 6 bits pre-shifted right by 2 (h0/h1), so four bytes can be summed and
 * divided by 4 in one word without inter-byte carries.  The rounding
 * variants bias the low part with 0x02 per byte, the no_rnd variants with
 * 0x01, i.e. they truncate instead of round.
 *
 * OP is either a plain store (put) or a byte-wise rounding average with
 * the existing destination (avg); OPNAME is the generated name prefix.
 * The 16-wide entry points are built from the 8-wide ones via
 * CALL_2X_PIXELS. */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1256
/* 32-bit-word variant: byte-wise rounding average with the destination. */
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
/* Plain overwrite of the destination word. */
#define op_put(a, b) a = b
1260
/* Instantiate the whole pixel-op family twice: 'avg_*' averages with the
 * existing destination, 'put_*' overwrites it. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
1265
/* Rounded scalar averages of 2 resp. 4 pixel values, used by the qpel /
 * gmc interpolation code below.  Arguments are fully parenthesized so
 * expression arguments (e.g. containing ?: or shifts) expand correctly. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1268
/* Thin wrapper: 16-wide truncating (no-rounding) average of two sources
 * sharing one stride; put_no_rnd_pixels16_l2 is generated by PIXOP2 above. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1272
/* Thin wrapper: 8-wide truncating (no-rounding) average of two sources
 * sharing one stride; put_no_rnd_pixels8_l2 is generated by PIXOP2 above. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
1276
/* 1/16-pel bilinear interpolation of an 8-wide, h-tall block ("gmc1").
 * (x16, y16) are the fractional offsets in 1/16 units; the four corner
 * weights A..D sum to 256, so the result is normalized by >>8 after
 * adding the caller-supplied rounding bias. */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            dst[col] = (A * src[col]          + B * src[col + 1] +
                        C * src[stride + col] + D * src[stride + col + 1] +
                        rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
1299
/* Global motion compensation with an affine, per-pixel-updated motion field.
 * (ox, oy) is the fixed-point source position of the block's first pixel;
 * (dxx, dyx) advance it per output column and (dxy, dyy) per output row.
 * 'shift' sets the sub-pel precision s = 1<<shift used for the bilinear
 * weights, 'r' is the rounding bias added before the final >>(shift*2),
 * and (width, height) bound the valid source area — coordinates outside
 * it fall back to edge-clamped (via av_clip) 1-D or point sampling.
 * Always processes an 8-wide block, 'h' rows.
 * NOTE(review): frac_x/frac_y are the low 'shift' bits of (vx>>16), which
 * presumably means the vectors carry 16+shift fractional bits — confirm
 * against the callers. */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    /* Turn width/height into the last valid column/row index so the
       unsigned comparisons below also reject negative coordinates. */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* Fully inside the source: 2-D bilinear interpolation. */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* Outside vertically: clamp the row, interpolate horizontally only. */
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* Outside horizontally: clamp the column, interpolate vertically only. */
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* Outside both ways: nearest edge-clamped sample. */
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1357
/* Thirdpel MC at integer position (no interpolation): dispatch a plain
 * block copy by width; put_pixelsN_c are generated by PIXOP2 above. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}
1366
/* Thirdpel MC, horizontal 1/3 position: weights (2,1) on the pixel pair,
 * scaled by 683/2048 (~1/3) with rounding. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int o = row*stride + col;
            dst[o] = (683*(2*src[o] + src[o+1] + 1)) >> 11;
        }
    }
}
1377
/* Thirdpel MC, horizontal 2/3 position: weights (1,2) on the pixel pair,
 * scaled by 683/2048 (~1/3) with rounding. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int o = row*stride + col;
            dst[o] = (683*(src[o] + 2*src[o+1] + 1)) >> 11;
        }
    }
}
1388
/* Thirdpel MC, vertical 1/3 position: weights (2,1) on the vertical pair,
 * scaled by 683/2048 (~1/3) with rounding. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int o = row*stride + col;
            dst[o] = (683*(2*src[o] + src[o+stride] + 1)) >> 11;
        }
    }
}
1399
/* Thirdpel MC, diagonal (1/3, 1/3): 2-D weights (4,3,3,2), scaled by
 * 2731/32768 (~1/12) with rounding. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int o = row*stride + col;
            dst[o] = (2731*(4*src[o] + 3*src[o+1] + 3*src[o+stride] + 2*src[o+stride+1] + 6)) >> 15;
        }
    }
}
1410
/* Thirdpel MC, diagonal (1/3, 2/3): 2-D weights (3,2,4,3), scaled by
 * 2731/32768 (~1/12) with rounding. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int o = row*stride + col;
            dst[o] = (2731*(3*src[o] + 2*src[o+1] + 4*src[o+stride] + 3*src[o+stride+1] + 6)) >> 15;
        }
    }
}
1421
/* Thirdpel MC, vertical 2/3 position: weights (1,2) on the vertical pair,
 * scaled by 683/2048 (~1/3) with rounding. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int o = row*stride + col;
            dst[o] = (683*(src[o] + 2*src[o+stride] + 1)) >> 11;
        }
    }
}
1432
/* Thirdpel MC, diagonal (2/3, 1/3): 2-D weights (3,4,2,3), scaled by
 * 2731/32768 (~1/12) with rounding. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int o = row*stride + col;
            dst[o] = (2731*(3*src[o] + 4*src[o+1] + 2*src[o+stride] + 3*src[o+stride+1] + 6)) >> 15;
        }
    }
}
1443
/* Thirdpel MC, diagonal (2/3, 2/3): 2-D weights (2,3,3,4), scaled by
 * 2731/32768 (~1/12) with rounding. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int o = row*stride + col;
            dst[o] = (2731*(2*src[o] + 3*src[o+1] + 3*src[o+stride] + 4*src[o+stride+1] + 6)) >> 15;
        }
    }
}
1454
/* Thirdpel MC at integer position, averaging variant: dispatch a
 * rounding-average copy by width; avg_pixelsN_c come from PIXOP2 above. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}
1463
/* Averaging thirdpel MC, horizontal 1/3: compute the put_..._mc10 value,
 * then take the rounding average with the existing destination pixel. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int o = row*stride + col;
            dst[o] = (dst[o] + ((683*(2*src[o] + src[o+1] + 1)) >> 11) + 1) >> 1;
        }
    }
}
1474
/* Averaging thirdpel MC, horizontal 2/3: compute the put_..._mc20 value,
 * then take the rounding average with the existing destination pixel. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int o = row*stride + col;
            dst[o] = (dst[o] + ((683*(src[o] + 2*src[o+1] + 1)) >> 11) + 1) >> 1;
        }
    }
}
1485
/* Averaging thirdpel MC, vertical 1/3: compute the put_..._mc01 value,
 * then take the rounding average with the existing destination pixel. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int o = row*stride + col;
            dst[o] = (dst[o] + ((683*(2*src[o] + src[o+stride] + 1)) >> 11) + 1) >> 1;
        }
    }
}
1496
/* Averaging thirdpel MC, diagonal (1/3, 1/3): compute the put_..._mc11
 * value, then take the rounding average with the destination pixel. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int o = row*stride + col;
            dst[o] = (dst[o] + ((2731*(4*src[o] + 3*src[o+1] + 3*src[o+stride] + 2*src[o+stride+1] + 6)) >> 15) + 1) >> 1;
        }
    }
}
1507
/* Averaging thirdpel MC, diagonal (1/3, 2/3): compute the put_..._mc12
 * value, then take the rounding average with the destination pixel. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int o = row*stride + col;
            dst[o] = (dst[o] + ((2731*(3*src[o] + 2*src[o+1] + 4*src[o+stride] + 3*src[o+stride+1] + 6)) >> 15) + 1) >> 1;
        }
    }
}
1518
/* Averaging thirdpel MC, vertical 2/3: compute the put_..._mc02 value,
 * then take the rounding average with the existing destination pixel. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int o = row*stride + col;
            dst[o] = (dst[o] + ((683*(src[o] + 2*src[o+stride] + 1)) >> 11) + 1) >> 1;
        }
    }
}
1529
/* Averaging thirdpel MC, diagonal (2/3, 1/3): compute the put_..._mc21
 * value, then take the rounding average with the destination pixel. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int o = row*stride + col;
            dst[o] = (dst[o] + ((2731*(3*src[o] + 4*src[o+1] + 2*src[o+stride] + 3*src[o+stride+1] + 6)) >> 15) + 1) >> 1;
        }
    }
}
1540
/* Averaging thirdpel MC, diagonal (2/3, 2/3): compute the put_..._mc22
 * value, then take the rounding average with the destination pixel. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int o = row*stride + col;
            dst[o] = (dst[o] + ((2731*(2*src[o] + 3*src[o+1] + 3*src[o+stride] + 4*src[o+stride+1] + 6)) >> 15) + 1) >> 1;
        }
    }
}
/* NOTE(review): dead code — a disabled generator for fixed-width thirdpel
 * wrappers.  As written it would not even compile if enabled (each call is
 * preceded by a stray 'void').  Kept for reference only. */
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1572
/* H264_CHROMA_MC(OPNAME, OP) — generates the 2/4/8-wide H.264 chroma MC
 * functions.  (x, y) are 1/8-pel fractional offsets; the four bilinear
 * corner weights A..D = (8-x)(8-y), x(8-y), (8-x)y, xy sum to 64.
 * When D == 0 (x or y is zero) the 2-D filter degenerates to a 1-D filter
 * along one axis with weight E = B+C and step 'stride' (vertical) or 1
 * (horizontal), halving the multiplies per pixel.  OP applies the final
 * rounding/normalization (put) or additionally averages with the existing
 * destination (avg). */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}
1673
/* Chroma MC output ops: the bilinear weights sum to 64, so the filtered  */
/* value is rounded (+32) and normalized by >>6 before being stored (put) */
/* or averaged with the existing destination pixel (avg).                 */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1681
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    /* 8-wide bilinear chroma interpolation at 1/8-pel offset (x,y) using
     * the "no rounding" bias: +28 (= 32-4) instead of +32 before the >>6
     * normalization (the four corner weights always sum to 64). Same
     * arithmetic as the unrolled version, written with an inner column
     * loop instead. */
    const int wA = (8-x)*(8-y);
    const int wB = (  x)*(8-y);
    const int wC = (8-x)*(  y);
    const int wD = (  x)*(  y);
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(row = 0; row < h; row++){
        for(col = 0; col < 8; col++){
            dst[col] = (wA*src[col       ] + wB*src[col+1]
                      + wC*src[stride+col] + wD*src[stride+col+1]
                      + 32 - 4) >> 6;
        }
        dst += stride;
        src += stride;
    }
}
1705
#define QPEL_MC(r, OPNAME, RND, OP) \
/* MPEG-4 quarter-pel MC kernels: symmetric 8-tap lowpass with weights   */\
/* 20,-6,3,-1 on each side (total gain 32); samples past the block edge  */\
/* are mirrored back inside, as visible in the tap indices below.        */\
/* r is not referenced in the visible body; OP stores or averages the    */\
/* result through the cm[] clip table (see op_put/op_avg further down).  */\
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;/* clip table used by OP() */\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
/* Vertical variant of the same 8-tap filter, run over w=8 columns; the  */\
/* caller supplies 9 input rows (src[0..8*srcStride]).                   */\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
1752\
/* 16-wide version of the horizontal 8-tap lowpass; same taps and the    */\
/* same edge mirroring as the 8-wide version above.                      */\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
/* 16-column vertical 8-tap lowpass; the caller supplies 17 input rows.  */\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
1823\
/* qpel8_mcXY(): 8x8 block MC at quarter-pel position (X/4, Y/4).        */\
/* "full" is a 16-stride, 9-row padded copy of the source (extra row for */\
/* the vertical filter), halfH (8x9) / halfV / halfHV (8x8) are          */\
/* intermediate half-pel planes combined with the pixels8_l2/_l4 average */\
/* helpers.  The exported ff_*_old_c variants keep the older scheme that */\
/* averages four planes at once.                                         */\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);/* fold in the V-direction source in-place */\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
/* Half-pel-horizontal cases (X==2): no padded copy is needed since only */\
/* the horizontal filter touches columns outside the block.              */\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
/* qpel16_mcXY(): 16x16 versions of the quarter-pel MC wrappers above.   */\
/* "full" is a 24-stride, 17-row padded copy; halfH is 16x17, the other  */\
/* half-pel planes are 16x16.                                            */\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
/* 16x16 half-pel-horizontal cases (X==2); no padded copy needed unless  */\
/* the vertical averaging path also reads the source.                    */\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}
2188
/* Quarter-pel output ops: the 8-tap filter gain is 2*(20-6+3-1) = 32, so */
/* results are rounded (+16, or +15 in the no_rnd variants) and >>5, then */
/* clipped to 0..255 via the cm[] crop table before being stored (put) or */
/* averaged into the destination (avg).                                   */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
2202
2203#if 1
#define H264_LOWPASS(OPNAME, OP, OP2) \
/* H.264 luma half-pel interpolation: 6-tap filter (1,-5,20,20,-5,1).    */\
/* The _hv variants filter horizontally into a 16-bit tmp plane first,   */\
/* then vertically; OP2 is the output op for that double-filtered path   */\
/* (OP/OP2 are supplied by the instantiation, not visible here).         */\
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;/* clip table used by OP() */\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        dst++;\
        src++;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=2;\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    /* horizontal pass over h+5 rows (2 above, 3 below for the V filter) */\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    /* rewind to row 2 so tmp[-2*tmpStride] addresses the first filtered row */\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        dst++;\
        tmp++;\
    }\
}\
/* 4-wide versions of the H.264 6-tap (1,-5,20,20,-5,1) lowpass filters. */\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    /* horizontal pass into 16-bit tmp over h+5 rows */\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    /* rewind to row 2 so tmp[-2*tmpStride] addresses the first filtered row */\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
2341\
2342static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2343    const int h=8;\
2344    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2345    int i;\
2346    for(i=0; i<h; i++)\
2347    {\
2348        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2349        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2350        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2351        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2352        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2353        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2354        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2355        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2356        dst+=dstStride;\
2357        src+=srcStride;\
2358    }\
2359}\
2360\
2361static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2362    const int w=8;\
2363    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2364    int i;\
2365    for(i=0; i<w; i++)\
2366    {\
2367        const int srcB= src[-2*srcStride];\
2368        const int srcA= src[-1*srcStride];\
2369        const int src0= src[0 *srcStride];\
2370        const int src1= src[1 *srcStride];\
2371        const int src2= src[2 *srcStride];\
2372        const int src3= src[3 *srcStride];\
2373        const int src4= src[4 *srcStride];\
2374        const int src5= src[5 *srcStride];\
2375        const int src6= src[6 *srcStride];\
2376        const int src7= src[7 *srcStride];\
2377        const int src8= src[8 *srcStride];\
2378        const int src9= src[9 *srcStride];\
2379        const int src10=src[10*srcStride];\
2380        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2381        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2382        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2383        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2384        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2385        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2386        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2387        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2388        dst++;\
2389        src++;\
2390    }\
2391}\
2392\
2393static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2394    const int h=8;\
2395    const int w=8;\
2396    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2397    int i;\
2398    src -= 2*srcStride;\
2399    for(i=0; i<h+5; i++)\
2400    {\
2401        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2402        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2403        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2404        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2405        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2406        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2407        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2408        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2409        tmp+=tmpStride;\
2410        src+=srcStride;\
2411    }\
2412    tmp -= tmpStride*(h+5-2);\
2413    for(i=0; i<w; i++)\
2414    {\
2415        const int tmpB= tmp[-2*tmpStride];\
2416        const int tmpA= tmp[-1*tmpStride];\
2417        const int tmp0= tmp[0 *tmpStride];\
2418        const int tmp1= tmp[1 *tmpStride];\
2419        const int tmp2= tmp[2 *tmpStride];\
2420        const int tmp3= tmp[3 *tmpStride];\
2421        const int tmp4= tmp[4 *tmpStride];\
2422        const int tmp5= tmp[5 *tmpStride];\
2423        const int tmp6= tmp[6 *tmpStride];\
2424        const int tmp7= tmp[7 *tmpStride];\
2425        const int tmp8= tmp[8 *tmpStride];\
2426        const int tmp9= tmp[9 *tmpStride];\
2427        const int tmp10=tmp[10*tmpStride];\
2428        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2429        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2430        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2431        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2432        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2433        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2434        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2435        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2436        dst++;\
2437        tmp++;\
2438    }\
2439}\
2440\
2441static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2442    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2443    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2444    src += 8*srcStride;\
2445    dst += 8*dstStride;\
2446    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2447    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2448}\
2449\
2450static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2451    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2452    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2453    src += 8*srcStride;\
2454    dst += 8*dstStride;\
2455    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2456    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2457}\
2458\
2459static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2460    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2461    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2462    src += 8*srcStride;\
2463    dst += 8*dstStride;\
2464    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2465    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2466}\
2467
/*
 * H264_MC(OPNAME, SIZE) emits the 16 quarter-pel motion-compensation
 * functions OPNAME##h264_qpel##SIZE##_mcXY_c for one block size.
 * X/Y are the quarter-pel phases: 0 = integer sample, 2 = half-pel
 * (6-tap lowpass), 1 and 3 = quarter-pel (average of a half-pel result
 * with the nearest integer- or half-pel sample, via *_pixels*_l2).
 * "full" buffers hold SIZE+5 source rows (2 above, 3 below the block)
 * required by the 6-tap vertical filter; full_mid points at the first
 * row of the block proper inside that buffer.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    /* integer position: plain copy (put_) or average (avg_) */\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    /* (1/4, 0): average of source and horizontal half-pel */\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    /* (1/2, 0): pure horizontal 6-tap lowpass */\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    /* (3/4, 0): average of src+1 and the same horizontal half-pel */\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    /* (0, 1/4): copy SIZE+5 rows of context, then average source rows */\
    /* with the vertical half-pel */\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    /* (0, 1/2): pure vertical 6-tap lowpass */\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    /* (0, 3/4): like mc01 but averaging with the next row down */\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    /* diagonal quarter-pel: average of horizontal and vertical half-pel */\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    /* as mc11, but the vertical half-pel is taken one column right */\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    /* as mc11, but the horizontal half-pel is taken one row down */\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    /* as mc11, shifted one row down and one column right */\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    /* (1/2, 1/2): combined h+v 6-tap; tmp holds 16-bit intermediates */\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    /* average of horizontal half-pel and the 2-D (hv) half-pel */\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    /* as mc21, horizontal half-pel taken one row down */\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    /* average of vertical half-pel and the 2-D (hv) half-pel */\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    /* as mc12, vertical half-pel taken one column right */\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
2604
/* Rounding/clipping operators plugged into H264_LOWPASS:
 * op_*  : one 6-tap pass has a 5-bit gain  -> (x+16)>>5, clipped via cm[];
 * op2_* : the h+v combined pass has 10-bit gain -> (x+512)>>10.
 * The *_avg variants round-average with the existing destination pixel. */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

/* Instantiate the lowpass kernels and the per-size MC entry points. */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif
2626
/* Per-pixel weighted-prediction operators; weight/offset/log2_denom and
 * weights/weightd are picked up from the expansion context below.
 * op_scale1: explicit (uni-directional) weighting of one pixel in place.
 * op_scale2: bi-directional weighting of src into dst. */
#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
/* H264_WEIGHT(W,H) emits the explicit- and bi-weighting functions for one
 * WxH partition.  The unrolled rows use `continue` to stop early for the
 * narrower widths, so one macro body serves W = 2, 4, 8 and 16. */
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    /* pre-scale the offset and add the rounding term 1<<(log2_denom-1) */ \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    /* combined offset+rounding term, forced odd before the shift */ \
    offset = ((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

/* All partition sizes used by H.264 weighted prediction. */
H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2696
2697static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2698    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2699    int i;
2700
2701    for(i=0; i<h; i++){
2702        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2703        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2704        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2705        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2706        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2707        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2708        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2709        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2710        dst+=dstStride;
2711        src+=srcStride;
2712    }
2713}
2714
#if CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

/* CAVS full-pel (mc00) positions need no interpolation: plain copy or
 * rounded average of the 8x8 / 16x16 block. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */
2732
#if CONFIG_VC1_DECODER || CONFIG_WMV3_DECODER
/* VC-1 specific */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

/* Full-pel position: plain 8x8 copy.  The rounding parameter rnd is
 * unused here because no interpolation takes place. */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2741
/* Init entry points for DSP extensions implemented in other files;
 * declared here so dsputil_init() below can wire them up. */
void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);

/* H264 specific */
void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);

#if CONFIG_RV30_DECODER
void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_RV30_DECODER */
2750
#if CONFIG_RV40_DECODER
/* RV40 (3/4, 3/4) sub-pel position is implemented here as the plain
 * 2x2 bilinear average, i.e. the generic *_pixels*_xy2_c routines. */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_c(dst, src, stride, 16);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_c(dst, src, stride, 16);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_c(dst, src, stride, 8);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_c(dst, src, stride, 8);
}

void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_RV40_DECODER */
2767
2768static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2769    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2770    int i;
2771
2772    for(i=0; i<w; i++){
2773        const int src_1= src[ -srcStride];
2774        const int src0 = src[0          ];
2775        const int src1 = src[  srcStride];
2776        const int src2 = src[2*srcStride];
2777        const int src3 = src[3*srcStride];
2778        const int src4 = src[4*srcStride];
2779        const int src5 = src[5*srcStride];
2780        const int src6 = src[6*srcStride];
2781        const int src7 = src[7*srcStride];
2782        const int src8 = src[8*srcStride];
2783        const int src9 = src[9*srcStride];
2784        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2785        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2786        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2787        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2788        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2789        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2790        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2791        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2792        src++;
2793        dst++;
2794    }
2795}
2796
/* WMV2 mspel motion compensation, one function per sub-pel position
 * (mcXY = half-pel offset X/2, Y/2).  halfH buffers are 8x11 (88 bytes)
 * because the vertical pass needs one row above and two below; halfH+8
 * skips that extra top row. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    /* full-pel: plain copy */
    put_pixels8_c(dst, src, stride, 8);
}

static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    /* quarter-pel x: average of source and horizontal half-pel */
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    /* half-pel x only */
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    /* three-quarter-pel x: average of src+1 and horizontal half-pel */
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    /* half-pel y only */
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    /* h-filter 11 rows (starting one above), then v-filter both the
     * source and the h-filtered plane, and average the two results */
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    /* as mc12 but the pure vertical pass starts one column right */
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    /* half-pel in both directions: h pass then v pass */
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2844
/* H.263 Annex J deblocking across a horizontal block edge: for each of
 * the 8 columns, filters the two pixels on either side of the edge
 * (p0 p1 above, p2 p3 below).  Filter strength is looked up from the
 * quantizer. */
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_ANY_H263) {
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        /* d: signed edge-discontinuity measure */
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* tent-shaped response: pass small d through, ramp down to zero
         * beyond 2*strength so real edges are left untouched */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        /* clip to 0..255: p1/p2 can only be in -256..511 here, so bit 8
         * flags an out-of-range value; ~(p>>31) is 0 for negative values
         * and -1 (truncated to 255 on the uint8_t store) for positive */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        /* secondary correction of the outer pixels, limited to |d1|/2 */
        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
    }
}
2881
/* H.263 Annex J deblocking across a vertical block edge: same filter as
 * h263_v_loop_filter_c, applied per row with horizontally adjacent
 * pixels (p0 p1 left of the edge, p2 p3 right of it). */
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_ANY_H263) {
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        /* d: signed edge-discontinuity measure */
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* tent-shaped response; strong discontinuities are preserved */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        /* clip to 0..255 (see h263_v_loop_filter_c for the bit trick) */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        /* secondary correction of the outer pixels, limited to |d1|/2 */
        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
    }
}
2918
/**
 * H.261 in-loop filter: separable (1, 2, 1)/4 smoothing of an 8x8 block.
 * First a vertical pass into temp[] (edge rows are just scaled by 4 so
 * they see no vertical smoothing), then a horizontal pass back into src
 * (edge columns likewise keep their vertical-only result).
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int temp[64];
    int row, col;

    /* vertical (1,2,1) pass; top/bottom rows are pass-through (x4) */
    for (col = 0; col < 8; col++) {
        temp[col]        = 4*src[col];
        temp[col + 7*8]  = 4*src[col + 7*stride];
    }
    for (row = 1; row < 7; row++) {
        for (col = 0; col < 8; col++) {
            const int s = row*stride + col;
            temp[row*8 + col] = src[s - stride] + 2*src[s] + src[s + stride];
        }
    }

    /* horizontal (1,2,1) pass with rounding; total gain is 16 */
    for (row = 0; row < 8; row++) {
        uint8_t   *line = src  + row*stride;
        const int *t    = temp + row*8;
        line[0] = (t[0] + 2) >> 2;
        line[7] = (t[7] + 2) >> 2;
        for (col = 1; col < 7; col++)
            line[col] = (t[col-1] + 2*t[col] + t[col+1] + 8) >> 4;
    }
}
2945
/* H.264 normal-strength luma deblocking of one 16-sample edge, split
 * into four 4-sample segments, each governed by tc0[i] (a negative
 * tc0[i] skips the segment).  xstride steps across the edge
 * (p2 p1 p0 | q0 q1 q2), ystride steps along it. */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            /* filter only where the step looks like a coding artifact */
            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                /* optionally adjust p1/q1 too; each widens the p0/q0
                 * clipping range by one */
                if( FFABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( FFABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                /* main delta, clipped to +-tc, applied symmetrically */
                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* "v" variant: filters across a horizontal edge (neighbours one row
 * apart, walking along the row). */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
/* "h" variant: filters across a vertical edge (neighbours one byte
 * apart, walking down the rows). */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
2994
/* H.264 strong (intra, bS=4) luma deblocking of one 16-sample edge.
 * Where the step across the edge is small enough, up to three pixels on
 * each side are replaced by longer smoothing filters; otherwise only
 * p0/q0 get the weak 3-tap filter. */
static inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 16; d++ ) {
        const int p2 = pix[-3*xstride];
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];

        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];
        const int q2 = pix[ 2*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            /* small edge step: strong filtering is allowed */
            if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
                if( FFABS( p2 - p0 ) < beta)
                {
                    const int p3 = pix[-4*xstride];
                    /* p0', p1', p2' */
                    pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
                    pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
                    pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
                } else {
                    /* p0' */
                    pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                }
                if( FFABS( q2 - q0 ) < beta)
                {
                    const int q3 = pix[3*xstride];
                    /* q0', q1', q2' */
                    pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
                    pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
                    pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
                } else {
                    /* q0' */
                    pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
                }
            }else{
                /* larger step: only the weak filter on p0/q0 */
                /* p0', q0' */
                pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
            }
        }
        pix += ystride;
    }
}
/* "v": across a horizontal edge (row-apart neighbours). */
static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
}
/* "h": across a vertical edge (byte-apart neighbours). */
static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
}
3051
/* H.264 normal-strength chroma deblocking of one 8-sample edge, split
 * into four 2-sample segments each governed by tc0[i] (tc <= 0 skips
 * the segment).  Only p0/q0 are modified. */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                /* delta clipped to +-tc, applied symmetrically */
                int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* Chroma deblocking of a horizontal edge: across-edge step is 'stride',
 * along-edge step is 1. */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
/* Chroma deblocking of a vertical edge: across-edge step is 1,
 * along-edge step is 'stride'. */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
3088
/**
 * H.264 chroma deblocking for intra macroblock edges: an unclipped
 * averaging filter applied to the 8 sample pairs along the edge.
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int n = 8;

    while (n--) {
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];

        /* only filter where the local gradients are below threshold */
        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {
            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}
/* Intra chroma deblocking of a horizontal edge: across-edge step is
 * 'stride', along-edge step is 1. */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
/* Intra chroma deblocking of a vertical edge: across-edge step is 1,
 * along-edge step is 'stride'. */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
3116
/**
 * Sum of absolute differences (SAD) over a 16-pixel-wide block of
 * height h. The void* context parameter is unused.
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3144
/**
 * SAD of a 16-wide block against the horizontal half-pel interpolation
 * of the reference (average of each pixel and its right neighbour).
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3172
/**
 * SAD of a 16-wide block against the vertical half-pel interpolation
 * of the reference (average of each pixel and the one below it).
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
3202
/**
 * SAD of a 16-wide block against the diagonal half-pel interpolation
 * of the reference (average of the surrounding 2x2 pixels).
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg4(pix2[x], pix2[x + 1], below[x], below[x + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
3232
/**
 * Sum of absolute differences (SAD) over an 8-pixel-wide block of
 * height h. The void* context parameter is unused.
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3252
/**
 * SAD of an 8-wide block against the horizontal half-pel interpolation
 * of the reference (average of each pixel and its right neighbour).
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3272
/**
 * SAD of an 8-wide block against the vertical half-pel interpolation
 * of the reference (average of each pixel and the one below it).
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
3294
/**
 * SAD of an 8-wide block against the diagonal half-pel interpolation
 * of the reference (average of the surrounding 2x2 pixels).
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - avg4(pix2[x], pix2[x + 1], below[x], below[x + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
3316
/**
 * "Noise shaping" SSE for a 16-wide block: plain SSE (score1) plus a
 * penalty for the difference in local 2x2 gradient structure between
 * the two blocks (score2), weighted by avctx->nsse_weight, or by 8
 * when no context is available.
 */
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        /* squared-error term */
        for(x=0; x<16; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        /* gradient-structure term: compare 2x2 "cross" differences of
           the two blocks (needs a row below, hence the y+1<h guard) */
        if(y+1<h){
            for(x=0; x<15; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
3342
/**
 * 8-wide variant of nsse16_c: SSE plus a weighted penalty for the
 * difference in local 2x2 gradient structure between the two blocks.
 */
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        /* squared-error term */
        for(x=0; x<8; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        /* gradient-structure term (needs a row below, hence y+1<h) */
        if(y+1<h){
            for(x=0; x<7; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
3368
3369static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3370    int i;
3371    unsigned int sum=0;
3372
3373    for(i=0; i<8*8; i++){
3374        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3375        int w= weight[i];
3376        b>>= RECON_SHIFT;
3377        assert(-512<b && b<512);
3378
3379        sum += (w*b)*(w*b)>>4;
3380    }
3381    return sum>>2;
3382}
3383
3384static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3385    int i;
3386
3387    for(i=0; i<8*8; i++){
3388        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3389    }
3390}
3391
3392/**
3393 * permutes an 8x8 block.
3394 * @param block the block which will be permuted according to the given permutation vector
3395 * @param permutation the permutation vector
3396 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3397 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3398 *                  (inverse) permutated to scantable order!
3399 */
3400void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3401{
3402    int i;
3403    DCTELEM temp[64];
3404
3405    if(last<=0) return;
3406    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3407
3408    for(i=0; i<=last; i++){
3409        const int j= scantable[i];
3410        temp[j]= block[j];
3411        block[j]=0;
3412    }
3413
3414    for(i=0; i<=last; i++){
3415        const int j= scantable[i];
3416        const int perm_j= permutation[j];
3417        block[perm_j]= temp[j];
3418    }
3419}
3420
/* Comparison function that scores every pair of blocks as identical;
 * selected by FF_CMP_ZERO. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    return 0;
}
3424
/**
 * Fill cmp[0..5] with the comparison function selected by 'type'
 * (low byte is an FF_CMP_* value), one entry per block-size variant
 * provided by the DSPContext tables.
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*6);

    for(i=0; i<6; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#if CONFIG_SNOW_ENCODER
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            /* unknown FF_CMP_* value: cmp[i] stays NULL from the memset above */
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
3484
3485static void clear_block_c(DCTELEM *block)
3486{
3487    memset(block, 0, sizeof(DCTELEM)*64);
3488}
3489
3490/**
3491 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3492 */
3493static void clear_blocks_c(DCTELEM *blocks)
3494{
3495    memset(blocks, 0, sizeof(DCTELEM)*6*64);
3496}
3497
3498static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3499    long i;
3500    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3501        long a = *(long*)(src+i);
3502        long b = *(long*)(dst+i);
3503        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3504    }
3505    for(; i<w; i++)
3506        dst[i+0] += src[i+0];
3507}
3508
3509static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3510    long i;
3511    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3512        long a = *(long*)(src1+i);
3513        long b = *(long*)(src2+i);
3514        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3515    }
3516    for(; i<w; i++)
3517        dst[i] = src1[i]+src2[i];
3518}
3519
3520static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3521    long i;
3522#if !HAVE_FAST_UNALIGNED
3523    if((long)src2 & (sizeof(long)-1)){
3524        for(i=0; i+7<w; i+=8){
3525            dst[i+0] = src1[i+0]-src2[i+0];
3526            dst[i+1] = src1[i+1]-src2[i+1];
3527            dst[i+2] = src1[i+2]-src2[i+2];
3528            dst[i+3] = src1[i+3]-src2[i+3];
3529            dst[i+4] = src1[i+4]-src2[i+4];
3530            dst[i+5] = src1[i+5]-src2[i+5];
3531            dst[i+6] = src1[i+6]-src2[i+6];
3532            dst[i+7] = src1[i+7]-src2[i+7];
3533        }
3534    }else
3535#endif
3536    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3537        long a = *(long*)(src1+i);
3538        long b = *(long*)(src2+i);
3539        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3540    }
3541    for(; i<w; i++)
3542        dst[i+0] = src1[i+0]-src2[i+0];
3543}
3544
/**
 * HuffYUV median-prediction decode: reconstruct a row of pixels from
 * stored differences, predicting each pixel with mid_pred of the left
 * pixel, the top pixel (src1) and left+top-topleft.
 * The running left/topleft state is read from and written back through
 * *left / *left_top.
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t left_val    = *left;
    uint8_t topleft_val = *left_top;

    for(i=0; i<w; i++){
        left_val    = mid_pred(left_val, src1[i], (left_val + src1[i] - topleft_val)&0xFF) + diff[i];
        topleft_val = src1[i];
        dst[i]      = left_val;
    }

    *left     = left_val;
    *left_top = topleft_val;
}
3561
/**
 * HuffYUV median-prediction encode: for each pixel of src2, emit the
 * difference from the mid_pred prediction built from the left pixel,
 * the top pixel (src1) and left+top-topleft.
 * The running left/topleft state is read from and written back through
 * *left / *left_top.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t left_val    = *left;
    uint8_t topleft_val = *left_top;

    for(i=0; i<w; i++){
        const int pred = mid_pred(left_val, src1[i], (left_val + src1[i] - topleft_val)&0xFF);
        /* update the predictor state BEFORE overwriting left_val */
        topleft_val = src1[i];
        left_val    = src2[i];
        dst[i]      = left_val - pred;
    }

    *left     = left_val;
    *left_top = topleft_val;
}
3579
/* One radix-2 butterfly: o1/o2 = sum/difference of i1,i2. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place butterfly: x,y become x+y, x-y. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* Final butterfly folded directly into an absolute sum: |x+y| + |x-y|. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3594
/**
 * SATD: 8x8 Hadamard transform of the difference src-dst, returning
 * the sum of absolute values of all 64 transform coefficients.
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: three butterfly stages per row of differences */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass: last butterfly stage folded into the absolute sum */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
3646
/**
 * Intra SATD: 8x8 Hadamard transform of the source block itself, with
 * the DC coefficient's contribution subtracted at the end so flat
 * blocks score low.
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: three butterfly stages per source row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass: last butterfly stage folded into the absolute sum */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean (remove the DC term added in column 0 above)

    return sum;
}
3694
/**
 * DCT-domain SAD: forward-transform the 8x8 difference of the two
 * blocks and sum the absolute values of the coefficients.
 */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* 16-byte aligned backing store for one DCTELEM block */
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}
3706
3707#if CONFIG_GPL
/* One 8-point 1-D pass of the H.264-style integer DCT. Reads through
 * SRC(i) and writes through DST(i,v), so the caller can bind it to rows
 * or columns (or fold DST into an accumulation). */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}
3734
/**
 * SAD in the H.264 integer-DCT domain: transform the 8x8 difference
 * block with DCT8_1D (rows, then columns) and sum the absolute values,
 * the column pass folding the summation into its DST() binding.
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

/* row pass: transform each row in place */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

/* column pass: DST accumulates |coefficient| instead of storing it */
#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
3758#endif
3759
3760static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3761    MpegEncContext * const s= (MpegEncContext *)c;
3762    DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3763    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3764    int sum=0, i;
3765
3766    assert(h==8);
3767
3768    s->dsp.diff_pixels(temp, src1, src2, stride);
3769    s->dsp.fdct(temp);
3770
3771    for(i=0; i<64; i++)
3772        sum= FFMAX(sum, FFABS(temp[i]));
3773
3774    return sum;
3775}
3776
/**
 * Quantization-noise score: quantize and dequantize the transformed
 * 8x8 difference block, inverse-transform it, and return the SSE
 * against the original (untouched) difference coefficients.
 */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* scratch: two DCTELEM blocks back to back (temp + bak) */
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    /* keep an untouched copy for the error measurement below */
    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
3800
/**
 * Rate-distortion score of coding the 8x8 difference block: quantize,
 * count the VLC bits needed, reconstruct, and combine the SSE
 * distortion with a qscale-weighted bit cost.
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    /* NOTE: VLA sized by the caller's stride, used as an 8-row pixel scratch */
    DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const bak= (uint8_t*)aligned_bak;
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* copy the 8x8 reference pixels so we can reconstruct into them */
    for(i=0; i<8; i++){
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
    }

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* pick the VLC tables matching the macroblock mode */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    /* count the run/level VLC bits of all nonzero coefficients */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        /* the last coefficient uses the "last" VLC table */
        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    /* reconstruct: dequantize and add back onto the copied pixels */
    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(bak, stride, temp);

    distortion= s->dsp.sse[1](NULL, bak, src1, stride, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
3879
/**
 * Bit-cost comparison: the number of VLC bits needed to code the
 * quantized 8x8 difference block (rate only, no distortion term).
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* pick the VLC tables matching the macroblock mode */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    /* count the run/level VLC bits of all nonzero coefficients */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        /* the last coefficient uses the "last" VLC table */
        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
3939
/* Intra "vertical SAD": sum of absolute differences between vertically
 * adjacent pixels within a single block, i.e. a measure of vertical
 * activity. Instantiated below for widths 8 and 16. */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                             \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])                           \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);                          \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
3957
/**
 * Inter "vertical SAD" over a 16-wide block: sum of absolute row-to-row
 * changes of the difference signal s1 - s2.
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++) {
            const int d = s1[x] - s2[x] - s1[x + stride] + s2[x + stride];
            total += d >= 0 ? d : -d;
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
3972
/* squared value */
#define SQ(a) ((a)*(a))
/* Intra "vertical SSE": sum of squared differences between vertically
 * adjacent pixels within a single block. Instantiated below for widths
 * 8 and 16. */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                               \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])                                 \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);                                \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
3991
/* Vertical SSE between two planes over a 16-pixel wide block: sums the
 * squared vertical gradient of the residual s1 - s2.  The first argument
 * (MpegEncContext) is unused. */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];
            total += d * d;
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
4006
/* Sum of squared differences between an int8 vector and an int16 vector
 * of `size` elements.  Note the result is a plain int; callers are
 * expected to keep `size` and the element magnitudes small enough that
 * the sum does not overflow. */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int sum = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int d = pix1[k] - pix2[k];
        sum += d * d;
    }

    return sum;
}
4015
/* Instantiate 16x16 comparison functions from their 8x8 kernels.
 * WRAPPER8_16_SQ is defined earlier in this file -- presumably it applies
 * the 8x8 function to the four 8x8 quadrants of a 16x16 block and sums
 * the scores (verify against the macro definition above this chunk). */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
4026
/* In-place element-wise product: dst[k] *= src[k] for k in [0, len). */
static void vector_fmul_c(float *dst, const float *src, int len){
    int k;
    for (k = 0; k < len; k++) {
        dst[k] = dst[k] * src[k];
    }
}
4032
/* Element-wise product with src1 traversed backwards:
 * dst[k] = src0[k] * src1[len-1-k]. */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int k;
    for (k = 0; k < len; k++) {
        dst[k] = src0[k] * src1[len - 1 - k];
    }
}
4039
/* Multiply-accumulate with a strided destination:
 * dst[k*step] = src0[k]*src1[k] + src2[k] + src3.
 * Note src3 is an integer bias added to each float result (historical
 * signature kept for ABI compatibility with the SIMD versions). */
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
    int k;
    float *out = dst;
    for (k = 0; k < len; k++, out += step) {
        *out = src0[k] * src1[k] + src2[k] + src3;
    }
}
4045
/* Overlap-add windowing (MDCT-style): src0 and src1 each hold len
 * samples, win holds 2*len window coefficients, dst receives 2*len
 * output samples.  For each mirrored index pair (i, j):
 *   dst[i] = src0[i]*win[j] - src1[j]*win[i] + add_bias
 *   dst[j] = src0[i]*win[i] + src1[j]*win[j] + add_bias
 * where i runs over the first half and j over the second, mirrored. */
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    int i;
    dst  += len;
    win  += len;
    src0 += len;
    for (i = -len; i < 0; i++) {
        const int j = -i - 1;      /* mirror index: len-1 down to 0 */
        const float a = src0[i];
        const float b = src1[j];
        const float u = win[i];
        const float v = win[j];
        dst[i] = a * v - b * u + add_bias;
        dst[j] = a * u + b * v + add_bias;
    }
}
4060
/* Convert int32 samples to float while scaling: dst[k] = src[k] * mul. */
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int k;
    for (k = 0; k < len; k++) {
        dst[k] = mul * src[k];
    }
}
4066
/* Bit-twiddling float -> int16 conversion.  Reads the float's IEEE-754
 * bit pattern directly (the pointer cast fetches bits, not the numeric
 * value), relying on the caller having pre-scaled/biased the sample so
 * that in-range values land in the low 16 bits of the representation
 * offset by 0x8000.  NOTE(review): this contract is shared with the SIMD
 * implementations -- confirm the expected bias against those before
 * changing anything here.  The cast also violates strict aliasing;
 * kept as-is since the surrounding code depends on this exact idiom. */
static av_always_inline int float_to_int16_one(const float *src){
    int_fast32_t tmp = *(const int32_t*)src;
    /* Any of bits 16..19 set means the sample is out of range: clamp.
     * (0x43c0ffff - tmp) >> 31 is -1 for positive overflow and 0
     * otherwise; after the -0x8000 below (and the caller's truncating
     * store into int16_t) this yields the saturated endpoints. */
    if(tmp & 0xf0000){
        tmp = (0x43c0ffff - tmp)>>31;
        // is this faster on some gcc/cpu combinations?
//      if(tmp > 0x43c0ffff) tmp = 0xFFFF;
//      else                 tmp = 0;
    }
    /* remove the 0x8000 bias to recenter around zero */
    return tmp - 0x8000;
}
4077
/* Convert a buffer of pre-scaled floats to int16 samples using the
 * bit-pattern trick in float_to_int16_one(). */
void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
    int k;
    for (k = 0; k < len; k++) {
        dst[k] = float_to_int16_one(&src[k]);
    }
}
4083
/* Convert per-channel float buffers to interleaved int16 output.
 * The stereo case is special-cased as it is by far the most common. */
void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
    int i;
    if (channels == 2) {
        const float *left  = src[0];
        const float *right = src[1];
        for (i = 0; i < len; i++) {
            dst[2*i]     = float_to_int16_one(left  + i);
            dst[2*i + 1] = float_to_int16_one(right + i);
        }
    } else {
        int ch;
        for (ch = 0; ch < channels; ch++) {
            int out = ch;
            for (i = 0; i < len; i++, out += channels) {
                dst[out] = float_to_int16_one(src[ch] + i);
            }
        }
    }
}
4097
/* Element-wise int16 accumulate: v1[k] += v2[k] for k in [0, order). */
static void add_int16_c(int16_t * v1, int16_t * v2, int order)
{
    int k;
    for (k = 0; k < order; k++) {
        v1[k] += v2[k];
    }
}
4103
/* Element-wise int16 subtract: v1[k] -= v2[k] for k in [0, order). */
static void sub_int16_c(int16_t * v1, int16_t * v2, int order)
{
    int k;
    for (k = 0; k < order; k++) {
        v1[k] -= v2[k];
    }
}
4109
/* Dot product of two int16 vectors with each product individually
 * right-shifted by `shift` before accumulation (NOT a shift of the final
 * sum -- the per-term rounding behaviour is part of the contract). */
static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
{
    int k;
    int acc = 0;

    for (k = 0; k < order; k++) {
        acc += (v1[k] * v2[k]) >> shift;
    }

    return acc;
}
4119
/* Fixed-point constants for the WMV2 8x8 IDCT below:
 * Wk = round(2048 * sqrt(2) * cos(k*pi/16)); W0 is the plain 2048 scale. */
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
4128
/* One row of the WMV2 8x8 IDCT, in place, with 8 bits of rounding
 * removed at the end. */
static void wmv2_idct_row(short * b)
{
    /* same values as the W0..W7 macros above: round(2048*sqrt(2)*cos(k*pi/16)) */
    enum { C0 = 2048, C1 = 2841, C2 = 2676, C3 = 2408,
           C5 = 1609, C6 = 1108, C7 = 565 };
    int odd1, odd3, odd5, odd7;
    int even0, even2, even4, even6;
    int mid1, mid2;

    /* step 1: butterflies on odd and even input coefficients */
    odd1  = C1*b[1] + C7*b[7];
    odd7  = C7*b[1] - C1*b[7];
    odd5  = C5*b[5] + C3*b[3];
    odd3  = C3*b[5] - C5*b[3];
    even2 = C2*b[2] + C6*b[6];
    even6 = C6*b[2] - C2*b[6];
    even0 = C0*b[0] + C0*b[4];
    even4 = C0*b[0] - C0*b[4];
    /* step 2: 181/256 ~= 1/sqrt(2) rotation of the middle terms */
    mid1 = (181*(odd1 - odd5 + odd7 - odd3) + 128) >> 8;
    mid2 = (181*(odd1 - odd5 - odd7 + odd3) + 128) >> 8;
    /* step 3: output butterflies with rounding */
    b[0] = (even0 + even2 + odd1 + odd5 + 128) >> 8;
    b[1] = (even4 + even6 + mid1        + 128) >> 8;
    b[2] = (even4 - even6 + mid2        + 128) >> 8;
    b[3] = (even0 - even2 + odd7 + odd3 + 128) >> 8;
    b[4] = (even0 - even2 - odd7 - odd3 + 128) >> 8;
    b[5] = (even4 - even6 - mid2        + 128) >> 8;
    b[6] = (even4 + even6 - mid1        + 128) >> 8;
    b[7] = (even0 + even2 - odd1 - odd5 + 128) >> 8;
}
/* One column (stride 8) of the WMV2 8x8 IDCT, in place.  Inputs carry
 * 8 extra fractional bits from the row pass; step 1 drops 3 of them and
 * the final stage removes the remaining 14 with rounding. */
static void wmv2_idct_col(short * b)
{
    /* same values as the W0..W7 macros above: round(2048*sqrt(2)*cos(k*pi/16)) */
    enum { C0 = 2048, C1 = 2841, C2 = 2676, C3 = 2408,
           C5 = 1609, C6 = 1108, C7 = 565 };
    int o1, o3, o5, o7;
    int e0, e2, e4, e6;
    int m1, m2;

    /* step 1: butterflies, rounded down by 3 bits for headroom */
    o1 = (C1*b[8*1] + C7*b[8*7] + 4) >> 3;
    o7 = (C7*b[8*1] - C1*b[8*7] + 4) >> 3;
    o5 = (C5*b[8*5] + C3*b[8*3] + 4) >> 3;
    o3 = (C3*b[8*5] - C5*b[8*3] + 4) >> 3;
    e2 = (C2*b[8*2] + C6*b[8*6] + 4) >> 3;
    e6 = (C6*b[8*2] - C2*b[8*6] + 4) >> 3;
    e0 = (C0*b[8*0] + C0*b[8*4]    ) >> 3;
    e4 = (C0*b[8*0] - C0*b[8*4]    ) >> 3;
    /* step 2: 181/256 ~= 1/sqrt(2) rotation */
    m1 = (181*(o1 - o5 + o7 - o3) + 128) >> 8;
    m2 = (181*(o1 - o5 - o7 + o3) + 128) >> 8;
    /* step 3: output butterflies, final 14-bit rounding */
    b[8*0] = (e0 + e2 + o1 + o5 + (1<<13)) >> 14;
    b[8*1] = (e4 + e6 + m1      + (1<<13)) >> 14;
    b[8*2] = (e4 - e6 + m2      + (1<<13)) >> 14;
    b[8*3] = (e0 - e2 + o7 + o3 + (1<<13)) >> 14;
    b[8*4] = (e0 - e2 - o7 - o3 + (1<<13)) >> 14;
    b[8*5] = (e4 - e6 - m2      + (1<<13)) >> 14;
    b[8*6] = (e4 + e6 - m1      + (1<<13)) >> 14;
    b[8*7] = (e0 + e2 - o1 - o5 + (1<<13)) >> 14;
}
/* Full 8x8 WMV2 inverse DCT, in place: a row pass over all 8 rows
 * followed by a column pass over all 8 columns. */
void ff_wmv2_idct_c(short * block){
    int n;

    for (n = 0; n < 8; n++)
        wmv2_idct_row(block + 8*n);
    for (n = 0; n < 8; n++)
        wmv2_idct_col(block + n);
}
4192/* XXX: those functions should be suppressed ASAP when all IDCTs are
4193 converted */
/* WMV2 IDCT + store: transform the block, then write the clamped pixels
 * to dest. */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    put_pixels_clamped_c(block, dest, line_size);
}
/* WMV2 IDCT + add: transform the block, then add the result onto dest
 * with clamping. */
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    add_pixels_clamped_c(block, dest, line_size);
}
/* jrevdct (libjpeg-style integer) IDCT + clamped store. */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/* jrevdct IDCT + clamped add onto existing pixels. */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
4214
/* 4x4 reduced-resolution jrevdct IDCT + clamped store (lowres=1). */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
/* 4x4 reduced-resolution jrevdct IDCT + clamped add (lowres=1). */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
4225
/* 2x2 reduced-resolution jrevdct IDCT + clamped store (lowres=2). */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
/* 2x2 reduced-resolution jrevdct IDCT + clamped add (lowres=2). */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
4236
/* 1x1 "IDCT" + store (lowres=3): only the DC coefficient survives;
 * descale it by 8 with rounding and clamp to [0,255] via the crop table. */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4)>>3];
}
/* 1x1 "IDCT" + add (lowres=3): add the descaled DC onto the existing
 * pixel, clamped to [0,255]. */
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}
4249
/* No-op fallback for c->prefetch on architectures without a prefetch hint. */
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4251
4252/* init static data */
4253void dsputil_static_init(void)
4254{
4255    int i;
4256
4257    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4258    for(i=0;i<MAX_NEG_CROP;i++) {
4259        ff_cropTbl[i] = 0;
4260        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4261    }
4262
4263    for(i=0;i<512;i++) {
4264        ff_squareTbl[i] = (i - 256) * (i - 256);
4265    }
4266
4267    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4268}
4269
/* Verify that the compiler honours 16-byte alignment of stack variables
 * (required by the SSE/AltiVec code paths).  Returns 0 when aligned,
 * -1 otherwise; the warning is logged only once per process. */
int ff_check_alignment(void){
    static int did_fail=0;
    DECLARE_ALIGNED_16(int, aligned);

    if((long)&aligned & 15){
        if(!did_fail){
#if HAVE_MMX || HAVE_ALTIVEC
            av_log(NULL, AV_LOG_ERROR,
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
                "Do not report crashes to FFmpeg developers.\n");
#endif
            did_fail=1;
        }
        return -1;
    }
    return 0;
}
4289
/* Populate a DSPContext with the portable C reference implementations,
 * selecting DCT/IDCT variants according to avctx (dct_algo, idct_algo,
 * lowres), then hand off to the per-architecture init functions which
 * overwrite individual entries with optimized versions.  Finally the
 * IDCT coefficient permutation table is derived from whichever IDCT
 * ended up selected. */
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

    ff_check_alignment();

    /* forward DCT selection (encoders only) */
#if CONFIG_ENCODERS
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS

    /* inverse DCT selection: lowres decoding uses reduced-size IDCTs
     * (4x4, 2x2 or DC-only) regardless of idct_algo */
    if(avctx->lowres==1){
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
            c->idct_put= ff_jref_idct4_put;
            c->idct_add= ff_jref_idct4_add;
        }else{
            c->idct_put= ff_h264_lowres_idct_put_c;
            c->idct_add= ff_h264_lowres_idct_add_c;
        }
        c->idct    = j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct    = j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct    = j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else{
        /* full-resolution IDCT, chosen by idct_algo */
        if(avctx->idct_algo==FF_IDCT_INT){
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct    = j_rev_dct;
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER || CONFIG_THEORA_DECODER ) &&
                avctx->idct_algo==FF_IDCT_VP3){
            c->idct_put= ff_vp3_idct_put_c;
            c->idct_add= ff_vp3_idct_add_c;
            c->idct    = ff_vp3_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
            c->idct_put= ff_wmv2_idct_put_c;
            c->idct_add= ff_wmv2_idct_add_c;
            c->idct    = ff_wmv2_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
            c->idct_put= ff_faanidct_put;
            c->idct_add= ff_faanidct_add;
            c->idct    = ff_faanidct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
            /* NOTE: the EA IDCT only provides a put variant */
            c->idct_put= ff_ea_idct_put_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put= ff_simple_idct_put;
            c->idct_add= ff_simple_idct_add;
            c->idct    = ff_simple_idct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }
    }

    /* H.264 residual transforms */
    if (CONFIG_H264_DECODER) {
        c->h264_idct_add= ff_h264_idct_add_c;
        c->h264_idct8_add= ff_h264_idct8_add_c;
        c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
        c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
        c->h264_idct_add16     = ff_h264_idct_add16_c;
        c->h264_idct8_add4     = ff_h264_idct8_add4_c;
        c->h264_idct_add8      = ff_h264_idct_add8_c;
        c->h264_idct_add16intra= ff_h264_idct_add16intra_c;
    }

    /* basic pixel block helpers */
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->add_pixels8 = add_pixels8_c;
    c->add_pixels4 = add_pixels4_c;
    c->sum_abs_dctelem = sum_abs_dctelem_c;
    c->gmc1 = gmc1_c;
    c->gmc = ff_gmc_c;
    c->clear_block = clear_block_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    /* TODO [0] 16  [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

    /* half-pel motion compensation tables: [0]=16x16 [1]=8x8 [2]=4x4 [3]=2x2,
     * second index selects none/x/y/xy half-pel interpolation */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
    dspfunc(avg, 2, 4);
    dspfunc(avg, 3, 2);
#undef dspfunc

    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;

    /* third-pel motion compensation (SVQ3); indices 3, 7 and >10 unused */
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

    /* quarter-pel motion compensation: 16 entries = 4x4 sub-pel positions */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(put_h264_qpel, 3, 2);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
    /* H.264 chroma MC and weighted prediction */
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
    c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;

    c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
    c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
    c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
    c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
    c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
    c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
    c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
    c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
    c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
    c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
    c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
    c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
    c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
    c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
    c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
    c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
    c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
    c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
    c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
    c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;

    c->draw_edges = draw_edges_c;

    /* codec-specific DSP hookups (compiled in only when the codec is) */
#if CONFIG_CAVS_DECODER
    ff_cavsdsp_init(c,avctx);
#endif
#if CONFIG_VC1_DECODER || CONFIG_WMV3_DECODER
    ff_vc1dsp_init(c,avctx);
#endif
#if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER || CONFIG_WMV3_DECODER
    ff_intrax8dsp_init(c,avctx);
#endif
#if CONFIG_RV30_DECODER
    ff_rv30dsp_init(c,avctx);
#endif
#if CONFIG_RV40_DECODER
    ff_rv40dsp_init(c,avctx);
    c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
    c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
    c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
    c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
#endif

    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

    /* comparison functions for motion estimation / rate-distortion:
     * index [0] is the 16x16 variant, [1] the 8x8 variant */
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    c->hadamard8_diff[5]= hadamard8_intra8x8_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
#if CONFIG_GPL
    SET_CMP_FUNC(dct264_sad)
#endif
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    c->sse[2]= sse4_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsad[5]= vsad_intra8_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->vsse[5]= vsse_intra8_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;
#if CONFIG_SNOW_ENCODER
    c->w53[0]= w53_16_c;
    c->w53[1]= w53_8_c;
    c->w97[0]= w97_16_c;
    c->w97[1]= w97_8_c;
#endif

    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;

    /* lossless / HuffYUV-style helpers */
    c->add_bytes= add_bytes_c;
    c->add_bytes_l2= add_bytes_l2_c;
    c->diff_bytes= diff_bytes_c;
    c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->bswap_buf= bswap_buf;
#if CONFIG_PNG_DECODER
    c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
#endif

    /* in-loop deblocking filters */
    c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
    c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
    c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c;
    c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c;
    c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
    c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
    c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
    c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
    c->h264_loop_filter_strength= NULL;

    if (CONFIG_ANY_H263) {
        c->h263_h_loop_filter= h263_h_loop_filter_c;
        c->h263_v_loop_filter= h263_v_loop_filter_c;
    }

    if (CONFIG_VP3_DECODER || CONFIG_THEORA_DECODER) {
        c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
        c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
    }
    if (CONFIG_VP6_DECODER) {
        c->vp6_filter_diag4= ff_vp6_filter_diag4_c;
    }

    c->h261_loop_filter= h261_loop_filter_c;

    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

#if CONFIG_SNOW_DECODER
    c->vertical_compose97i = ff_snow_vertical_compose97i;
    c->horizontal_compose97i = ff_snow_horizontal_compose97i;
    c->inner_add_yblock = ff_snow_inner_add_yblock;
#endif

    /* audio DSP */
#if CONFIG_VORBIS_DECODER
    c->vorbis_inverse_coupling = vorbis_inverse_coupling;
#endif
#if CONFIG_AC3_DECODER
    c->ac3_downmix = ff_ac3_downmix_c;
#endif
#if CONFIG_FLAC_ENCODER
    c->flac_compute_autocorr = ff_flac_compute_autocorr;
#endif
    c->vector_fmul = vector_fmul_c;
    c->vector_fmul_reverse = vector_fmul_reverse_c;
    c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
    c->vector_fmul_window = ff_vector_fmul_window_c;
    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
    c->float_to_int16 = ff_float_to_int16_c;
    c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
    c->add_int16 = add_int16_c;
    c->sub_int16 = sub_int16_c;
    c->scalarproduct_int16 = scalarproduct_int16_c;

    /* image shrinkers: [n] shrinks by 2^n in each dimension */
    c->shrink[0]= ff_img_copy_plane;
    c->shrink[1]= ff_shrink22;
    c->shrink[2]= ff_shrink44;
    c->shrink[3]= ff_shrink88;

    c->prefetch= just_return;

    /* clear the 2tap tables so we can detect below which entries the
     * arch-specific inits filled in */
    memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
    memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));

    /* let each enabled architecture override entries with optimized code */
    if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
    if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
    if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
    if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
    if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
    if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
    if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
    if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
    if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);

    /* fall back to the H.264 qpel functions for any 2tap entry no
     * arch-specific init provided */
    for(i=0; i<64; i++){
        if(!c->put_2tap_qpel_pixels_tab[0][i])
            c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
        if(!c->avg_2tap_qpel_pixels_tab[0][i])
            c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
    }

    /* build the coefficient permutation table matching the selected IDCT */
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    case FF_PARTTRANS_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
        break;
    case FF_SSE2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
    }
}
4700
4701